In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the UN General Debates corpus, discarding any row with a missing value
data = pd.read_csv('un-general-debates.csv').dropna()

# Distinct years present in the corpus
dates = data['year'].unique()

# Distinct country codes present in the corpus
countries = data['country'].unique()

# Number of speeches recorded for each country
country_counts = data['country'].value_counts()
print(country_counts)

# Histogram of speech counts: x = speeches given, y = countries with that count.
# One bin is allocated per country.
country_counts.hist(bins=len(country_counts))
plt.title('Number of Speeches by Country')
plt.ylabel('Number of Countries')
plt.xlabel('Number of Speeches')
plt.show()


Montenegro spoke the least in the UN. That makes sense, as they only joined in 2006.

The dataset covers only 46 years (1970-2015). Most countries have spoken 45 or 46 times, which indicates that most UN member countries have been members since 1970 and took part in every general debate since then.

In [None]:
def plot_word_count_by_year(country_data, country_label):
    """Plot the mean speech word count per year for one country's speeches.

    Parameters
    ----------
    country_data : pandas.DataFrame
        Rows belonging to a single country; must contain the 'year' and
        'wordCount' columns.
    country_label : str
        Text appended to the figure title to identify the country.
    """
    country_data.groupby('year')['wordCount'].mean().plot()
    plt.xlabel('Year')
    plt.ylabel('Average Word Count')
    plt.title(f'Average Word Count of Speeches by {country_label}')
    plt.show()


# Word count of every speech (whitespace-delimited tokens)
data['wordCount'] = data['text'].str.split().str.len()

# What is the average word count of speeches by each country?
average_word_count = data.groupby('country')['wordCount'].mean().round(0)

# What is the average word count of speeches across all countries by year?
average_word_count_all = data.groupby('year')['wordCount'].mean().round(0)
# NOTE: the printed figure is the mean of the rounded per-year means (every
# year weighted equally), not the mean over all individual speeches.
print(f"Average word count is: {average_word_count_all.mean()}")

# Graph a histogram of the average word count of speeches by each country
average_word_count.hist()
plt.xlabel('Average Word Count')
plt.ylabel('Number of Countries')
plt.title('Average Word Count of Speeches by Country')
plt.show()

# Graph the average word count of speeches across all countries by year
average_word_count_all.plot()
plt.xlabel('Year')
plt.ylabel('Average Word Count')
plt.title('Average Word Count of Speeches by Year')
plt.show()

# Graph the average word count of speeches by the US by year
us_data = data[data['country'] == 'USA']
plot_word_count_by_year(us_data, 'USA')

# Print the (rounded) average word count of speeches by the United States
print(f"US average word count: {average_word_count['USA']}")

# Save the bottom 5 countries with the lowest average word count
bottom_5_countries = average_word_count.nsmallest(5)
print("Bottom 5 countries with the lowest average word count:")
print(bottom_5_countries)

# Save the bottom 1 country with the lowest average word count
bottom_1_country = average_word_count.nsmallest(1)

# Save the top 5 countries with the highest average word count
top_5_countries = average_word_count.nlargest(5)
print("Top 5 countries with the highest average word count:")
print(top_5_countries)

# Save the top 1 country with the highest average word count
top_1_country = average_word_count.nlargest(1)

# Graph the average word count of bottom_1_country by year
bottom_1_country_data = data[data['country'] == bottom_1_country.index[0]]
plot_word_count_by_year(bottom_1_country_data, bottom_1_country.index[0])

# Graph the average word count of top_1_country by year
top_1_country_data = data[data['country'] == top_1_country.index[0]]
plot_word_count_by_year(top_1_country_data, top_1_country.index[0])

Most countries hover around 3000 words per speech. The UN asks speakers to observe a voluntary 15-minute time limit, which is around 3000 words, so that makes sense.

What happened in 1986 to tank Russia's word count so much?

They got a new ambassador to the UN!

Yakov Malik
(1968-1976)

Oleg Troyanovsky (1976-1986)

Yuri Dubinin
(1986-1986)

Alexander Belonogov
(1986-1990)

But also, there was a change in the USSR leadership: Mikhail Gorbachev became the leader in 1985 and remained in power until 1991.

What happened in the US between 1995-2005?

Not sure, lots of ambassadors. BUT, there was a change in ambassadors right around when the US started talking a lot again:

Zalmay Khalilzad
(2007-2009)

Susan Rice
(2009-2013)

But the president was addressing the UN for the USA in 2009-2010 (and longer?), so it wouldn't be because of Susan Rice.

It was a presidency change, from Bush to Obama (Obama's first speech in the UN was in 2009). It seems that Obama talks more than Bush.

<!-- Is the average word count even necessary?? Do countries speak more than once each year anyway? -->



In [3]:
# Russia's speeches in chronological order, for inspecting wordCount year by year
russia_data = data.loc[data['country'] == 'RUS'].sort_values('year')
# print(russia_data[['year', 'wordCount']])



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

data = pd.read_csv('un-general-debates.csv')

# --- Word frequency over the whole corpus ---------------------------------
# Stop words are removed so the counts are not dominated by function words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])

# Column sums of the document-term matrix = total occurrences of each word
word_freq = np.asarray(X.sum(axis=0)).flatten()
word_freq_df = pd.DataFrame({'word': vectorizer.get_feature_names_out(), 'frequency': word_freq})
word_freq_df_sorted = word_freq_df.sort_values(by='frequency', ascending=False)

# display the top 5 most common words
top_5_words = word_freq_df_sorted.head(5)
print("\nTop 5 most common words and their frequencies:")
print(top_5_words)

# --- Country classification from raw word counts --------------------------
# Split the raw texts BEFORE vectorizing so the classifier's vocabulary is
# learned from the training speeches only (no information from the test set).
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], data['country'], test_size=0.2, random_state=42
)
count_vectorizer_clf = CountVectorizer(stop_words='english')
X_train = count_vectorizer_clf.fit_transform(text_train)
X_test = count_vectorizer_clf.transform(text_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# zero_division=0 scores undefined per-class metrics as 0 (the value the
# default already uses) without emitting a warning for each such class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f'\nModel Performance:')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# TF-IDF Vectorization
# Split the raw texts first so the vocabulary and IDF weights are learned from
# the training speeches only; fitting on the full corpus would leak test-set
# statistics into the features.
text_train_tfidf, text_test_tfidf, y_train, y_test = train_test_split(
    data['text'], data['country'], test_size=0.2, random_state=42
)

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(text_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(text_test_tfidf)

nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

# zero_division=0 scores undefined per-class metrics as 0 (the value the
# default already uses) without emitting a warning for each such class.
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='weighted', zero_division=0)
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='weighted', zero_division=0)
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted', zero_division=0)

print(f'TF-IDF Accuracy: {accuracy_tfidf:.2f}')
print(f'TF-IDF Precision: {precision_tfidf:.2f}')
print(f'TF-IDF Recall: {recall_tfidf:.2f}')
print(f'TF-IDF F1-Score: {f1_tfidf:.2f}')


In [None]:
import numpy as np

# average word length
def avg_word_length(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0 when ``text`` contains no tokens (empty or whitespace-only).
    """
    tokens = text.split()
    if not tokens:
        return 0
    token_lengths = [len(token) for token in tokens]
    return np.mean(token_lengths)

from sklearn.naive_bayes import GaussianNB

# Per-speech average word length, stored as a feature column
data['avg_word_length'] = data['text'].apply(avg_word_length)

# scikit-learn expects a 2-D feature matrix: one row per speech, one column
X_word_length = data[['avg_word_length']].to_numpy()

X_train_word_length, X_test_word_length, y_train, y_test = train_test_split(
    X_word_length,
    data['country'],
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes on the single continuous feature
nb_classifier_word_length = GaussianNB()
nb_classifier_word_length.fit(X_train_word_length, y_train)
y_pred_word_length = nb_classifier_word_length.predict(X_test_word_length)

accuracy_word_length = accuracy_score(y_test, y_pred_word_length)
precision_word_length, recall_word_length, f1_word_length = [
    score_fn(y_test, y_pred_word_length, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
]

for metric_name, metric_value in (
    ('Accuracy', accuracy_word_length),
    ('Precision', precision_word_length),
    ('Recall', recall_word_length),
    ('F1-Score', f1_word_length),
):
    print(f'Word Length {metric_name}: {metric_value:.2f}')


In [None]:
#lexical diversity
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens are whitespace-separated and compared exactly (case and
    punctuation are significant). Returns 0 when ``text`` has no tokens.
    """
    tokens = text.split()
    total = len(tokens)
    if total == 0:
        return 0
    distinct = len(set(tokens))
    return distinct / total

# Per-speech lexical diversity, stored as a feature column
data['lexical_diversity'] = data['text'].apply(lexical_diversity)

# scikit-learn expects a 2-D feature matrix: one row per speech, one column
X_lexical_diversity = data[['lexical_diversity']].to_numpy()

X_train_lexical_diversity, X_test_lexical_diversity, y_train, y_test = train_test_split(
    X_lexical_diversity,
    data['country'],
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes on the single continuous feature
nb_classifier_lexical_diversity = GaussianNB()
nb_classifier_lexical_diversity.fit(X_train_lexical_diversity, y_train)
y_pred_lexical_diversity = nb_classifier_lexical_diversity.predict(X_test_lexical_diversity)

accuracy_lexical_diversity = accuracy_score(y_test, y_pred_lexical_diversity)
precision_lexical_diversity, recall_lexical_diversity, f1_lexical_diversity = [
    score_fn(y_test, y_pred_lexical_diversity, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
]

for metric_name, metric_value in (
    ('Accuracy', accuracy_lexical_diversity),
    ('Precision', precision_lexical_diversity),
    ('Recall', recall_lexical_diversity),
    ('F1-Score', f1_lexical_diversity),
):
    print(f'Lexical Diversity {metric_name}: {metric_value:.2f}')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart comparing the four feature sets on each metric.
methods = ['Word Frequency', 'TF-IDF', 'Average Word Length', 'Lexical Diversity']

accuracies = [accuracy, accuracy_tfidf, accuracy_word_length, accuracy_lexical_diversity]
precisions = [precision, precision_tfidf, precision_word_length, precision_lexical_diversity]
recalls = [recall, recall_tfidf, recall_word_length, recall_lexical_diversity]
f1_scores = [f1, f1_tfidf, f1_word_length, f1_lexical_diversity]

# (legend label, scores per method, bar colour) for each metric, in draw order
metric_series = [
    ('Accuracy', accuracies, 'blue'),
    ('Precision', precisions, 'green'),
    ('Recall', recalls, 'red'),
    ('F1-Score', f1_scores, 'purple'),
]

bar_width = 0.2
group_positions = np.arange(len(methods))  # x of the first bar in each group

plt.figure(figsize=(10, 6))
for offset, (metric_label, scores, color) in enumerate(metric_series):
    plt.bar(
        group_positions + offset * bar_width,
        scores,
        color=color,
        width=bar_width,
        edgecolor='grey',
        label=metric_label,
    )

# Centre each method label under its group of bars: the bars sit at offsets
# 0 .. (n - 1) * bar_width, so the middle of the group is half of the last offset.
tick_positions = group_positions + bar_width * (len(metric_series) - 1) / 2

plt.xlabel('Lexical Analysis Methods', fontweight='bold')
plt.xticks(tick_positions, methods)
plt.ylabel('Scores', fontweight='bold')
plt.title('Performance Comparison of Lexical Analysis Methods')

plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# List every distinct country code in the dataset
country_codes = data['country'].unique()
print(country_codes)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Region membership by country code. Every code is listed in exactly ONE
# region, so the lookup never depends on the order of the regions.
# Codes listed under both 'Asia' and 'Middle East' in the earlier version of
# this table are kept under 'Middle East' only; TUR is kept under 'Asia'.
_REGION_MEMBERS = {
    'Africa': ['NER', 'ZWE', 'SDN', 'MAR', 'EGY', 'RWA', 'MOZ', 'GAB', 'TCD', 'CMR', 'AGO', 'CIV', 'TGO',
               'LBR', 'MLI', 'GHA', 'GIN', 'GNB', 'BFA', 'SEN', 'STP', 'MRT', 'SWZ', 'ETH', 'MDG',
               'TUN', 'COD', 'GNQ', 'DZA', 'LSO', 'GMB', 'NGA', 'ZAF', 'MWI', 'DJI', 'BEN', 'SOM', 'ZMB', 'CPV',
               'COM', 'UGA', 'SLE', 'ERI', 'CAF', 'SSD',
               'BWA', 'BDI', 'COG', 'KEN', 'LBY', 'MUS', 'NAM', 'SYC', 'TZA'],
    'Asia': ['MDV', 'PHL', 'RUS', 'CHN', 'MYS', 'NPL', 'BGD', 'JPN', 'KHM', 'IND', 'IDN',
             'PAK', 'TUR', 'AFG', 'LKA', 'SGP', 'LAO', 'MMR', 'THA', 'TJK',
             'UZB', 'AZE', 'KAZ', 'TKM', 'KGZ', 'ARM', 'PRK', 'KOR', 'BTN', 'VNM',
             'BRN', 'MNG', 'TLS'],
    # DDR is a historic code (like YUG / CSK) that may appear in the corpus.
    'Europe': ['FIN', 'ESP', 'PRT', 'BEL', 'ALB', 'GRC', 'LUX', 'ITA', 'CYP', 'NOR', 'ISL', 'UKR', 'FRA',
               'GBR', 'HUN', 'AUT', 'POL', 'BGR', 'ROU', 'NLD', 'DEU', 'DNK', 'YUG', 'CSK', 'BIH', 'HRV', 'SVK',
               'LTU', 'AND', 'LVA', 'CZE', 'SVN', 'CHE', 'VAT', 'MCO', 'MKD', 'LIE', 'GEO', 'EST', 'SMR', 'EU',
               'BLR', 'MLT', 'IRL', 'MDA', 'MNE', 'SRB', 'SWE', 'DDR'],
    'Americas': ['URY', 'ARG', 'SLV', 'COL', 'CAN', 'USA', 'MEX', 'BRA', 'PER', 'ECU', 'PAN', 'CUB', 'VEN', 'BOL',
                 'HND', 'CRI', 'DOM', 'TTO', 'GTM', 'BLZ', 'GRD', 'JAM', 'HTI', 'BRB', 'PRY', 'BHS', 'SUR',
                 'VCT', 'ATG', 'KNA',
                 'LCA', 'CHL', 'DMA', 'GUY', 'NIC'],
    'Oceania': ['VUT', 'PNG', 'SLB', 'FJI', 'AUS', 'NZL', 'TON', 'NRU', 'TUV', 'FSM', 'KIR', 'WSM', 'MHL', 'PLW'],
    # YDYE (former Democratic Yemen) is a historic code that may appear in the corpus.
    'Middle East': ['SYR', 'ISR', 'IRN', 'IRQ', 'LBN', 'OMN', 'ARE', 'YEM', 'SAU', 'QAT', 'KWT', 'PSE', 'JOR',
                    'BHR', 'YDYE'],
}

# Reverse lookup (country code -> region), built once instead of on every call.
_COUNTRY_TO_REGION = {
    _code: _region
    for _region, _codes in _REGION_MEMBERS.items()
    for _code in _codes
}


def map_country_to_region(country_code):
    """Return the region name for a country code.

    Parameters
    ----------
    country_code : str
        Country code as it appears in the dataset's 'country' column.

    Returns
    -------
    str
        One of 'Africa', 'Asia', 'Europe', 'Americas', 'Oceania',
        'Middle East', or 'Unknown' when the code is not in the table.
    """
    return _COUNTRY_TO_REGION.get(country_code, 'Unknown')

# Label every speech with the region of the country that delivered it
data['region'] = data['country'].apply(map_country_to_region)

X = data['text']
y_region = data['region']

# Hold out 20% of the speeches before any vectorization takes place
X_train_region, X_test_region, y_train_region, y_test_region = train_test_split(
    X,
    y_region,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training texts, then applied to the test texts
tfidf_vectorizer_region = TfidfVectorizer(stop_words='english')
X_train_tfidf_region = tfidf_vectorizer_region.fit_transform(X_train_region)
X_test_tfidf_region = tfidf_vectorizer_region.transform(X_test_region)

nb_classifier_region = MultinomialNB()
nb_classifier_region.fit(X_train_tfidf_region, y_train_region)
y_pred_region = nb_classifier_region.predict(X_test_tfidf_region)

accuracy_region = accuracy_score(y_test_region, y_pred_region)
precision_region, recall_region, f1_region = [
    score_fn(y_test_region, y_pred_region, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
]

for metric_name, metric_value in (
    ('Accuracy', accuracy_region),
    ('Precision', precision_region),
    ('Recall', recall_region),
    ('F1-Score', f1_region),
):
    print(f'Region Classification {metric_name}: {metric_value:.2f}')

# ------------------------- part 2 ----------------------------------------

# For each region, train a separate classifier that predicts the country
# from the speech text, using only that region's speeches.
regions = data['region'].unique()
results = []

for region in regions:
    region_data = data[data['region'] == region]
    X_region, y_country = region_data['text'], region_data['country']

    X_train_country, X_test_country, y_train_country, y_test_country = train_test_split(
        X_region,
        y_country,
        test_size=0.2,
        random_state=42,
    )

    # TF-IDF is fitted on this region's training texts only
    tfidf_vectorizer_country = TfidfVectorizer(stop_words='english')
    X_train_tfidf_country = tfidf_vectorizer_country.fit_transform(X_train_country)
    X_test_tfidf_country = tfidf_vectorizer_country.transform(X_test_country)

    nb_classifier_country = MultinomialNB()
    nb_classifier_country.fit(X_train_tfidf_country, y_train_country)
    y_pred_country = nb_classifier_country.predict(X_test_tfidf_country)

    accuracy_country = accuracy_score(y_test_country, y_pred_country)
    precision_country, recall_country, f1_country = [
        score_fn(y_test_country, y_pred_country, average='weighted')
        for score_fn in (precision_score, recall_score, f1_score)
    ]

    # One summary row per region
    results.append(dict(
        region=region,
        accuracy=accuracy_country,
        precision=precision_country,
        recall=recall_country,
        f1=f1_country,
    ))

    print(f'\nResults for Region: {region}')
    for metric_name, metric_value in (
        ('Accuracy', accuracy_country),
        ('Precision', precision_country),
        ('Recall', recall_country),
        ('F1-Score', f1_country),
    ):
        print(f'Country Prediction {metric_name}: {metric_value:.2f}')

results_df = pd.DataFrame(results)
print("\nOverall Results by Region:")
print(results_df)


In [None]:
# Average the metrics per region, drop the 'Unknown' bucket, and index the
# table by region so each region becomes one group of bars.
region_results = (
    results_df
    .groupby('region')
    .mean()
    .reset_index()
    .loc[lambda frame: frame['region'] != 'Unknown']
    .set_index('region')
)

ax = region_results.plot(kind='bar', figsize=(12, 6))
ax.set_title('Overall Results by Region Using Regional Classification')
ax.set_xlabel('Region')
ax.set_ylabel('Score')
plt.xticks(rotation=45)
ax.legend(title='Metrics')
plt.tight_layout()
plt.show()