In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('un-general-debates.csv')

# Word-frequency (bag-of-words) baseline: predict the speaker's country from
# raw term counts with Multinomial Naive Bayes.
#
# The raw speeches are split BEFORE vectorizing. Fitting the vectorizer on the
# full corpus would let held-out speeches contribute to the vocabulary (and so
# to the feature count that enters Naive Bayes smoothing), i.e. information
# from the test set would leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], data['country'], test_size=0.2, random_state=42
)

# Learn the vocabulary from the training speeches only; English stop words are
# removed for cleaner frequency counts.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
# Encode the held-out speeches with the already-fitted vocabulary.
X_test = vectorizer.transform(text_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)

# 'weighted' averages the per-country scores, weighting each by its number of
# true test samples.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# TF-IDF baseline: same classifier and split settings as the word-frequency
# cell, but with TF-IDF weighted features.
#
# The raw speeches are split BEFORE vectorizing. IDF weights are corpus-level
# document-frequency statistics, so fitting on the full corpus would compute
# them partly from the held-out speeches and leak test information into the
# training features.
text_train_tfidf, text_test_tfidf, y_train, y_test = train_test_split(
    data['text'], data['country'], test_size=0.2, random_state=42
)

# Vocabulary and IDF weights come from the training speeches only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(text_train_tfidf)
# Held-out speeches are encoded with the training vocabulary and IDF weights.
X_test_tfidf = tfidf_vectorizer.transform(text_test_tfidf)

nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

# 'weighted' averages the per-country scores, weighting each by its number of
# true test samples.
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='weighted')
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='weighted')
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')

print(f'TF-IDF Accuracy: {accuracy_tfidf:.2f}')
print(f'TF-IDF Precision: {precision_tfidf:.2f}')
print(f'TF-IDF Recall: {recall_tfidf:.2f}')
print(f'TF-IDF F1-Score: {f1_tfidf:.2f}')


In [None]:
import numpy as np

# average word length
def avg_word_length(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Tokens come from ``text.split()``, so any punctuation attached to a word
    counts toward that word's length. Returns 0 when ``text`` has no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return np.mean(list(map(len, tokens)))

from sklearn.naive_bayes import GaussianNB

# Single-feature experiment: can the mean token length of a speech alone
# predict the speaker's country?
data['avg_word_length'] = data['text'].apply(avg_word_length)

# The one-column feature is reshaped to an (n_samples, 1) matrix.
X_word_length = data['avg_word_length'].values.reshape(-1, 1)

X_train_word_length, X_test_word_length, y_train, y_test = train_test_split(
    X_word_length,
    data['country'],
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes: the feature is continuous rather than a count.
nb_classifier_word_length = GaussianNB().fit(X_train_word_length, y_train)
y_pred_word_length = nb_classifier_word_length.predict(X_test_word_length)

accuracy_word_length = accuracy_score(y_test, y_pred_word_length)
# Precision, recall and F1 are each averaged across countries, weighted by
# the number of true test samples per country.
precision_word_length, recall_word_length, f1_word_length = (
    scorer(y_test, y_pred_word_length, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Word Length Accuracy: {accuracy_word_length:.2f}')
print(f'Word Length Precision: {precision_word_length:.2f}')
print(f'Word Length Recall: {recall_word_length:.2f}')
print(f'Word Length F1-Score: {f1_word_length:.2f}')


In [None]:
#lexical diversity
def lexical_diversity(text):
    """Return the type-token ratio of ``text``: distinct tokens / total tokens.

    Tokens come from ``text.split()`` and are compared exactly as written, so
    differently-cased or punctuated forms count as distinct. Returns 0 when
    ``text`` has no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Single-feature experiment: can the type-token ratio of a speech alone
# predict the speaker's country?
data['lexical_diversity'] = data['text'].apply(lexical_diversity)

# The one-column feature is reshaped to an (n_samples, 1) matrix.
X_lexical_diversity = data['lexical_diversity'].values.reshape(-1, 1)

X_train_lexical_diversity, X_test_lexical_diversity, y_train, y_test = train_test_split(
    X_lexical_diversity,
    data['country'],
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes: the feature is continuous rather than a count.
nb_classifier_lexical_diversity = GaussianNB().fit(X_train_lexical_diversity, y_train)
y_pred_lexical_diversity = nb_classifier_lexical_diversity.predict(X_test_lexical_diversity)

accuracy_lexical_diversity = accuracy_score(y_test, y_pred_lexical_diversity)
# Precision, recall and F1 are each averaged across countries, weighted by
# the number of true test samples per country.
precision_lexical_diversity, recall_lexical_diversity, f1_lexical_diversity = (
    scorer(y_test, y_pred_lexical_diversity, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Lexical Diversity Accuracy: {accuracy_lexical_diversity:.2f}')
print(f'Lexical Diversity Precision: {precision_lexical_diversity:.2f}')
print(f'Lexical Diversity Recall: {recall_lexical_diversity:.2f}')
print(f'Lexical Diversity F1-Score: {f1_lexical_diversity:.2f}')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart comparing the four feature sets on all four metrics.
methods = ['Word Frequency', 'TF-IDF', 'Average Word Length', 'Lexical Diversity']

accuracies = [accuracy, accuracy_tfidf, accuracy_word_length, accuracy_lexical_diversity]
precisions = [precision, precision_tfidf, precision_word_length, precision_lexical_diversity]
recalls = [recall, recall_tfidf, recall_word_length, recall_lexical_diversity]
f1_scores = [f1, f1_tfidf, f1_word_length, f1_lexical_diversity]

# One (legend label, values, colour) entry per metric; each method gets one
# bar per entry, drawn side by side.
metric_series = [
    ('Accuracy', accuracies, 'blue'),
    ('Precision', precisions, 'green'),
    ('Recall', recalls, 'red'),
    ('F1-Score', f1_scores, 'purple'),
]

bar_width = 0.2
r1 = np.arange(len(methods))  # x position of the first bar in each group

plt.figure(figsize=(10, 6))
for series_idx, (label, scores, color) in enumerate(metric_series):
    # Shift each metric one bar width to the right of the previous one.
    plt.bar(r1 + series_idx * bar_width, scores, color=color, width=bar_width,
            edgecolor='grey', label=label)

plt.xlabel('Lexical Analysis Methods', fontweight='bold')
# Bars are centre-aligned at r1 + k * bar_width for k = 0..n-1, so the middle
# of each group is r1 + (n - 1) / 2 * bar_width. Placing the tick there keeps
# the method label centred under its group of bars.
group_centers = r1 + bar_width * (len(metric_series) - 1) / 2
plt.xticks(group_centers, methods)
plt.ylabel('Scores', fontweight='bold')
plt.title('Performance Comparison of Lexical Analysis Methods')

plt.legend()

plt.tight_layout()
plt.show()
