In [4]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split


# Objective 3. Using different time periods to predict resolutions

In [5]:
#Loading dataset
votes_df_2 = os.path.join("undataset2", 'UNVotes-1.csv')
votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')

#Selecting the country that the experiment will be conducted on, in this case, the USA.
votes_df_2 = votes_df_2.loc[votes_df_2['Countryname'] == 'United States of America']
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 2 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 8 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 9 )]


  votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')


In [6]:
### import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.metrics import classification_report


# Loading the dataset

df = votes_df_2
# Selecting the 4 year time period for a particular term
train = df.loc[(df['year'] >= 2009) & (df['year'] <= 2013)]
# Selecting the year to be the test test. Used as the same test for all the different time periods
test = df.loc[(df['year'] == 2018)]
print(len(train))
print(len(test))

X_train = train['descr']
X_test = test['descr']
y_train = train['vote']
y_test = test['vote']


# Vectorizing the text data

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train.values.astype('U').ravel())
X_test_vectors = vectorizer.transform(X_test.values.astype('U').ravel())

# Training the  Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)

# Evaluate the performance of the classifier
y_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\t\t\t Naive Bayes report:\n",classification_report(y_pred,y_test))

368
85
Accuracy: 0.8235294117647058
			 Naive Bayes report:
               precision    recall  f1-score   support

           1       0.32      0.75      0.44         8
           3       0.97      0.83      0.90        77

    accuracy                           0.82        85
   macro avg       0.64      0.79      0.67        85
weighted avg       0.91      0.82      0.85        85



# Logistic Regression

In [7]:

from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(class_weight='balanced')
reg.fit(X_train_vectors, y_train)
y_pred_lr = reg.predict(X_test_vectors)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", accuracy_lr)
print(y_pred_lr.tolist().count(1), y_test.tolist().count(1))
print("\t\t\t Logistic Regression report:\n",classification_report(y_pred_lr,y_test))

Accuracy: 0.788235294117647
17 19
			 Logistic Regression report:
               precision    recall  f1-score   support

           1       0.47      0.53      0.50        17
           3       0.88      0.85      0.87        68

    accuracy                           0.79        85
   macro avg       0.68      0.69      0.68        85
weighted avg       0.80      0.79      0.79        85



# Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train_vectors, y_train)
y_pred_dt = dtc.predict(X_test_vectors)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)
print(y_pred_dt.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t Decision Tree report:\n",classification_report(y_pred_dt,y_test))

Accuracy: 0.7764705882352941
61 66
			 Decision Tree report:
               precision    recall  f1-score   support

           1       0.63      0.50      0.56        24
           3       0.82      0.89      0.85        61

    accuracy                           0.78        85
   macro avg       0.72      0.69      0.70        85
weighted avg       0.77      0.78      0.77        85



# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_vectors, y_train)
y_pred_rf = rfc.predict(X_test_vectors)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)
print(y_pred_rf.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t Random Forest report:\n",classification_report(y_pred_rf,y_test))

Accuracy: 0.6235294117647059
40 66
			 Random Forest report:
               precision    recall  f1-score   support

           1       0.84      0.36      0.50        45
           3       0.56      0.93      0.70        40

    accuracy                           0.62        85
   macro avg       0.70      0.64      0.60        85
weighted avg       0.71      0.62      0.59        85



# SVM

In [10]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_vectors, y_train)
y_pred_svm = svc.predict(X_test_vectors)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy_svm)
print(y_pred_svm.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t SVM report:\n",classification_report(y_pred_svm,y_test))

Accuracy: 0.8117647058823529
74 66
			 SVM report:
               precision    recall  f1-score   support

           1       0.37      0.64      0.47        11
           3       0.94      0.84      0.89        74

    accuracy                           0.81        85
   macro avg       0.65      0.74      0.68        85
weighted avg       0.87      0.81      0.83        85



# K Neighbours

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vectors, y_train)
y_pred_knn = knn.predict(X_test_vectors)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy:", accuracy_knn)
print(y_pred_knn.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t KNN report:\n",classification_report(y_pred_knn,y_test))

Accuracy: 0.8352941176470589
72 66
			 KNN report:
               precision    recall  f1-score   support

           1       0.47      0.69      0.56        13
           3       0.94      0.86      0.90        72

    accuracy                           0.84        85
   macro avg       0.71      0.78      0.73        85
weighted avg       0.87      0.84      0.85        85



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [12]:
list1 = [y_pred,y_pred_lr,y_pred_dt,y_pred_rf,y_pred_svm, y_pred_knn]
d =['NaiveBayes','LogisticRegression','DecisionTree','RandomForest','SVM', 'KNN']
accuracies={} 
k=0
list2 = []
for i in list1:
    list2.append(accuracy_score(i,y_test)*100)
for i in d:
    accuracies[i] = list2[k]
    k+=1

print("All Accuracies: ", accuracies)
print("The most accurate model is: ", max(accuracies, key=accuracies.get))

All Accuracies:  {'NaiveBayes': 82.35294117647058, 'LogisticRegression': 78.82352941176471, 'DecisionTree': 77.64705882352942, 'RandomForest': 62.35294117647059, 'SVM': 81.17647058823529, 'KNN': 83.52941176470588}
The most accurate model is:  KNN
