In [49]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split


# Describing what each vote value represents
#### 1 : Yes (Y)
#### 2 : Abstain (A)
#### 3 : No (N)
#### 8 : Non-participating
#### 9 : Not eligible to participate



# Objective 1. Predicting how a country will vote for a resolution

In [61]:

# Loading dataset
votes_df_2 = os.path.join("undataset2", 'UNVotes-1.csv')
votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')

# Ignoring all rows that contain a 2,8 or 9 vote
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 2 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 8 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 9 )]

# Selecting the country to predict how it will vote for resolutions
votes_df_2 = votes_df_2.loc[votes_df_2['Countryname'] == 'United States of America']





  votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')


In [62]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.metrics import classification_report

df = votes_df_2
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['descr'], df['vote'], test_size=0.10, random_state=27)

X = pd.concat([X_train, y_train], axis=1)
yes = X[X.vote==1]
no = X[X.vote==3]

# Applying Downsampling on the training set

df_yes = yes
df_no = no

if (len(df_yes) > len(df_no)):
    print("Yes has more votes than no")
    df_sampled = resample(df_yes, 
                                     replace=False,     # sample with replacement
                                     n_samples=len(df_no),    # to match minority class
                                     random_state=27) # reproducible results
    sampled = pd.concat([df_sampled, df_no])
    
if(len(df_no) > len(df_yes)):
    print("No has more votes than yes")
    df_sampled = resample(df_no, 
                                     replace=False,     # sample with replacement
                                     n_samples=len(df_yes),    # to match minority class
                                     random_state=27) # reproducible results
    sampled = pd.concat([df_yes, df_sampled])



pd.options.display.max_colwidth = 500


X_train = sampled.descr
y_train = sampled.vote


# Vectorizing the text data

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train.values.astype('U').ravel())
X_test_vectors = vectorizer.transform(X_test.values.astype('U').ravel())

# Training the Naivye Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)

# Evaluating the performance of the classifier
y_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\t\t\t Naive Bayes report:\n",classification_report(y_pred,y_test))

No has more votes than yes
Accuracy: 0.7763975155279503
			 Naive Bayes report:
               precision    recall  f1-score   support

           1       0.80      0.65      0.72       214
           3       0.76      0.87      0.81       269

    accuracy                           0.78       483
   macro avg       0.78      0.76      0.77       483
weighted avg       0.78      0.78      0.77       483



# Logistic Regression

In [63]:

from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train_vectors, y_train)
y_pred_lr = reg.predict(X_test_vectors)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", accuracy_lr)
print(y_pred_lr.tolist().count(1), y_test.tolist().count(1))
print("\t\t\t Logistic Regression report:\n",classification_report(y_pred_lr,y_test))

Accuracy: 0.7929606625258799
216 174
			 Logistic Regression report:
               precision    recall  f1-score   support

           1       0.83      0.67      0.74       216
           3       0.77      0.89      0.83       267

    accuracy                           0.79       483
   macro avg       0.80      0.78      0.78       483
weighted avg       0.80      0.79      0.79       483



# Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train_vectors, y_train)
y_pred_dt = dtc.predict(X_test_vectors)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)
print(y_pred_dt.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t Decision Tree report:\n",classification_report(y_pred_dt,y_test))

Accuracy: 0.7536231884057971
272 309
			 Decision Tree report:
               precision    recall  f1-score   support

           1       0.76      0.63      0.69       211
           3       0.75      0.85      0.80       272

    accuracy                           0.75       483
   macro avg       0.76      0.74      0.74       483
weighted avg       0.75      0.75      0.75       483



# Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_vectors, y_train)
y_pred_rf = rfc.predict(X_test_vectors)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)
print(y_pred_rf.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t Random Forest report:\n",classification_report(y_pred_rf,y_test))

Accuracy: 0.8074534161490683
260 309
			 Random Forest report:
               precision    recall  f1-score   support

           1       0.87      0.68      0.77       223
           3       0.77      0.92      0.84       260

    accuracy                           0.81       483
   macro avg       0.82      0.80      0.80       483
weighted avg       0.82      0.81      0.80       483



# SVM

In [66]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_vectors, y_train)
y_pred_svm = svc.predict(X_test_vectors)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy_svm)
print(y_pred_svm.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t SVM report:\n",classification_report(y_pred_svm,y_test))

Accuracy: 0.8240165631469979
264 309
			 SVM report:
               precision    recall  f1-score   support

           1       0.89      0.70      0.78       219
           3       0.79      0.92      0.85       264

    accuracy                           0.82       483
   macro avg       0.84      0.81      0.82       483
weighted avg       0.83      0.82      0.82       483



# K Neighbours

In [67]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vectors, y_train)
y_pred_knn = knn.predict(X_test_vectors)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy:", accuracy_knn)
print(y_pred_knn.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t KNN report:\n",classification_report(y_pred_knn,y_test))

Accuracy: 0.7971014492753623
279 309
			 KNN report:
               precision    recall  f1-score   support

           1       0.80      0.69      0.74       204
           3       0.79      0.88      0.83       279

    accuracy                           0.80       483
   macro avg       0.80      0.78      0.79       483
weighted avg       0.80      0.80      0.79       483



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [68]:
list1 = [y_pred,y_pred_lr,y_pred_dt,y_pred_rf,y_pred_svm, y_pred_knn]
d =['NaiveBayes','LogisticRegression','DecisionTree','RandomForest','SVM', 'KNN']
accuracies={} 
k=0
list2 = []
for i in list1:
    list2.append(accuracy_score(i,y_test)*100)
for i in d:
    accuracies[i] = list2[k]
    k+=1

print("All Accuracies: ", accuracies)
print("The most accurate model is: ", max(accuracies, key=accuracies.get))

All Accuracies:  {'NaiveBayes': 77.63975155279503, 'LogisticRegression': 79.29606625258799, 'DecisionTree': 75.36231884057972, 'RandomForest': 80.74534161490683, 'SVM': 82.40165631469979, 'KNN': 79.71014492753623}
The most accurate model is:  SVM
