In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.metrics import classification_report


# Objective 2. Predicting if a resolution will pass or not as a resolution

In [2]:
# Creating training set
votes_df_2 = os.path.join("undataset2", 'UNVotes-1.csv')
votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')

votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 2 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 8 )]
votes_df_2 = votes_df_2.loc[(votes_df_2["vote"] != 9 )]

# Randomly selecting a country to only have one instance of a single resolution
votes_df_2 = votes_df_2.loc[votes_df_2['Countryname'] == 'United States of America']

df = votes_df_2

df.loc[df['yes'] > df['no'], 'pass'] = 1
df.loc[df['yes'] < df['no'] ,'pass'] = 0

# Removing an possible rows that contain NA values
df = df[df['pass'].notna()]

print(df['pass'], df['descr'])
print(df['pass'].value_counts())

  votes_df_2 = pd.read_csv(votes_df_2, encoding='latin-1')


0          1.0
197        0.0
394        1.0
591        0.0
788        1.0
          ... 
1198256    1.0
1198257    1.0
1198258    1.0
1198259    1.0
1198260    1.0
Name: pass, Length: 4747, dtype: float64 0          TO ADOPT A CUBAN AMENDMENT TO THE UK PROPOSAL ...
197        TO ADOPT A USSR PROPOSAL ADJOURNING DEBATE ON ...
394        TO ADOPT THE KOREAN PROPOSAL THAT INVALID BALL...
591        TO ADOPT A CUBAN PROPOSAL (A/3-C) THAT AN ITEM...
788        TO ADOPT A 6TH COMMITTEE AMENDMENT (A/14) TO T...
                                 ...                        
1198256    A/73/251 128n - Cooperation between the United...
1198257    A/73/251 38 - The situation in the Middle East...
1198258    A/73/251 109 - Crime prevention and criminal j...
1198259    A/73/251 28a - Implementation of the outcome o...
1198260    A/73/251 101aa - Transparency and confidence-b...
Name: descr, Length: 4747, dtype: object
1.0    4432
0.0     315
Name: pass, dtype: int64


In [10]:
from sklearn.naive_bayes import GaussianNB


X_train, X_test, y_train, y_test = train_test_split(df['descr'], df['pass'], test_size=0.10, random_state=27)

#Downsampling or upsampling is not applied for this objective to achieve better results

# Vectorizing the text data
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train.values.astype('U'))
X_test_vectors = vectorizer.transform(X_test.values.astype('U'))


# Training the classifier
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)
y_pred = clf.predict(X_test_vectors)

# Evaluating the performance of the classifier
y_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# print(y_pred.tolist().count(1), y_test.tolist().count(1))
print("\t\t\t Report:\n",classification_report(y_pred,y_test))
print(np.where(y_pred_lr == 0)[0])

Accuracy: 0.9368421052631579
			 Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.94      0.97       475

    accuracy                           0.94       475
   macro avg       0.50      0.47      0.48       475
weighted avg       1.00      0.94      0.97       475

[  2   9  16  22  34  36  54  68  83  87  92 105 108 125 140 142 143 146
 147 152 160 161 163 184 196 197 204 210 216 225 229 232 237 238 240 242
 254 268 280 288 289 293 294 297 310 311 337 339 348 354 359 401 403 404
 407 415 418 419 421 446 461 474]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(class_weight='balanced')
reg.fit(X_train_vectors, y_train)
y_pred_lr = reg.predict(X_test_vectors)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", accuracy_lr)
print(y_pred_lr.tolist().count(1), y_test.tolist().count(1))
print("\t\t\t Logistic Regression report:\n",classification_report(y_pred_lr,y_test))
print(np.where(y_pred_lr == 0)[0])

Accuracy: 0.9242105263157895
413 445
			 Logistic Regression report:
               precision    recall  f1-score   support

         0.0       0.93      0.45      0.61        62
         1.0       0.92      1.00      0.96       413

    accuracy                           0.92       475
   macro avg       0.93      0.72      0.78       475
weighted avg       0.92      0.92      0.91       475

[  2   9  16  22  34  36  54  68  83  87  92 105 108 125 140 142 143 146
 147 152 160 161 163 184 196 197 204 210 216 225 229 232 237 238 240 242
 254 268 280 288 289 293 294 297 310 311 337 339 348 354 359 401 403 404
 407 415 418 419 421 446 461 474]


# Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train_vectors, y_train)
y_pred_dt = dtc.predict(X_test_vectors)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)
print("\t\t\t Decision Tree report:\n",classification_report(y_pred_dt,y_test))


Accuracy: 0.9663157894736842
			 Decision Tree report:
               precision    recall  f1-score   support

         0.0       0.77      0.72      0.74        32
         1.0       0.98      0.98      0.98       443

    accuracy                           0.97       475
   macro avg       0.87      0.85      0.86       475
weighted avg       0.97      0.97      0.97       475



# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_vectors, y_train)
y_pred_rf = rfc.predict(X_test_vectors)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)
print(y_pred_rf.tolist().count(1), y_test.tolist().count(1))
print("\t\t\t Random Forest report:\n",classification_report(y_pred_rf,y_test))
print(np.where(y_pred_rf == 0)[0])
print((y_pred_rf[34]))
print((X_test.values[34]))

Accuracy: 0.96
462 445
			 Random Forest report:
               precision    recall  f1-score   support

         0.0       0.40      0.92      0.56        13
         1.0       1.00      0.96      0.98       462

    accuracy                           0.96       475
   macro avg       0.70      0.94      0.77       475
weighted avg       0.98      0.96      0.97       475

[ 34  54 140 197 225 238 242 280 310 337 339 348 403]
0.0
TO ADOPT A USSR AMENDMENT TO JOINT 2ND/3RD COMM. DRAFT RESOL. (A/246), ADDING THE WORDS \AND ALSO THE RIGHTS TO PRESENT WRITTEN AND VERBAL STATEMENTS TO THE ECONOMIC AND SOCIAL COUNCIL ON ALL MATTERS OF CONCERN FOR THE FEDERATION.\\ THE DRAFT R"


# SVM

In [7]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_vectors, y_train)
y_pred_svm = svc.predict(X_test_vectors)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy_svm)
print(y_pred_svm.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t SVM report:\n",classification_report(y_pred_svm,y_test))
print(np.where(y_pred_svm == 0)[0])

Accuracy: 0.9621052631578947
0 0
			 SVM report:
               precision    recall  f1-score   support

         0.0       0.47      0.88      0.61        16
         1.0       1.00      0.97      0.98       459

    accuracy                           0.96       475
   macro avg       0.73      0.92      0.79       475
weighted avg       0.98      0.96      0.97       475

[ 34  54 140 142 152 163 184 197 225 229 238 242 280 310 348 403]


# KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vectors, y_train)
y_pred_knn = knn.predict(X_test_vectors)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy:", accuracy_knn)
print(y_pred_knn.tolist().count(3), y_test.tolist().count(3))
print("\t\t\t KNN report:\n",classification_report(y_pred_knn,y_test))

Accuracy: 0.9642105263157895
0 0
			 KNN report:
               precision    recall  f1-score   support

         0.0       0.60      0.78      0.68        23
         1.0       0.99      0.97      0.98       452

    accuracy                           0.96       475
   macro avg       0.79      0.88      0.83       475
weighted avg       0.97      0.96      0.97       475



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [9]:
list1 = [y_pred,y_pred_lr,y_pred_dt,y_pred_rf,y_pred_svm, y_pred_knn]
d =['NaiveBayes','LogisticRegression','DecisionTree','RandomForest','SVM', 'KNN']
accuracies={} 
k=0
list2 = []
for i in list1:
    list2.append(accuracy_score(i,y_test)*100)
for i in d:
    accuracies[i] = list2[k]
    k+=1

print("All Accuracies: ", accuracies)
print("The most accurate model is: ", max(accuracies, key=accuracies.get))

All Accuracies:  {'NaiveBayes': 93.6842105263158, 'LogisticRegression': 92.42105263157895, 'DecisionTree': 96.63157894736842, 'RandomForest': 96.0, 'SVM': 96.21052631578947, 'KNN': 96.42105263157895}
The most accurate model is:  DecisionTree
