In [1]:
import cv2 as cv
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn import svm
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Pretty-print a confusion matrix as an aligned text table.

    Parameters
    ----------
    cm : 2-D array
        Matrix indexable as ``cm[i, j]``; only the leading
        ``len(labels) x len(labels)`` cells are read.
    labels : sequence
        Row/column titles, in the same order as the rows/columns of ``cm``.
    hide_zeroes : bool
        Blank out cells whose value equals zero.
    hide_diagonal : bool
        Blank out the cells on the main diagonal.
    hide_threshold : number or None
        Blank out cells whose value is not strictly greater than this
        threshold. ``None`` disables the filter; ``0`` is a valid threshold.

    Returns
    -------
    None
        The table is written to standard output.
    """
    size = len(labels)
    titles = [str(label) for label in labels]
    values = [[str(cm[i, j]) for j in range(size)] for i in range(size)]

    # Every column must fit the longest title and the widest cell value,
    # otherwise large counts push the rest of their row out of alignment.
    widths = [len(title) for title in titles]
    widths.extend(len(text) for row in values for text in row)
    columnwidth = max(widths, default=0)
    empty_cell = " " * columnwidth

    # Header: the first column is as wide as the row titles printed below,
    # so the column titles line up with the cells underneath them.
    print(empty_cell, end=" ")
    for title in titles:
        print(title.rjust(columnwidth), end=" ")
    print()

    # One line per true label
    for i, title in enumerate(titles):
        print(title.rjust(columnwidth), end=" ")
        for j in range(size):
            cell = values[i][j].rjust(columnwidth)
            if hide_zeroes and float(cm[i, j]) == 0:
                cell = empty_cell
            if hide_diagonal and i == j:
                cell = empty_cell
            # Compare against None so that a threshold of 0 is honoured
            if hide_threshold is not None and not cm[i, j] > hide_threshold:
                cell = empty_cell
            print(cell, end=" ")
        print()

In [3]:
# Read the prepared training data and separate the 'Vote' target column
# from the remaining feature columns.
train_set = pd.read_csv('prepared_train.csv')
y_train = train_set['Vote']
X_train = train_set.drop(columns=['Vote'])

In [4]:
# Compare KNN classifiers for k = 1..9 by their mean accuracy over
# 10-fold cross-validation on the training set.
knn_models = []
for i in range(1,10):
    model = KNeighborsClassifier(n_neighbors = i)
    knn_models.append(model)
    accuracy = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10)
    print("Accuracy of KNN with k =", i,  " with cross-validation:", accuracy.mean())

Accuracy of KNN with k = 1  with cross-validation: 0.8469291327909232
Accuracy of KNN with k = 2  with cross-validation: 0.8297994411133642
Accuracy of KNN with k = 3  with cross-validation: 0.8436727809879017
Accuracy of KNN with k = 4  with cross-validation: 0.8360673097951598
Accuracy of KNN with k = 5  with cross-validation: 0.8402528106493022
Accuracy of KNN with k = 6  with cross-validation: 0.836604065003692
Accuracy of KNN with k = 7  with cross-validation: 0.831097350914576
Accuracy of KNN with k = 8  with cross-validation: 0.8270599507475233
Accuracy of KNN with k = 9  with cross-validation: 0.8296612159024305


In [5]:
# Mean 10-fold cross-validated accuracy of three more classifiers on the
# training set. The (still unfitted) estimators gnb, svm and clf are kept
# for the validation checks further down.
gnb = GaussianNB()
accuracy = cross_val_score(gnb, X_train, y_train, scoring='accuracy', cv=10)
print("Accuracy of naive bayes on train test with cross-validation:", accuracy.mean())
# Build the estimator from the directly imported SVC class: the name `svm`
# is bound to the estimator from here on, so going through the `svm` module
# in this line would fail with AttributeError when the cell is re-run.
svm = SVC(gamma='scale')
accuracy = cross_val_score(svm, X_train, y_train, scoring='accuracy', cv=10)
print("Accuracy of SVM on train test with cross-validation:", accuracy.mean())
# A fixed random_state makes the tree (and its scores) reproducible between runs
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0)
accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=10)
print("Accuracy of Decision Tree on train test with cross-validation:", accuracy.mean())

Accuracy of naive bayes on train test with cross-validation: 0.8113725759646415
Accuracy of SVM on train test with cross-validation: 0.9138632019300758
Accuracy of Decision Tree on train test with cross-validation: 0.8866627566546755


In [6]:
# Read the prepared validation data and separate the 'Vote' target column
# from the remaining feature columns.
validation_set = pd.read_csv('prepared_validation.csv')
y_validation = validation_set['Vote']
X_validation = validation_set.drop(columns=['Vote'])

In [7]:
# Distinct party labels found in the validation target (np.unique sorts them)
parties = np.unique(y_validation.values)
# Share of every party among the validation rows, in percent
validation_hist = 100 * validation_set['Vote'].value_counts(normalize=True)

In [30]:
# KNN k=1 check: fit on the training set, then evaluate on the validation set
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_validation)
knn_pred = pd.DataFrame(data=knn_pred.flatten())
# Predicted share of every party, in percent. The Series is named 0
# explicitly because the default name of a value_counts() result depends on
# the pandas version ("proportion" from pandas 2.0 on).
knn_hist = (knn_pred[0].value_counts(normalize=True) * 100).rename(0)

print("classification report of knn k=1:")
print(classification_report(y_validation, knn_pred[0]))
# Pin the label order so the matrix rows/columns match the titles print_cm shows
cm = confusion_matrix(y_validation, knn_pred[0], labels=parties)
print_cm(cm, parties)
# Outer join + fillna(0): a party that is missing on one side (e.g. never
# predicted by the model) counts with a share of 0 instead of being dropped
# from the comparison.
d = knn_hist.to_frame().join(validation_hist.rename('Vote'), how='outer').fillna(0)
d['diff'] = (d['Vote'] - d[0]).abs()
print("Histogram compare - knn vs validation:")
print(d)
print("total diff =", d['diff'].sum())
print("correlation:", d['Vote'].corr(d[0]))

classification report of knn k=1:
              precision    recall  f1-score   support

       Blues       0.90      0.77      0.83        70
      Browns       0.83      0.89      0.86       133
      Greens       0.88      0.96      0.92        70
       Greys       0.87      0.89      0.88        65
      Khakis       0.78      0.84      0.81       259
     Oranges       0.93      0.74      0.82        54
       Pinks       0.85      0.70      0.77        40
     Purples       0.94      0.98      0.96       337
        Reds       0.81      0.92      0.87        66
  Turquoises       0.45      0.41      0.43        73
     Violets       0.52      0.43      0.47        54
      Whites       0.72      0.67      0.69        69
     Yellows       0.81      0.63      0.71        60

   micro avg       0.82      0.82      0.82      1350
   macro avg       0.79      0.76      0.77      1350
weighted avg       0.82      0.82      0.82      1350

                 Blues     Browns     Greens 

In [31]:
# NB check: fit on the training set, then evaluate on the validation set
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_validation)
gnb_pred = pd.DataFrame(data=gnb_pred.flatten())
# Predicted share of every party, in percent. The Series is named 0
# explicitly because the default name of a value_counts() result depends on
# the pandas version ("proportion" from pandas 2.0 on).
gnb_hist = (gnb_pred[0].value_counts(normalize=True) * 100).rename(0)
print("classification report of naive bayes:")
print(classification_report(y_validation, gnb_pred[0]))
# Pin the label order so the matrix rows/columns match the titles print_cm shows
cm = confusion_matrix(y_validation, gnb_pred[0], labels=parties)
print_cm(cm, parties)
# Outer join + fillna(0): a party that is missing on one side (e.g. never
# predicted by the model) counts with a share of 0 instead of being dropped
# from the comparison.
d = gnb_hist.to_frame().join(validation_hist.rename('Vote'), how='outer').fillna(0)
d['diff'] = (d['Vote'] - d[0]).abs()
print("Histogram compare - naive bayes vs validation:")
print(d)
print("total diff =", d['diff'].sum())
print("correlation:", d['Vote'].corr(d[0]))

classification report of naive bayes:
              precision    recall  f1-score   support

       Blues       0.94      0.91      0.93        70
      Browns       0.74      0.81      0.77       133
      Greens       1.00      0.97      0.99        70
       Greys       0.98      1.00      0.99        65
      Khakis       0.66      0.93      0.77       259
     Oranges       0.98      1.00      0.99        54
       Pinks       0.96      0.60      0.74        40
     Purples       0.95      0.95      0.95       337
        Reds       1.00      0.97      0.98        66
  Turquoises       1.00      0.01      0.03        73
     Violets       0.00      0.00      0.00        54
      Whites       0.59      0.55      0.57        69
     Yellows       0.64      0.95      0.77        60

   micro avg       0.82      0.82      0.82      1350
   macro avg       0.80      0.74      0.73      1350
weighted avg       0.81      0.82      0.78      1350

                 Blues     Browns     Gre

  'precision', 'predicted', average, warn_for)


In [32]:
# svm check: fit on the training set, then evaluate on the validation set
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_validation)
svm_pred = pd.DataFrame(data=svm_pred.flatten())
# Predicted share of every party, in percent. The Series is named 0
# explicitly because the default name of a value_counts() result depends on
# the pandas version ("proportion" from pandas 2.0 on).
svm_hist = (svm_pred[0].value_counts(normalize=True) * 100).rename(0)
print("classification report of SVM:")
print(classification_report(y_validation, svm_pred[0]))
# Pin the label order so the matrix rows/columns match the titles print_cm shows
cm = confusion_matrix(y_validation, svm_pred[0], labels=parties)
print_cm(cm, parties)
# Outer join + fillna(0): a party that is missing on one side (e.g. never
# predicted by the model) counts with a share of 0 instead of being dropped
# from the comparison.
d = svm_hist.to_frame().join(validation_hist.rename('Vote'), how='outer').fillna(0)
d['diff'] = (d['Vote'] - d[0]).abs()
print("Histogram compare - SVM vs validation:")
print(d)
print("total diff =", d['diff'].sum())
print("correlation:", d['Vote'].corr(d[0]))

classification report of SVM:
              precision    recall  f1-score   support

       Blues       0.95      0.87      0.91        70
      Browns       0.88      0.96      0.92       133
      Greens       0.97      0.97      0.97        70
       Greys       0.98      0.98      0.98        65
      Khakis       0.86      0.97      0.91       259
     Oranges       1.00      0.96      0.98        54
       Pinks       0.96      0.60      0.74        40
     Purples       0.97      1.00      0.98       337
        Reds       0.97      1.00      0.99        66
  Turquoises       0.90      0.71      0.79        73
     Violets       0.82      0.57      0.67        54
      Whites       0.92      0.80      0.85        69
     Yellows       0.82      0.90      0.86        60

   micro avg       0.92      0.92      0.92      1350
   macro avg       0.92      0.87      0.89      1350
weighted avg       0.92      0.92      0.92      1350

                 Blues     Browns     Greens     

In [33]:
# decision tree check: fit on the training set, then evaluate on the validation set
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_validation)
clf_pred = pd.DataFrame(data=clf_pred.flatten())
# Predicted share of every party, in percent. The Series is named 0
# explicitly because the default name of a value_counts() result depends on
# the pandas version ("proportion" from pandas 2.0 on).
clf_hist = (clf_pred[0].value_counts(normalize=True) * 100).rename(0)
print("classification report of decision tree:")
print(classification_report(y_validation, clf_pred[0]))
# Pin the label order so the matrix rows/columns match the titles print_cm shows
cm = confusion_matrix(y_validation, clf_pred[0], labels=parties)
print_cm(cm, parties)
# Outer join + fillna(0): a party that is missing on one side (e.g. never
# predicted by the model) counts with a share of 0 instead of being dropped
# from the comparison.
d = clf_hist.to_frame().join(validation_hist.rename('Vote'), how='outer').fillna(0)
d['diff'] = (d['Vote'] - d[0]).abs()
print("Histogram compare - decision tree vs validation:")
print(d)
print("total diff =", d['diff'].sum())
print("correlation:", d['Vote'].corr(d[0]))

classification report of decision tree:
              precision    recall  f1-score   support

       Blues       0.91      0.90      0.91        70
      Browns       0.83      0.89      0.86       133
      Greens       1.00      0.97      0.99        70
       Greys       0.93      0.97      0.95        65
      Khakis       0.91      0.89      0.90       259
     Oranges       0.91      0.80      0.85        54
       Pinks       0.94      0.82      0.88        40
     Purples       0.96      0.98      0.97       337
        Reds       0.94      0.95      0.95        66
  Turquoises       0.75      0.77      0.76        73
     Violets       0.58      0.70      0.64        54
      Whites       0.82      0.68      0.75        69
     Yellows       0.80      0.80      0.80        60

   micro avg       0.89      0.89      0.89      1350
   macro avg       0.87      0.86      0.86      1350
weighted avg       0.89      0.89      0.89      1350

                 Blues     Browns     G

In [None]:
# Load the origin file 'ElectionsData.csv' into a DataFrame
df = pd.read_csv('ElectionsData.csv')
# Emit an empty line in the cell output
print()
# Percentage of rows for each distinct value of the 'Vote' column
parties_by_percents = df['Vote'].value_counts(normalize=True) * 100
print("The division of voters between the various parties:")