In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
import time
import math
import pickle

In [2]:
# Make pandas render every column instead of truncating wide frames
pd.set_option('display.max_columns', None)

# Locations of the pre-split feature / target CSV files
filename_Xtrain, filename_ytrain = '../data/X_train.csv', '../data/y_train.csv'
filename_Xtest, filename_ytest = '../data/X_test.csv', '../data/y_test.csv'

# Load each split into its own dataframe (train pair first, then test pair)
X_train, y_train = (pd.read_csv(csv_path)
                    for csv_path in (filename_Xtrain, filename_ytrain))
X_test, y_test = (pd.read_csv(csv_path)
                  for csv_path in (filename_Xtest, filename_ytest))

In [3]:
# Reduce each target frame to its first column so downstream code works with
# a Series rather than a single-column DataFrame.
# The isinstance guard keeps this cell idempotent: once the targets are
# already Series, re-running `.iloc[:, 0]` on them would raise an indexing
# error, so they are left untouched on a second execution.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.iloc[:, 0]
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.iloc[:, 0]

### Load decision tree

In [4]:
# Load the persisted decision tree model from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = '../models/DecisionTree.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    DT_model = pickle.load(model_file)

# Score the loaded model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy).
result = DT_model.score(X_test, y_test)
print(result)

0.6440913103796191


### Load Random Forest

In [5]:
# Load the persisted random forest model from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = '../models/RandomForest.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    RF_model = pickle.load(model_file)

# Score the loaded model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy).
result = RF_model.score(X_test, y_test)
print(result)

0.6719636776390465


### Load KNN

In [6]:
# Load the persisted k-nearest-neighbours model from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = '../models/knn.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    knn_model = pickle.load(model_file)

# Score the loaded model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy).
result = knn_model.score(X_test, y_test)
print(result)

0.5974271660991298


### Load LinearSVC

In [7]:
# Load the persisted LinearSVC model from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = '../models/LinearSVC.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    linearSVC_model = pickle.load(model_file)

# Score the loaded model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy).
result = linearSVC_model.score(X_test, y_test)
print(result)

0.6051204439399672


### Load Logistic Regression

In [8]:
# Load the persisted logistic regression ("logit") model from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = '../models/Logit.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    logit_model = pickle.load(model_file)

# Score the loaded model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy).
result = logit_model.score(X_test, y_test)
print(result)

0.6041114894690377


### Ensembling (Voting Classifier)

In [9]:
from sklearn.ensemble import VotingClassifier

The `VotingClassifier` offers two options: `voting='hard'` or `voting='soft'`. The difference is explained further in the report. However, if we want to include the LinearSVC model we have to use hard voting, because `LinearSVC` does not provide a `predict_proba` method.

In [10]:
# Members of the majority-vote ensemble, as (label, estimator) pairs.
# LinearSVC can take part here because hard voting only needs class labels.
hard_vote_members = [
    ("randomForest", RF_model),
    ("knn", knn_model),
    ("svc", linearSVC_model),
    ("logit", logit_model),
]

voting_hard = VotingClassifier(estimators=hard_vote_members, voting='hard')

# Fit on the training split; left as the last expression so the notebook
# displays the fitted ensemble.
voting_hard.fit(X_train, y_train)

VotingClassifier(estimators=[('randomForest',
                              RandomForestClassifier(max_depth=15,
                                                     max_features='sqrt',
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(metric='euclidean',
                                                   n_neighbors=47)),
                             ('svc',
                              LinearSVC(C=0.03162277660168379, dual=False)),
                             ('logit',
                              LogisticRegression(C=0.1, max_iter=10000))])

In [11]:
from sklearn.metrics import accuracy_score

# Predict the test split with the hard-voting ensemble and report accuracy
voting_hard_pred = voting_hard.predict(X_test)
hard_vote_accuracy = accuracy_score(y_test, voting_hard_pred)
print("Voting: ", hard_vote_accuracy)

Voting:  0.6297137091688737


We now try soft voting, which requires removing `linearSVC_model` from the ensemble.

In [12]:
# Members of the soft-voting ensemble, as (label, estimator) pairs.
# linearSVC_model is left out of this ensemble.
soft_vote_members = [
    ("randomForest", RF_model),
    ("knn", knn_model),
    ("logit", logit_model),
]

voting_soft = VotingClassifier(estimators=soft_vote_members, voting='soft')

# Fit on the training split; left as the last expression so the notebook
# displays the fitted ensemble.
voting_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('randomForest',
                              RandomForestClassifier(max_depth=15,
                                                     max_features='sqrt',
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(metric='euclidean',
                                                   n_neighbors=47)),
                             ('logit',
                              LogisticRegression(C=0.1, max_iter=10000))],
                 voting='soft')

In [13]:
from sklearn.metrics import accuracy_score

# Predict the test split with the soft-voting ensemble and report accuracy
voting_soft_pred = voting_soft.predict(X_test)
soft_vote_accuracy = accuracy_score(y_test, voting_soft_pred)
print("Voting: ", soft_vote_accuracy)

Voting:  0.6563248833396393


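We see that the ensemble does not improve performance as expected: the accuracy of the soft-voting classifier (0.656) is lower than the accuracy of the best single model, Random Forest (0.672), and hard voting does worse still (0.630).
<br> Therefore, we conclude that the best model is definitely Random Forest.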