In [30]:
%matplotlib inline 

import sys
import os
import statistics
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

trainX = pd.read_csv('trainingData.txt','\t', header = None)
trainX.drop(trainX.columns[len(trainX.columns)-1], axis = 1, inplace = True)
trainY = pd.read_csv("trainingTruth.txt", header = None, names = ['Y'])
df = trainX.join(trainY)
# relax the limit a bit, since the cross_val_score is dropping with 1
index = df.isnull().sum(axis=1) <= 2
df = df[index]
df.fillna(df.median(), inplace = True)  
# Is it better to delete the rows with NA in the training? Fill in median could mislead the classifier.
# How about dropping all the rows with NA using the following line?
# df.dropna(axis=0, inplace=True) # drop the row with NA in training.
X = df.iloc[:,0:-1].values
Y = df['Y'].values

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
# Need to upgrade scikit-learn: 0.16.1-np110py34_0 --> 0.17-np110py34_1
from sklearn.cross_validation import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
clf3 = GaussianNB()

clf4 = DecisionTreeClassifier(max_depth=4)
clf5 = KNeighborsClassifier(n_neighbors=7)
clf6 = SVC(kernel='rbf', probability=True)

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                     ('dt', clf4), ('kn', clf5), ('svc', clf6)], 
                         voting='hard')
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                     ('dt', clf4), ('kn', clf5), ('svc', clf6)], 
                         voting='soft')
#eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[2,1,1])

In [4]:
score1 = cross_val_score(eclf1, X, Y, cv=10)
score2 = cross_val_score(eclf2, X, Y, cv=10)
#score3 = cross_val_score(eclf3, X, Y, cv=10)

print('hard voting', np.mean(score1), np.std(score1))
print('soft voting', np.mean(score2), np.std(score2))
#print('soft voting by weight', np.mean(score3), np.std(score3))

hard voting 0.853845957819 0.00799991819654
soft voting 0.868338889238 0.00540095906608


In [4]:
# Reduce X dimension, Test if the results stay the same
from sklearn.decomposition import PCA
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)
X_PCA = PCA(n_components=30).fit_transform(X_scaled)

score1 = cross_val_score(eclf1, X_PCA, Y, cv=10)
score2 = cross_val_score(eclf2, X_PCA, Y, cv=10)

print('hard voting', np.mean(score1), np.std(score1))
print('soft voting', np.mean(score2), np.std(score2))

hard voting 0.887770306248 0.006194520852
soft voting 0.889523889502 0.00437884476615


In [5]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)
X_PCA = PCA(n_components=30).fit_transform(X_scaled)

print(cross_val_score(clf1, X_PCA, Y, cv=10))
print(cross_val_score(clf2, X_PCA, Y, cv=10))
print(cross_val_score(clf3, X_PCA, Y, cv=10))
print(cross_val_score(clf4, X_PCA, Y, cv=10))
print(cross_val_score(clf5, X_PCA, Y, cv=10))
print(cross_val_score(clf6, X_PCA, Y, cv=10))

[ 0.84805195  0.85324675  0.84535413  0.84015595  0.86289799  0.84795322
  0.84080572  0.85045514  0.84980494  0.84710475]
[ 0.86753247  0.87792208  0.86159844  0.8602989   0.87264457  0.86159844
  0.87004548  0.86345904  0.87516255  0.86532206]
[ 0.87012987  0.87727273  0.86419753  0.85834958  0.87979207  0.86289799
  0.86224821  0.86736021  0.86345904  0.86662329]
[ 0.81688312  0.82012987  0.80116959  0.79597141  0.82131254  0.79922027
  0.79402209  0.78153446  0.79323797  0.81522446]
[ 0.87727273  0.88051948  0.8648473   0.87069526  0.88304094  0.86419753
  0.87719298  0.86996099  0.87711313  0.870527  ]
[ 0.90584416  0.91038961  0.90253411  0.89408707  0.90643275  0.90903184
  0.89993502  0.90052016  0.91092328  0.90761223]


### SVC is most accurate.

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

X_scaled = preprocessing.scale(X)
X_PCA = PCA(n_components=30).fit_transform(X_scaled)

clf7 = AdaBoostClassifier(random_state=1, base_estimator=clf6, algorithm='SAMME.R')
print(cross_val_score(clf7, X_PCA, Y, cv=10))