In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

  from numpy.core.umath_tests import inner1d


In [3]:
# Read the train and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("TrainDataFrame.csv", "TestDataFrame.csv")
)

In [4]:
# Discard the first CSV column (presumably a saved row index -- confirm) by
# promoting it to the index and then replacing that index with a fresh
# 0..n-1 range. Chained and rebound instead of two inplace=True mutations,
# which is the idiomatic pandas form and yields the same resulting frame.
# NOTE: this cell is still not idempotent -- re-running it without reloading
# the CSV would drop one more column.
train_df = (
    train_df
    .set_index(train_df.columns[0])
    .reset_index(drop=True)
)

In [5]:
# Discard the first CSV column (presumably a saved row index -- confirm) by
# promoting it to the index and then replacing that index with a fresh
# 0..n-1 range. Chained and rebound instead of two inplace=True mutations,
# which is the idiomatic pandas form and yields the same resulting frame.
# NOTE: this cell is still not idempotent -- re-running it without reloading
# the CSV would drop one more column.
test_df = (
    test_df
    .set_index(test_df.columns[0])
    .reset_index(drop=True)
)

In [6]:
# The last column of each frame is the target; every column before it is a feature.
x_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
x_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# Sanity check: display the shapes of all four pieces.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((15997, 6000), (15997,), (4000, 6000), (4000,))

In [7]:
# Baseline: multinomial naive Bayes with default hyper-parameters.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement and reports elapsed
# wall-clock seconds.
print("Multinomial Naive Bayes")
mnb = MultinomialNB()
start = time.perf_counter()
mnb.fit(x_train, y_train)
end = time.perf_counter()
print("Time taken =", end-start)
# Mean accuracy on the test split (shown as the cell result).
mnb.score(x_test, y_test)

Multinomial Naive Bayes
Time taken = 28.789185318233244


0.74725

In [8]:
# Baseline: logistic regression with default hyper-parameters.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement and reports elapsed
# wall-clock seconds.
print("Logistic Regression")
lr = LogisticRegression()
start = time.perf_counter()
lr.fit(x_train, y_train)
end = time.perf_counter()
print("Time taken =", end-start)
# Mean accuracy on the test split (shown as the cell result).
lr.score(x_test, y_test)

Logistic Regression
Time taken = 44.15120960123165


0.7315

In [9]:
# Baseline: single decision tree.
# random_state is pinned (same seed as the Max Voting models) so the fitted
# tree and its score are reproducible across kernel restarts.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement and reports elapsed
# wall-clock seconds.
print("Decision Tree")
dt = DecisionTreeClassifier(random_state=1)
start = time.perf_counter()
dt.fit(x_train, y_train)
end = time.perf_counter()
print("Time taken =", end-start)
# Mean accuracy on the test split (shown as the cell result).
dt.score(x_test, y_test)

Decision Tree
Time taken = 41.61283803170086


0.5515

In [10]:
# Baseline: random forest with default hyper-parameters.
# random_state is pinned (same seed as the Max Voting models) so the fitted
# forest and its score are reproducible across kernel restarts.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement and reports elapsed
# wall-clock seconds.
print("Random Forest")
rf = RandomForestClassifier(random_state=1)
start = time.perf_counter()
rf.fit(x_train, y_train)
end = time.perf_counter()
print("Time taken =", end-start)
# Mean accuracy on the test split (shown as the cell result).
rf.score(x_test, y_test)

Random Forest
Time taken = 11.885743010765253


0.6265

In [11]:
# Baseline: extremely randomized trees with default hyper-parameters.
# random_state is pinned (same seed as the Max Voting models) so the fitted
# ensemble and its score are reproducible across kernel restarts.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement and reports elapsed
# wall-clock seconds.
print("Extra Trees")
et = ExtraTreesClassifier(random_state=1)
start = time.perf_counter()
et.fit(x_train, y_train)
end = time.perf_counter()
print("Time taken =", end-start)
# Mean accuracy on the test split (shown as the cell result).
et.score(x_test, y_test)

Extra Trees
Time taken = 16.558045230291924


0.66125

# Ensemble Models

## Max Voting

In [14]:
from sklearn.ensemble import VotingClassifier

In [31]:
# Candidate base learners; the seed is fixed wherever the estimator takes one.
model1 = LogisticRegression(random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model3 = RandomForestClassifier(random_state=1)
model4 = ExtraTreesClassifier(random_state=1)
model5 = MultinomialNB()

# Hard (majority-label) vote over four of the learners.
# model2 (the decision tree) is defined above but is not part of this ensemble.
model = VotingClassifier(
    estimators=[
        ('lr', model1),
        ('rf', model3),
        ('et', model4),
        ('mnb', model5),
    ],
    voting='hard',
)

In [32]:
# Fit the four-model voting ensemble and report the elapsed wall-clock seconds.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement.
start = time.perf_counter()
model.fit(x_train, y_train)
end = time.perf_counter()
print(end-start)

59.73191523110836


In [33]:
# Mean accuracy of the voting ensemble on the test split.
model.score(X=x_test, y=y_test)

  if diff:


0.73425

In [34]:
# Two-model ensemble: naive Bayes + logistic regression.
# voting='hard' is the VotingClassifier default; it is spelled out here so this
# cell reads the same as the other voting cells.
model = VotingClassifier(estimators=[('mnb',model5),('lr',model1)], voting='hard')
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement (elapsed wall-clock seconds).
start = time.perf_counter()
model.fit(x_train, y_train)
end = time.perf_counter()
print(end-start)
# Mean accuracy on the test split (shown as the cell result).
model.score(x_test, y_test)

33.9791991975635


  if diff:


0.734

In [35]:
# Two-model ensemble: random forest + extra trees, majority-label vote.
model = VotingClassifier(estimators=[('rf',model3),('et',model4)], voting='hard')
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement (elapsed wall-clock seconds).
start = time.perf_counter()
model.fit(x_train, y_train)
end = time.perf_counter()
print(end-start)
# Mean accuracy on the test split (shown as the cell result).
model.score(x_test, y_test)

23.190333920387275


  if diff:


0.62675

## Averaging

In [37]:
# Fresh, unfitted estimators for the probability-averaging experiment.
# Seeds are pinned to the same value used in the Max Voting section so the
# averaged probabilities are reproducible across kernel restarts.
model1 = LogisticRegression(random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model3 = RandomForestClassifier(random_state=1)
model4 = ExtraTreesClassifier(random_state=1)
model5 = MultinomialNB()  # takes no random_state

In [38]:
# Fit the three estimators whose probabilities are averaged below
# (logistic regression, decision tree, naive Bayes) and report the total
# elapsed wall-clock seconds.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement.
start = time.perf_counter()
model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model5.fit(x_train, y_train)
end = time.perf_counter()
print(end-start)

105.63434329768279


In [39]:
# Per-class probability estimates on the test split from each fitted model.
# Note the numbering: pred3 comes from model5 (naive Bayes), not model3.
pred1, pred2, pred3 = (
    fitted.predict_proba(x_test) for fitted in (model1, model2, model5)
)

In [40]:
# Unweighted mean of the three probability matrices.
finalpred = np.divide(pred1 + pred2 + pred3, 3)

In [42]:
# Dimensions of the averaged probability matrix.
np.shape(finalpred)

(4000, 20)

In [43]:
# Show the averaged class-probability matrix (numpy abbreviates the repr).
finalpred

array([[9.25646277e-01, 1.52834573e-05, 1.93924999e-04, ...,
        3.43057729e-04, 1.11211695e-02, 5.32403596e-02],
       [4.26191521e-02, 1.95227886e-05, 5.51529205e-05, ...,
        3.35192227e-06, 3.16107988e-06, 9.56776701e-01],
       [3.16760637e-01, 1.46509523e-06, 4.96799361e-08, ...,
        4.95417758e-05, 2.25680013e-04, 6.82172366e-01],
       ...,
       [9.83756344e-01, 9.02885436e-05, 1.80242652e-04, ...,
        2.13172804e-04, 4.14321541e-05, 1.19920582e-02],
       [9.73101562e-01, 8.61011533e-09, 5.58092019e-07, ...,
        1.80621063e-05, 2.05668561e-05, 1.90673502e-02],
       [9.79301479e-01, 1.15097277e-04, 3.66005870e-05, ...,
        9.90446755e-04, 2.09416048e-04, 1.86613934e-02]])