In [43]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, BayesianRidge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
import sklearn.model_selection as ms
from sklearn.svm import SVC
import random

In [2]:
n = 6325 #number of records in file
s = 2000 #desired sample size
filename = "train.csv"
skip = sorted(random.sample(xrange(n),n-s))
df_train_sample = pd.read_csv(filename, skiprows=skip, header=None)

In [3]:
# df_train_sample = pd.read_csv("train.csv", nrows=1000, header=None)
df_test = pd.read_csv("test.csv", header=None)
df_test.drop(df_test.columns[[0]], axis=1, inplace=True)
Y_train = df_train_sample.iloc[:,-1]
df_train_sample.drop(df_train_sample.columns[[-1]], axis=1, inplace=True)
df_train_sample.head()


Unnamed: 0,0.000000000000000000e+00,7.122828847711242667e-04,1.423987452924897062e-03,2.134535857390711718e-03,2.843351189810218362e-03,3.549857948940669979e-03,4.253482507915815579e-03,4.953653579984939985e-03,5.649802682351916933e-03,6.341364597737685245e-03,...,-7.027777833283280727e-03,-6.341364597729205049e-03,-5.649802682343052496e-03,-4.953653579975694776e-03,-4.253482507906193068e-03,-3.549857948941927219e-03,-2.843351189811149909e-03,-2.134535857391314968e-03,-1.423987452925169197e-03,-7.122828847710639850e-04
0,0.020292,0.141777,0.275296,0.111601,0.224555,0.226531,0.062487,0.259656,-0.077982,-0.092875,...,-0.043492,0.09503,0.080447,0.104674,-0.004612,0.013853,0.000293,-0.091339,-0.091172,-0.013201
1,0.004376,0.013978,0.019232,0.02349,0.025136,0.024229,0.021665,0.017912,0.011754,0.004436,...,-0.021485,-0.027345,-0.0319,-0.034985,-0.03623,-0.035834,-0.033799,-0.03001,-0.024691,-0.018381
2,0.038217,0.084984,0.093411,0.094486,0.077482,0.061309,0.053731,0.054602,0.057732,0.064238,...,-0.004686,-0.011227,-0.016656,-0.020634,-0.022666,-0.022613,-0.020479,-0.016436,-0.011126,-0.004705
3,0.000392,0.010536,0.01676,0.018704,0.020736,0.019504,0.012633,0.005153,0.000507,-0.010927,...,0.005375,-0.000515,-0.008925,-0.015868,-0.012429,-0.011004,-0.01279,-0.009779,-0.010296,-0.001233
4,0.014672,0.030505,0.033381,0.041119,0.043278,0.044334,0.044791,0.047787,0.049341,0.049644,...,-0.028221,-0.027865,-0.027728,-0.026717,-0.024923,-0.023367,-0.020023,-0.01634,-0.013982,-0.008613


In [4]:
X_train, X_validate, y_train, y_validate = ms.train_test_split(df_train_sample, Y_train, test_size=0.2)

In [5]:
LR = LogisticRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_validate)

In [6]:
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_validate)

In [7]:
MLP = MLPClassifier()
MLP.fit(X_train, y_train)
MLP_pred = MLP.predict(X_validate)

In [8]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
KNN_pred = KNN.predict(X_validate)

In [44]:
SV = SVC()
SV.fit(X_train, y_train)
SV_pred = SV.predict(X_validate)

In [9]:
def accuracy(predictions, actual):
    return (np.sum(predictions == actual) / float(len(predictions)))

In [15]:
def similarity(pred1, pred2):
    a = np.loadtxt(pred1, skiprows=1, usecols=1, delimiter=',')
    b = np.loadtxt(pred2, skiprows=1, usecols=1, delimiter=',')
    acc = accuracy(a, b)
    del a
    del b
    return acc

In [21]:
def objfilesim(obj, f):
    a = np.loadtxt(f, skiprows=1, usecols=1, delimiter=',')
    acc = accuracy(obj, a)
    del a
    return acc

In [45]:
print accuracy(LR_pred, y_validate)
print accuracy(RF_pred, y_validate)
print accuracy(MLP_pred, y_validate)
print accuracy(KNN_pred, y_validate)
print accuracy(SV_pred, y_validate)

0.9925
0.6875
0.995
0.905
0.29


In [35]:
LR_test_pred = LR.predict(df_test)
KNN_test_pred = KNN.predict(df_test)

In [46]:
RF_test_pred = RF.predict(df_test)
SV_test_pred = SV.predict(df_test)

In [37]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i) + "," + str(int(p)) + "\n")

In [47]:
write_to_file('lrpred.csv', LR_test_pred)
write_to_file('knnpred.csv', KNN_test_pred)
write_to_file('rfpred.csv', RF_test_pred)
write_to_file('svpred.csv', SV_test_pred)

In [34]:
print similarity('phil.csv', 'lrpred.csv')
print objfilesim(LR_test_pred, 'phil.csv')

0.715
0.715


In [56]:
def write_convert_zeros(filename, predictions, cls, arr):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            if i in arr:
                f.write(str(i) + "," + str(int(cls)) + "\n")
            else:
                f.write(str(i) + "," + str(int(p)) + "\n")

In [57]:
rws = np.where(~df_test.any(axis=1))[0]

In [59]:
write_convert_zeros('knn5.csv', KNN_test_pred, 5, rws)