In [1]:
def _loadData(path="final_data.csv"):
    """Read a headered, comma-separated numeric file into a list of float rows.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is stripped, split on "," and each field is
    converted with ``float``.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to "final_data.csv" so existing
        zero-argument calls keep reading the same file.

    Returns
    -------
    list of list of float
        One inner list per data line, in file order. Empty if the file has
        no data lines.

    Raises
    ------
    ValueError
        If a field on a non-blank data line cannot be parsed by ``float``.
    """
    finalList = []

    with open(path) as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. an extra newline at the end of the file)
                # would otherwise reach float("") and raise ValueError.
                continue
            finalList.append([float(field) for field in stripped.split(",")])

    return finalList

In [9]:
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn import neighbors
import numpy as np

In [3]:
# Turn the parsed CSV rows into one NumPy array, then display its dimensions
data = np.asarray(_loadData())
data.shape

(176456, 194)

In [4]:
# Features: every column except the final one (all rows kept).
# Labels: the final column.
X_matrix, Y = data[:, :-1], data[:, -1]

# Hold out 20% of the samples for testing; a fixed random_state is passed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_matrix,
    Y,
    random_state=33,
    test_size=0.2,
)

In [5]:
# Logistic regression (a classifier, despite the "regr" variable name)
regr = linear_model.LogisticRegression()

# Fit the model on the training split
regr.fit(X_train, y_train)

# The fitted coefficients
print('Coefficients:', regr.coef_)
# Mean of the squared (prediction - label) differences on the test split.
# This is a mean, not a sum, so it is reported as MSE rather than RSS.
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# For a classifier, score() is mean accuracy on the given data,
# not an explained-variance score.
print('Accuracy: %.2f' % regr.score(X_test, y_test))

('Coefficients:', array([[  3.72452877e-01,   2.47995660e-01,   2.32995513e-01,
         -6.00822458e-03,   1.81739900e-01,  -2.31890533e-01,
         -6.87529631e-02,   4.03510531e-01,   4.64695304e-01,
          6.46490798e-02,   1.65239188e+00,   3.21543469e-01,
          1.76056889e+00,   2.78339180e+00,   5.58917499e-02,
          1.16178360e-01,   9.10971920e-01,   1.17121822e+00,
         -2.39370160e-01,  -1.91244251e-01,   6.45625717e-02,
          4.82799673e-01,  -1.59853148e-01,   3.94111541e-02,
          2.00169467e-02,   2.36420510e-01,   2.23431399e-01,
         -3.45043073e-01,   5.67704069e-01,   1.77271198e-01,
         -6.66513183e-01,  -1.47458321e-03,  -1.32119171e-01,
         -2.33017481e-01,   7.84619226e-02,   6.54707580e-02,
          9.28110499e-01,  -2.69435735e-02,  -3.44370436e-02,
          4.46351728e-01,  -1.80850699e-02,  -1.39268601e-01,
         -2.51662008e-01,   5.41233685e-02,   2.04245307e-01,
          5.41216074e-02,   7.78868090e-01,   9.1144

In [10]:
# 5-fold cross-validation over the full data set (shuffled, fixed seed)
kf = KFold(5, shuffle=True, random_state=33)

# One entry per fold for each metric
recallList = list()
precisionList = list()
f1List = list()
accuracyList = list()

for train_index, test_index in kf.split(X_matrix):
    # Refit the shared `regr` estimator on this fold's training rows;
    # once the loop ends it holds the fit from the last fold.
    regr.fit(X_matrix[train_index], Y[train_index])

    test = Y[test_index]
    # Fixed: the original line had a stray ")" that made the cell a SyntaxError.
    pred = regr.predict(X_matrix[test_index])

    recallList.append(recall_score(test, pred))
    precisionList.append(precision_score(test, pred))
    f1List.append(f1_score(test, pred))
    accuracyList.append(accuracy_score(test, pred))


# Report mean and standard deviation across folds for each metric.
# NOTE(review): the recorded run shows every metric at ~1.0; worth confirming
# that no feature column leaks the label.
print("Mean precision: " + str(np.mean(precisionList)))
print("StDev precision: " + str(np.std(precisionList)))
print("")
print("Mean recall: " + str(np.mean(recallList)))
print("StDev recall: " + str(np.std(recallList)))
print("")
print("Mean f1: " + str(np.mean(f1List)))
print("StDev f1: " + str(np.std(f1List)))
print("")
print("Mean accuracy: " + str(np.mean(accuracyList)))
print("StDev accuracy: " + str(np.std(accuracyList)))

Mean precision: 0.999953004151
StDev precision: 2.97656324108e-05

Mean recall: 1.0
StDev recall: 0.0

Mean f1: 0.999976501302
StDev f1: 1.48835171761e-05

Mean accuracy: 0.999971664324
StDev accuracy: 1.7921156446e-05
