In [1]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

def accuracy(pred, ground_truth):
    """Print and return binary-classification metrics for 0/1 labels.

    Predictions and ground-truth labels are compared pairwise, in order.
    Only pairs whose values are both 0 or 1 are tallied in the confusion
    matrix; every prediction still counts towards the reported total.

    Undefined metrics are reported with the sentinel ``-1`` instead of
    raising ``ZeroDivisionError``:

    * no positive predictions (``tp + fp == 0``): precision, recall and
      F1 are all ``-1``;
    * no actual positives among the tallied pairs (``tp + fn == 0``):
      recall is ``-1``;
    * empty ``pred``: accuracy is ``-1``.

    When positives were predicted but none was correct, F1 is ``0.0``.

    Args:
        pred: sequence of predicted labels.
        ground_truth: sequence of true labels, expected to be the same
            length as ``pred``.

    Returns:
        Tuple ``(f1, precision, recall)``.
    """
    tp = tn = fp = fn = 0

    total = len(pred)
    for predicted, actual in zip(pred, ground_truth):
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        elif predicted == 0 and actual == 1:
            fn += 1

    # The divisions below rely on true division (the file imports
    # `division` from __future__ for Python 2).
    acc = (tp + tn) / total if total else -1

    if tp + fp == 0:
        # Nothing was predicted positive: keep the sentinel convention.
        precision = -1
        recall = -1
        f1 = -1
    else:
        precision = tp / (tp + fp)
        # Recall is undefined when there are no actual positives.
        recall = tp / (tp + fn) if (tp + fn) else -1
        # With tp == 0 the harmonic mean would divide by zero; F1 is 0 then.
        f1 = 2 * precision * recall / (precision + recall) if tp else 0.0

    print("True Positives: {}".format(tp))
    print("True Negatives: {}".format(tn))
    print("False Positives: {}".format(fp))
    print("False Negatives: {}".format(fn))
    print("Total number of data: {}".format(total))

    print("Accuracy: {}".format(acc))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1: {}".format(f1))

    return f1, precision, recall

# cross validation
trainX = pd.read_csv('results/train/features.csv', sep=',', index_col=0).values
trainY = pd.read_csv('results/train/labels.csv', sep=',', index_col=0, header=None).values.flatten()

X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.4, random_state=0)


clf = svm.SVC(kernel='linear', C=5).fit(X_train, y_train)
clf.score(X_test, y_test)
pred = clf.predict(X_test)
print('-----------------SVM results----------------')
accuracy(pred, y_test)


clf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0).fit(X_train, y_train)
pred = clf.predict(X_test)
print('-----------------RF results----------------')
accuracy(pred, y_test)


clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=0).fit(X_train, y_train)
pred = clf.predict(X_test)
print('-----------------DT results----------------')
F1, Precision, Recall = accuracy(pred, y_test)


clf = LogisticRegression(fit_intercept=True).fit(X_train, y_train)
pred = clf.predict(X_test)
print('-----------------LogR results----------------')
accuracy(pred, y_test)

reg = LinearRegression().fit(X_train, y_train)
p = reg.predict(X_test)
pred = []
for i in range(0, len(p)):
    if p[i] > 0:
        pred.append(1)
    else:
        pred.append(0)
print('-----------------LinearR results----------------')
accuracy(pred, y_test)

print('\n')
print'Best classifier M --> Decision Tree'
print 'F1:', F1
print 'Precision:', Precision
print 'Recall:', Recall

-----------------SVM results----------------
True Positives: 760
True Negatives: 2435
False Positives: 240
False Negatives: 281
Total number of data: 3716
Accuracy: 0.85979547901
Precision: 0.76
Recall: 0.730067243036
F1: 0.744732974032
-----------------RF results----------------
True Positives: 564
True Negatives: 2574
False Positives: 101
False Negatives: 477
Total number of data: 3716
Accuracy: 0.844456404736
Precision: 0.848120300752
Recall: 0.541786743516
F1: 0.661195779601
-----------------DT results----------------
True Positives: 796
True Negatives: 2475
False Positives: 200
False Negatives: 245
Total number of data: 3716
Accuracy: 0.880247578041
Precision: 0.799196787149
Recall: 0.7646493756
F1: 0.781541482572
-----------------LogR results----------------
True Positives: 745
True Negatives: 2462
False Positives: 213
False Negatives: 296
Total number of data: 3716
Accuracy: 0.863024757804
Precision: 0.777661795407
Recall: 0.715658021134
F1: 0.745372686343
-----------------Linea

  linalg.lstsq(X, y)
