In [2]:
import numpy as np
import os
import random
from sklearn import tree
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
#from sklearn.model_selection import cross_validate

# --- Load data ---------------------------------------------------------------
# train.npy / test.npy are 2-D arrays: every column except the last is a
# feature, the last column is the label.
folderPath = '../'
trainArray = np.load(os.path.join(folderPath, 'train.npy'))
testArray = np.load(os.path.join(folderPath, 'test.npy'))

trainData = trainArray[:, 0:-1]
trainLabel = trainArray[:, -1]

testData = testArray[:, 0:-1]
testLabel = testArray[:, -1]

## train split for debugging
# Metadata files are read as one list entry per line (trailing newline kept).
# NOTE(review): line i is assumed to describe row i of the matching array --
# confirm against whatever produced these files.
# `with` guarantees the file handles are closed; the bare
# open(...).readlines() used previously leaked them.
trainMetaDataFile = os.path.join(folderPath, 'trainMetaData.txt')
with open(trainMetaDataFile) as metaFile:
    trainMetadata = metaFile.readlines()

testMetaDataFile = os.path.join(folderPath, 'testMetaData.txt')
with open(testMetaDataFile) as metaFile:
    testMetaData = metaFile.readlines()

#80-20 split
# Hold out a random 20% of the training rows as a validation set.
nValSize = int(0.2 * trainData.shape[0])
nTrainSize = trainData.shape[0] - nValSize

# A dedicated, seeded generator keeps the split identical on every
# "Restart & Run All" without disturbing the global `random` state.
# The indices are sorted because validation rows are stored in ascending
# source order below; sorting makes position i of splitIdxs the source row
# of nValData[i], which is what valMetaMapping promises.
splitIdxs = sorted(random.Random(99).sample(range(0, trainData.shape[0]), nValSize))

# validation-row position -> row index in the original training array
valMetaMapping = {i: sidx for i, sidx in enumerate(splitIdxs)}

# Take the feature count from the data instead of hard-coding it.
nFeatures = trainData.shape[1]

# Boolean row mask: True for validation rows, False for training rows.
# Replaces the per-row `idx in splitIdxs` list scan (quadratic overall).
isValRow = np.zeros(trainData.shape[0], dtype=bool)
isValRow[np.asarray(splitIdxs, dtype=np.intp)] = True

# Boolean indexing preserves the original row order; float64 matches the
# dtype of the zero-initialised buffers this code used to fill row by row.
nValData = trainData[isValRow].astype(np.float64)
nValLabel = trainLabel[isValRow].astype(np.float64)

nTrainData = trainData[~isValRow].astype(np.float64)
nTrainLabel = trainLabel[~isValRow].astype(np.float64)

# Metadata lines for the validation rows, aligned with nValData.
valMetaData = [trainMetadata[idx] for idx in splitIdxs]

# Row counters kept for backward compatibility with cells that may read them;
# they hold the number of rows written to each split.
tId = nTrainSize
valId = nValSize

In [5]:
# --- Decision tree -----------------------------------------------------------
# Step 1: 5-fold cross-validated precision over the complete training set.
dt = tree.DecisionTreeClassifier(random_state=99, min_samples_split=10)
scores = cross_val_score(dt, trainData, trainLabel, cv=5, scoring='precision')

# (Alternative kept for reference: cross_validate with
#  scoring=['precision_macro', 'recall_macro'] and return_train_score=False.)

print('5-fold cross validtion precision: ' + str(scores))

# Step 2: fit on the 80% training split and score the held-out 20%.
dt = dt.fit(nTrainData, nTrainLabel)
pred = dt.predict(nValData)
prec, recall, f1Score = [
    metric(nValLabel, pred)
    for metric in (precision_score, recall_score, f1_score)
]

print('Decision Tree:')
for caption, value in (('\tPrecision = ', prec),
                       ('\tRecall = ', recall),
                       ('\tF1 = ', f1Score)):
    print(caption + str(value))


5-fold cross validtion precision: [ 0.70042194  0.69273743  0.72673267  0.71713147  0.69102296]
Decision Tree:
	Precision = 0.704103671706
	Recall = 0.626923076923
	F1 = 0.663275686673


In [6]:
from sklearn.ensemble import RandomForestClassifier

# --- Random forest -----------------------------------------------------------
# 150 trees, depth capped at 50, fixed seed.
RF = RandomForestClassifier(random_state=99, max_depth=50, n_estimators=150)

# Cross-validated precision over the complete training set.
scores = cross_val_score(RF, trainData, trainLabel, cv=5, scoring='precision')
print('5-fold cross validtion precision: ' + str(scores))

# Fit on the 80% split, then evaluate on the held-out validation rows.
RF = RF.fit(nTrainData, nTrainLabel)
pred = RF.predict(nValData)

prec = precision_score(nValLabel, pred)
recall = recall_score(nValLabel, pred)
f1Score = f1_score(nValLabel, pred)

reportLines = [
    'Random Forest:',
    '\tPrecision = ' + str(prec),
    '\tRecall = ' + str(recall),
    '\tF1 = ' + str(f1Score),
]
for reportLine in reportLines:
    print(reportLine)

5-fold cross validtion precision: [ 0.93478261  0.92346939  0.92838196  0.91709845  0.90217391]
Random Forest:
	Precision = 0.921195652174
	Recall = 0.651923076923
	F1 = 0.763513513514


In [7]:
from sklearn import svm

# --- Support vector machine --------------------------------------------------
# SVC with class_weight='balanced'. Cross-validation is left disabled for
# this model; only the 80/20 hold-out evaluation is run.
bSVM = svm.SVC(class_weight='balanced')
bSVM = bSVM.fit(nTrainData, nTrainLabel)

pred = bSVM.predict(nValData)
prec, recall, f1Score = (
    precision_score(nValLabel, pred),
    recall_score(nValLabel, pred),
    f1_score(nValLabel, pred),
)

print('Support Vector Machine:')
print('\tPrecision = ' + str(prec))
print('\tRecall = ' + str(recall))
print('\tF1 = ' + str(f1Score))

Support Vector Machine:
	Precision = 0.695473251029
	Recall = 0.65
	F1 = 0.671968190855


In [10]:
from sklearn.linear_model import LinearRegression

# --- Linear regression used as a classifier ----------------------------------
# The regressor outputs continuous values, so predictions are thresholded at
# 0.5 to obtain 0/1 labels before computing classification metrics.
# Cross-validation is left disabled for this model.
LR = LinearRegression()

#scores = cross_val_score(LR, trainData, trainLabel, scoring='precision', cv=5)
#print ('5-fold cross validtion precision: ' + str(scores))

# Store the fitted model in LR. The previous code assigned it to RF, which
# silently replaced the random-forest model with this regressor.
LR = LR.fit(nTrainData, nTrainLabel)
pred = LR.predict(nValData)
pred = np.where(pred > 0.5, 1, 0)  # continuous output -> binary label
prec = precision_score(nValLabel, pred)
recall = recall_score(nValLabel, pred)
f1Score = f1_score(nValLabel, pred)

print ('Linear Regerssion:')
print ('\tPrecision = ' + str(prec))
print ('\tRecall = ' + str(recall))
print ('\tF1 = ' + str(f1Score))

Linear Regerssion:
	Precision = 0.8125
	Recall = 0.05
	F1 = 0.0942028985507


In [12]:
from sklearn.linear_model import LogisticRegression

# --- Logistic regression -----------------------------------------------------
# L1-penalised, class-balanced model with weak regularisation (C=10000).
# The solver is pinned to 'liblinear' because the L1 penalty is not supported
# by every solver: newer scikit-learn releases default to 'lbfgs', which
# rejects penalty='l1' with a ValueError. 'liblinear' was the default in
# older releases, so results there are unchanged.
LogR = LogisticRegression(random_state=99, class_weight='balanced',
                          penalty='l1', C=10000, solver='liblinear')

# 5-fold cross-validated precision over the complete training set.
scores = cross_val_score(LogR, trainData, trainLabel, scoring='precision', cv=5)
print ('5-fold cross validtion precision: ' + str(scores))

# Fit on the 80% split and evaluate on the held-out validation rows.
LogR = LogR.fit(nTrainData, nTrainLabel)
pred = LogR.predict(nValData)

prec = precision_score(nValLabel, pred)
recall = recall_score(nValLabel, pred)
f1Score = f1_score(nValLabel, pred)

print ('Logistic Regerssion:')
print ('\tPrecision = ' + str(prec))
print ('\tRecall = ' + str(recall))
print ('\tF1 = ' + str(f1Score))

5-fold cross validtion precision: [ 0.30867971  0.31425091  0.32356688  0.31657356  0.30656934]
Logistic Regerssion:
	Precision = 0.3025
	Recall = 0.930769230769
	F1 = 0.456603773585


In [13]:
# --- Final model: random forest scored on the test set -----------------------
# Same hyper-parameters as the validation run, but fitted on the complete
# training set before predicting the test rows.
rfSettings = {'n_estimators': 150, 'max_depth': 50, 'random_state': 99}
RF = RandomForestClassifier(**rfSettings)
RF = RF.fit(trainData, trainLabel)
pred = RF.predict(testData)

prec, recall, f1Score = [
    scorer(testLabel, pred)
    for scorer in (precision_score, recall_score, f1_score)
]

print('\n'.join([
    'Random Forest result on Test data (Set J):',
    '\tPrecision = ' + str(prec),
    '\tRecall = ' + str(recall),
    '\tF1 = ' + str(f1Score),
]))

Random Forest result on Test data (Set J):
	Precision = 0.918590522479
	Recall = 0.621199671323
	F1 = 0.741176470588
