In [17]:
# Classification Model

In [18]:
import numpy as np
import utils
from sklearn.metrics import *

from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from numpy import mean, array
import Prediction_ML
from sklearn.model_selection import KFold, ShuffleSplit
RANDOM_STATE = 545510477


In [19]:
#input: X_train, Y_train
#output: Y_pred
def logistic_regression_pred(X_train, Y_train):
	#train a logistic regression classifier using X_train and Y_train. Use this to predict labels of X_train
	#use default params for the classifier
    Logistic_Model = LogisticRegression(random_state=RANDOM_STATE).fit(X_train,Y_train)
    Logistic_pred = Logistic_Model.predict(X_train)
    return Logistic_pred

#input: X_train, Y_train
#output: Y_pred
def svm_pred(X_train, Y_train):
	#train a SVM classifier using X_train and Y_train. Use this to predict labels of X_train
	#use default params for the classifier
    SVM_Model = LinearSVC(random_state=RANDOM_STATE).fit(X_train,Y_train)
    SVM_pred = SVM_Model.predict(X_train)
    return SVM_pred

#input: X_train, Y_train
#output: Y_pred
def decisionTree_pred(X_train, Y_train):
	#train a logistic regression classifier using X_train and Y_train. Use this to predict labels of X_train
	#use max_depth as 5
    decisionTree_Model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5).fit(X_train,Y_train)
    decisionTree_pred = decisionTree_Model.predict(X_train)
    return decisionTree_pred



#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_pred, Y_true):
    return accuracy_score(Y_pred,Y_true),roc_auc_score(Y_pred,Y_true),precision_score(Y_pred,Y_true),recall_score(Y_pred,Y_true),f1_score(Y_pred,Y_true)

#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName,Y_pred,Y_true):
	print("______________________________________________")
	print(("Classifier: "+classifierName))
	acc, auc_, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
	print(("Accuracy: "+str(acc)))
	print(("AUC: "+str(auc_)))
	print(("Precision: "+str(precision)))
	print(("Recall: "+str(recall)))
	print(("F1-score: "+str(f1score)))
	print("______________________________________________")
	print("")

def main():
	X_train, Y_train = utils.get_data_from_svmlight("output/features_svmlight.train")
	
	display_metrics("Logistic Regression",logistic_regression_pred(X_train,Y_train),Y_train)
	display_metrics("SVM",svm_pred(X_train,Y_train),Y_train)
	display_metrics("Decision Tree",decisionTree_pred(X_train,Y_train),Y_train)
	

if __name__ == "__main__":
	main()
	

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.9545454545454546
AUC: 0.9613885805894685
Precision: 0.8988095238095238
Recall: 0.9869281045751634
F1-score: 0.9408099688473521
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.9940191387559809
AUC: 0.9930942587679468
Precision: 0.9970238095238095
Recall: 0.9882005899705014
F1-score: 0.9925925925925925
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.7763157894736842
AUC: 0.780760014849313
Precision: 0.6011904761904762
Recall: 0.792156862745098
F1-score: 0.6835871404399323
______________________________________________



# Predictive Modeling

In [20]:
#input: X_train, Y_train, X_test
#output: Y_pred
def logistic_regression_pred(X_train, Y_train, X_test):
	#train a logistic regression classifier using X_train and Y_train. Use this to predict labels of X_train
	#use default params for the classifier
    Logistic_Model = LogisticRegression(random_state=RANDOM_STATE).fit(X_train,Y_train)
    Logistic_pred = Logistic_Model.predict(X_test)
    return Logistic_pred

#input: X_train, Y_train, X_test
#output: Y_pred
def svm_pred(X_train, Y_train, X_test):
	#train a SVM classifier using X_train and Y_train. Use this to predict labels of X_train
	#use default params for the classifier
    SVM_Model = LinearSVC(random_state=RANDOM_STATE).fit(X_train,Y_train)
    SVM_pred = SVM_Model.predict(X_test)
    return SVM_pred

#input: X_train, Y_train, X_test
#output: Y_pred
def decisionTree_pred(X_train, Y_train, X_test):
	#train a logistic regression classifier using X_train and Y_train. Use this to predict labels of X_train
	#use max_depth as 5
    decisionTree_Model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5).fit(X_train,Y_train)
    decisionTree_pred = decisionTree_Model.predict(X_test)
    return decisionTree_pred


#input: Y_pred,Y_true
#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_pred, Y_true):
    return accuracy_score(Y_pred,Y_true),roc_auc_score(Y_pred,Y_true),precision_score(Y_pred,Y_true),recall_score(Y_pred,Y_true),f1_score(Y_pred,Y_true)

#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName,Y_pred,Y_true):
	print("______________________________________________")
	print(("Classifier: "+classifierName))
	acc, auc_, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
	print(("Accuracy: "+str(acc)))
	print(("AUC: "+str(auc_)))
	print(("Precision: "+str(precision)))
	print(("Recall: "+str(recall)))
	print(("F1-score: "+str(f1score)))
	print("______________________________________________")
	print("")

def main():
	X_train, Y_train = utils.get_data_from_svmlight("output/features_svmlight.train")
	X_test, Y_test = utils.get_data_from_svmlight("data/features_svmlight.validate")
	display_metrics("Logistic Regression",logistic_regression_pred(X_train,Y_train,X_test),Y_test)
	display_metrics("SVM",svm_pred(X_train,Y_train,X_test),Y_test)
	display_metrics("Decision Tree",decisionTree_pred(X_train,Y_train,X_test),Y_test)
	

if __name__ == "__main__":
	main()
	

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.7380952380952381
AUC: 0.7340114953015235
Precision: 0.7333333333333333
Recall: 0.6804123711340206
F1-score: 0.7058823529411765
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.7380952380952381
AUC: 0.7347802347802348
Precision: 0.7444444444444445
Recall: 0.6767676767676768
F1-score: 0.708994708994709
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.6714285714285714
AUC: 0.6637839404773408
Precision: 0.5555555555555556
Recall: 0.6329113924050633
F1-score: 0.591715976331361
______________________________________________



# Cross validation

In [21]:
#K-fold Cross validation
#input: training data and corresponding labels
#output: accuracy, auc
def get_acc_auc_kfold(X,Y,k=5):
	#First get the train indices(i) and test indices(j) for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the folds
    
    KFoldCV = KFold(n_splits = k, random_state = RANDOM_STATE)
    ModelFit_KFold = LogisticRegression()
    ListAccracy =[]
    ListAUCkf =[]
    for i,j in KFoldCV.split(X):
        k = ModelFit_KFold.fit(X[i],Y[i])
        AcuracyScore = accuracy_score(k.predict(X[j]),Y[j])
        ListAccracy.append(AcuracyScore)      
        AUCscore = roc_auc_score(k.predict(X[j]),Y[j])
        ListAUCkf.append(AUCscore)
    return mean(AcuracyScore),mean(ListAUCkf)

In [22]:
#Randomized K-fold Cross validation
#input: training data and corresponding labels
#output: accuracy, auc
def get_acc_auc_randomisedCV(X,Y,iterNo=5,test_percent=0.2):
	#First get the train indices(i) and test indices(j) for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the iterations
    
    RKFoldCV = ShuffleSplit(n_splits = iterNo, random_state = RANDOM_STATE, test_size = test_percent)
    ModelFit_RKFold = LogisticRegression()
    ListAccracy =[]
    ListAUCrkf =[]
    for i,j in RKFoldCV.split(X):            
        k = ModelFit_RKFold.fit(X[i],Y[i])
        AcuracyScore = accuracy_score(k.predict(X[j]),Y[j])
        ListAccracy.append(AcuracyScore)         
        AUCscore = roc_auc_score(k.predict(X[j]),Y[j])
        ListAUCrkf.append(AUCscore)
    return array(AcuracyScore).mean(),array(ListAUCrkf).mean()
    

## Accuracy & AUC

In [23]:
def main():
	X,Y = utils.get_data_from_svmlight("output/features_svmlight.train")
	print("Classifier: Logistic Regression__________")
	acc_k,auc_k = get_acc_auc_kfold(X,Y)
	print(("Average Accuracy in KFold CV: "+str(acc_k)))
	print(("Average AUC in KFold CV: "+str(auc_k)))
	acc_r,auc_r = get_acc_auc_randomisedCV(X,Y)
	print(("Average Accuracy in Randomised CV: "+str(acc_r)))
	print(("Average AUC in Randomised CV: "+str(auc_r)))

if __name__ == "__main__":
	main()

Classifier: Logistic Regression__________
Average Accuracy in KFold CV: 0.6646706586826348
Average AUC in KFold CV: 0.7116078348698396
Average Accuracy in Randomised CV: 0.7142857142857143
Average AUC in Randomised CV: 0.7308461500509049


