In [17]:
import utils
import Prediction_ML
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from numpy import mean, array
from sklearn.metrics import *

# USE THIS RANDOM STATE FOR ALL OF YOUR CROSS VALIDATION TESTS, OR THE TESTS WILL NEVER PASS
RANDOM_STATE = 545510477

## K-fold  Cross validation

In [18]:
#input: training data and corresponding labels
#output: accuracy, auc
def get_acc_auc_kfold(X,Y,k=5):
	#First get the train indices(i) and test indices(j) for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the folds
    
    KFoldCV = KFold(n_splits = k, random_state = RANDOM_STATE)
    ModelFit_KFold = LogisticRegression()
    ListAccracy =[]
    ListAUCkf =[]
    for i,j in KFoldCV.split(X):
        k = ModelFit_KFold.fit(X[i],Y[i])
        AcuracyScore = accuracy_score(k.predict(X[j]),Y[j])
        ListAccracy.append(AcuracyScore)      
        AUCscore = roc_auc_score(k.predict(X[j]),Y[j])
        ListAUCkf.append(AUCscore)
    return mean(AcuracyScore),mean(ListAUCkf)

## Randomized K-fold  Cross validation

In [19]:
#input: training data and corresponding labels
#output: accuracy, auc
def get_acc_auc_randomisedCV(X,Y,iterNo=5,test_percent=0.2):
	#First get the train indices(i) and test indices(j) for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the iterations
    
    RKFoldCV = ShuffleSplit(n_splits = iterNo, random_state = RANDOM_STATE, test_size = test_percent)
    ModelFit_RKFold = LogisticRegression()
    ListAccracy =[]
    ListAUCrkf =[]
    for i,j in RKFoldCV.split(X):            
        k = ModelFit_RKFold.fit(X[i],Y[i])
        AcuracyScore = accuracy_score(k.predict(X[j]),Y[j])
        ListAccracy.append(AcuracyScore)         
        AUCscore = roc_auc_score(k.predict(X[j]),Y[j])
        ListAUCrkf.append(AUCscore)
    return array(AcuracyScore).mean(),array(ListAUCrkf).mean()
    

In [20]:
def main():
	X,Y = utils.get_data_from_svmlight("output/features_svmlight.train")
	print("Classifier: Logistic Regression__________")
	acc_k,auc_k = get_acc_auc_kfold(X,Y)
	print(("Average Accuracy in KFold CV: "+str(acc_k)))
	print(("Average AUC in KFold CV: "+str(auc_k)))
	acc_r,auc_r = get_acc_auc_randomisedCV(X,Y)
	print(("Average Accuracy in Randomised CV: "+str(acc_r)))
	print(("Average AUC in Randomised CV: "+str(auc_r)))

if __name__ == "__main__":
	main()

Classifier: Logistic Regression__________
Average Accuracy in KFold CV: 0.6646706586826348
Average AUC in KFold CV: 0.7116078348698396
Average Accuracy in Randomised CV: 0.7142857142857143
Average AUC in Randomised CV: 0.7308461500509049


