# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn import linear_model
from sklearn.cross_validation import KFold
from sklearn import svm


def importAndClean():
    """Load the train and test CSV files and split each into features/labels.

    Reads ``HCMST_train_cleaned.csv`` and ``HCMST_test_cleaned.csv`` (paths
    relative to the current working directory). For each file the
    ``Breakup_binary`` column is taken as the label vector, and the columns
    at positions 0, 1, 2 and 116 are removed before the remaining columns
    are converted to a feature array.

    Side effect: sets the module-level global ``col_headers`` to the list of
    feature column names of the training frame (after the drop).

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` of NumPy arrays.
    """
    global col_headers

    # Column positions excluded from the feature matrix of both files.
    dropped_positions = [0, 1, 2, 116]

    def load_split(csv_path):
        # Labels are read before the drop so they survive even if the label
        # column happens to be one of the dropped positions.
        frame = pd.read_csv(csv_path, header=0, low_memory=False)
        labels = frame['Breakup_binary'].values
        features = frame.drop(frame.columns[dropped_positions], axis=1)
        return features, labels

    train_features, y_train = load_split("HCMST_train_cleaned.csv")
    test_features, y_test = load_split("HCMST_test_cleaned.csv")

    col_headers = list(train_features.columns.values)

    return train_features.values, y_train, test_features.values, y_test

def applyClassifier(type, x, y, x_test):
    """Fit the requested model on ``(x, y)`` and predict on ``x_test``.

    Args:
        type: Which model to fit: ``'logistic'`` (LogisticRegressionCV),
            ``'lasso'`` (LassoCV) or ``'svm'`` (linear-kernel SVC). The name
            shadows the ``type`` builtin but is kept for caller compatibility.
        x: Training feature matrix.
        y: Training labels.
        x_test: Feature matrix to predict on.

    Returns:
        A tuple ``(pred, coef)`` where ``pred`` is the model's ``predict``
        output for ``x_test`` and ``coef`` is the fitted ``coef_`` attribute.
        For ``'lasso'`` the predictions are raw regression outputs, not class
        labels; the caller is responsible for thresholding them.

    Raises:
        ValueError: If ``type`` is not one of the supported model names.
            Previously an unknown name silently returned ``None``, which only
            surfaced later as an unpacking error in the caller.
    """
    supported = ('logistic', 'lasso', 'svm')
    # Validate before doing any work so a typo fails fast and clearly.
    if type not in supported:
        raise ValueError(
            "Unknown classifier type %r; expected one of %s"
            % (type, ", ".join(repr(name) for name in supported))
        )

    if type == 'svm':
        # The SVM is fit without cross-validation, so no folds are needed.
        clf = svm.SVC(kernel='linear')
        pred = clf.fit(x, y).predict(x_test)
        return pred, clf.coef_

    # Shared 5-fold shuffled split for the two cross-validated models.
    # NOTE(review): this uses the legacy sklearn.cross_validation.KFold
    # signature; that module is gone in newer scikit-learn releases, where
    # sklearn.model_selection.KFold(n_splits=5, ...) is the replacement.
    fold = KFold(len(y), n_folds=5, shuffle=True, random_state=777)

    if type == 'logistic':
        # NOTE(review): tol=10 is far looser than the library default and
        # likely stops the solver almost immediately -- confirm intentional.
        regr = LogisticRegressionCV(
            Cs=list(np.power(10.0, np.arange(-10, 10))),
            penalty='l2',
            scoring='roc_auc',
            cv=fold,
            random_state=777,
            max_iter=10000,
            fit_intercept=True,
            solver='newton-cg',
            tol=10,
        )
        pred = regr.fit(x, y).predict(x_test)
        return pred, regr.coef_

    # Remaining supported option: 'lasso'.
    clf = linear_model.LassoCV(cv=fold, random_state=777, tol=10)
    pred = clf.fit(x, y).predict(x_test)
    return pred, clf.coef_

def _labels_from_scores(scores, y_train):
    """Convert continuous regression outputs into class labels.

    When ``y_train`` contains exactly two distinct values, scores at or above
    the midpoint of those two values map to the larger label and the rest to
    the smaller one, so the result uses the same encoding as the training
    labels. Otherwise the legacy rule is applied (>= 0.5 -> 1, else -1).
    """
    scores = np.asarray(scores)
    classes = np.unique(y_train)
    if len(classes) != 2:
        return np.where(scores >= 0.5, 1, -1)
    low, high = classes
    return np.where(scores >= (low + high) / 2.0, high, low)


def main():
    """Train the three classifiers and print test error rates and coefficients.

    Loads the data via ``importAndClean``, fits the logistic, Lasso and SVM
    models via ``applyClassifier``, prints each model's misclassification
    rate on the test set, lists the features with non-zero Lasso
    coefficients, and prints the logistic and Lasso coefficient arrays.

    Relies on the module-level ``col_headers`` that ``importAndClean`` sets.
    """
    x_train, y_train, x_test, y_test = importAndClean()

    pred_log, log_coef_matrix = applyClassifier('logistic', x_train, y_train, x_test)
    pred_lasso, lasso_coef_matrix = applyClassifier('lasso', x_train, y_train, x_test)
    pred_svm, _ = applyClassifier('svm', x_train, y_train, x_test)

    # Lasso returns real-valued scores. The old code thresholded at 0.5 but
    # emitted 1 / -1, which cannot match any single label encoding; derive
    # both the threshold and the output labels from the training labels.
    pred_lasso = _labels_from_scores(pred_lasso, y_train)

    print("Logistic Regression Classifier Error Rate: %.5f" % np.mean(y_test != pred_log))
    print("Lasso Regression Error Rate: %.5f" % np.mean(y_test != pred_lasso))
    print("SVM Error Rate: %.5f" % np.mean(y_test != pred_svm))

    # Pair each feature name with its Lasso coefficient and keep the ones the
    # L1 penalty did not shrink to exactly zero.
    nonzero_features = [
        (name, coef)
        for name, coef in zip(col_headers, lasso_coef_matrix)
        if coef != 0
    ]
    print("Lasso Regression Non-Zero Features (n = %d):" % len(nonzero_features))
    for name, coef in nonzero_features:
        print("\t%s: %.6f" % (name, coef))

    print("\nLogistic Regression Coeffient Matrix:")
    print(log_coef_matrix)

    print("\nLasso Regression Coefficient Matrix:")
    print(lasso_coef_matrix)
    
        
# Run the full train/evaluate pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

