In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn import linear_model
from sklearn.cross_validation import KFold



def importAndClean():
    """Load the train and test CSV splits and return them as NumPy arrays.

    Both "HCMST_train_clean.csv" and "HCMST_test_clean.csv" are read from
    the current working directory. For each file the 'Breakup binary'
    column is taken as the target vector, and the columns at positions
    0, 1, 2, 118 and 119 are dropped from the feature frame.

    Side effect: sets the module-level ``col_headers`` to the list of
    feature column names remaining in the training frame.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of arrays.
    """
    global col_headers

    def _load_split(csv_path):
        # Read one split and separate its target column from its features.
        frame = pd.read_csv(csv_path, header=0, low_memory=False)
        labels = frame['Breakup binary'].values
        # Columns are removed by position, so both files are assumed to
        # share the same column layout -- TODO confirm against the data.
        unused = frame.columns[[0, 1, 2, 118, 119]]
        return frame.drop(unused, axis=1), labels

    train_frame, y_train = _load_split("HCMST_train_clean.csv")
    test_frame, y_test = _load_split("HCMST_test_clean.csv")

    col_headers = list(train_frame.columns.values)

    return train_frame.values, y_train, test_frame.values, y_test

def applyClassifier(type, x, y, x_test):
    """Fit the requested model on ``(x, y)`` and predict for ``x_test``.

    Args:
        type: Model selector, either ``'logistic'`` or ``'lasso'``. The
            name shadows the builtin but is kept so existing callers
            (positional or keyword) continue to work.
        x: Training feature matrix passed to ``fit``.
        y: Training targets passed to ``fit``.
        x_test: Feature matrix passed to ``predict``.

    Returns:
        Tuple ``(pred, coef)``: the estimator's predictions for ``x_test``
        and its fitted ``coef_`` attribute. For ``'lasso'`` the predictions
        are the regressor's raw outputs; any thresholding is left to the
        caller.

    Raises:
        ValueError: If ``type`` is not one of the supported selectors.
    """
    if type == 'logistic':
        # Imported locally from model_selection: the old
        # sklearn.cross_validation KFold(n, n_folds=...) signature was
        # removed in scikit-learn 0.20. The splitter no longer takes the
        # sample count; it is inferred when the estimator calls split().
        from sklearn.model_selection import KFold

        fold = KFold(n_splits=5, shuffle=True, random_state=777)
        regr = LogisticRegressionCV(
            # 20 candidate inverse-regularisation strengths: 1e-10 .. 1e9.
            Cs=list(np.power(10.0, np.arange(-10, 10))),
            penalty='l2',
            scoring='roc_auc',
            cv=fold,
            random_state=777,
            max_iter=10000,
            fit_intercept=True,
            solver='newton-cg',
            # NOTE(review): tol=10 is an unusually loose stopping tolerance
            # and may end optimisation very early -- confirm it is intended.
            tol=10,
        )
        pred = regr.fit(x, y).predict(x_test)
        return pred, regr.coef_

    if type == 'lasso':
        clf = linear_model.Lasso(alpha=0.01)
        pred = clf.fit(x, y).predict(x_test)
        return pred, clf.coef_

    # Previously an unknown selector fell through and returned None, which
    # surfaced at the call site as an unrelated tuple-unpacking error.
    raise ValueError(
        "Unknown classifier type %r; expected 'logistic' or 'lasso'" % (type,)
    )

def main():
    """Train both models, then print error rates and coefficients.

    Loads the data via ``importAndClean``, fits the logistic and lasso
    models via ``applyClassifier``, converts the lasso outputs to 0/1 with
    a 0.5 cut-off, and prints: the two test error rates, the names of the
    features with non-zero lasso coefficients, and both coefficient arrays.
    """
    x_train, y_train, x_test, y_test = importAndClean()

    pred_log, log_coef_matrix = applyClassifier('logistic', x_train, y_train, x_test)
    pred_lasso, lasso_coef_matrix = applyClassifier('lasso', x_train, y_train, x_test)

    # Binarise the lasso outputs in place. Both masks are taken before any
    # write; the result is unchanged because values set to 1 are never < 0.5.
    at_or_above_cutoff = pred_lasso >= 0.5
    below_cutoff = pred_lasso < 0.5
    pred_lasso[at_or_above_cutoff] = 1
    pred_lasso[below_cutoff] = 0

    logistic_error = np.mean(y_test != pred_log)
    lasso_error = np.mean(y_test != pred_lasso)
    print("Logistic Regression Classifier Error Rate: %.5f" % logistic_error)
    print("Lasso Regression Error Rate: %.5f" % lasso_error)

    # col_headers is the module-level list populated by importAndClean().
    nonzero_lassocoef_names = [
        col_headers[position]
        for position, weight in enumerate(lasso_coef_matrix)
        if weight != 0
    ]
    print("Lasso Regression Non-Zero Features (n = %d):" % len(nonzero_lassocoef_names))
    for feature_name in nonzero_lassocoef_names:
        print("\t%s" % feature_name)

    print("\nLogistic Regression Coeffient Matrix:")
    print(log_coef_matrix)

    print("\nLasso Regression Coefficient Matrix:")
    print(lasso_coef_matrix)
        
# Run the load / fit / report pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()



Logistic Regression Classifier Error Rate: 0.15591
Lasso Regression Error Rate: 0.17025
Lasso Regression Non-Zero Features (n = 16):
	PPAGECAT
	PPINCIMP
	Married
	PPWORK
	same religion
	pEducat
	Relatives see/month
	# marriages
	Q19
	Age met
	Age relationship
	Approve
	Q34
	DISTANCEMOVED_10MI
	AGE_DIFFERENCE
	RELATIONSHIP_QUALITY

Logistic Regression Coeffient Matrix:
[[ -5.05472309e-01  -7.39060657e-02   9.25715527e-02  -9.76230923e-03
    2.64486930e-02   4.42282202e-03  -6.21951296e-01  -2.29579155e-02
    6.17792709e-01  -4.19106523e-02   1.50057386e-01  -6.13366477e-02
    1.66259929e-01  -2.67278392e-02   8.55867415e-02  -2.10040693e-02
    1.52645071e-01   9.81628826e-02   2.84937730e-02  -1.21560575e-01
    1.96935824e-02   9.95789171e-02   1.43038585e-02   1.27854347e-01
    2.70378460e-01  -6.99875200e-02  -4.59726410e-02  -7.99916520e-02
    5.39709323e-02   4.57143021e-02   3.99956652e-02   1.49935151e-01
   -7.67046993e-02   6.22324296e-02  -1.80656522e-02  -9.56704318e-03