In [2]:
import pandas as pd
import sqlite3
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.decomposition import PCA
import os
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm

  from pandas.core import datetools


In [3]:
# Load the three tab-separated inputs, in order: the resampled training table,
# the test features and the test labels. Paths are relative to the notebook's
# working directory.
train_data, test_x, test_y = [
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('resampled_training.csv', 'test_x.csv', 'test_y.csv')
]

In [4]:
# Remove the 'Unnamed: 0' column from both test tables (pandas assigns that name
# to a header-less first column; presumably a row index written out when the
# CSVs were saved -- confirm against the export step).
# The frames are rebound rather than mutated in place and a missing column is
# tolerated, so this cell can be re-run without raising KeyError.
test_x = test_x.drop('Unnamed: 0', axis=1, errors='ignore')
test_y = test_y.drop('Unnamed: 0', axis=1, errors='ignore')

In [5]:
# Test-set target: the 'level_cor' column of the test label table.
# NOTE(review): the training target is read from a column named 'y_class';
# confirm both columns use the same class encoding.
y_test = test_y.loc[:, 'level_cor']

In [5]:
# Show every column name of the training table as a plain Python list.
list(train_data.columns)

['Unnamed: 0',
 'X..1.year',
 'X0.years',
 'X1.year',
 'X10..years',
 'X2.years',
 'X3.years',
 'X4.years',
 'X5.years',
 'X6.years',
 'X7.years',
 'X8.years',
 'X9.years',
 'MORTGAGE.1',
 'RENT.1',
 'ACTIVE',
 'BROKEN',
 'COMPLETE',
 'No',
 'AK',
 'AL',
 'AR',
 'AZ',
 'CA',
 'CO',
 'CT',
 'DC',
 'DE',
 'FL',
 'GA',
 'HI',
 'IA',
 'ID',
 'IL',
 'IN',
 'KS',
 'KY',
 'LA',
 'MA',
 'MD',
 'ME',
 'MI',
 'MN',
 'MO',
 'MS',
 'MT',
 'NC',
 'ND',
 'NE',
 'NH',
 'NJ',
 'NM',
 'NV',
 'NY',
 'OH',
 'OK',
 'OR',
 'PA',
 'RI',
 'SC',
 'SD',
 'TN',
 'TX',
 'UT',
 'VA',
 'VT',
 'WA',
 'WI',
 'WV',
 'WY',
 'pc_x_1',
 'pc_x_2',
 'pc_x_3',
 'pc_x_4',
 'pc_x_5',
 'pc_x_6',
 'pc_x_7',
 'pc_x_8',
 'pc_x_9',
 'pc_x_10',
 'pc_x_11',
 'pc_x_12',
 'pc_x_13',
 'pc_x_14',
 'pc_x_15',
 'pc_x_16',
 'pc_x_17',
 'pc_x_18',
 'pc_x_19',
 'pc_x_20',
 'pc_x_21',
 'pc_x_22',
 'pc_x_23',
 'pc_x_24',
 'pc_x_25',
 'pc_x_26',
 'pc_x_27',
 'pc_x_28',
 'pc_x_29',
 'pc_x_30',
 'pc_x_31',
 'pc_x_32',
 'pc_x_33',
 'pc_x_34',
 'p

In [6]:
# Feature columns used as model inputs (106 names, in the same order as the
# training table). Built group by group instead of one long literal.
# NOTE(review): the group meanings below are inferred from the column names
# only -- confirm against the feature-engineering step.
x_cols = (
    # 'X<n>.year(s)' indicator columns (presumably employment-length dummies)
    ['X..1.year', 'X0.years', 'X1.year', 'X10..years']
    + ['X{}.years'.format(n) for n in range(2, 10)]
    # remaining categorical indicator columns
    + ['MORTGAGE.1', 'RENT.1', 'ACTIVE', 'BROKEN', 'COMPLETE', 'No']
    # two-letter codes: the 50 US states plus DC
    + ('AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD '
       'ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD '
       'TN TX UT VA VT WA WI WV WY').split()
    # pc_x_1 .. pc_x_37 (presumably principal-component scores)
    + ['pc_x_{}'.format(n) for n in range(1, 38)]
)

In [7]:
# Training design matrix (the columns listed in x_cols) and training target
# (the 'y_class' column), both taken from the resampled training table.
X_train = train_data.loc[:, x_cols]
y_train = train_data.loc[:, 'y_class']

In [8]:
# Preview the first five rows of the unscaled training features.
X_train.head(n=5)

Unnamed: 0,X..1.year,X0.years,X1.year,X10..years,X2.years,X3.years,X4.years,X5.years,X6.years,X7.years,...,pc_x_28,pc_x_29,pc_x_30,pc_x_31,pc_x_32,pc_x_33,pc_x_34,pc_x_35,pc_x_36,pc_x_37
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.050789,-0.271259,-0.034634,-0.161174,0.21271,0.045789,0.124396,0.297331,0.341977,-0.055169
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.315749,-0.483586,-0.151907,-0.500346,-0.185445,-0.677531,-0.63155,0.03597,0.273273,0.39364
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.392776,0.52939,0.041816,1.15177,-0.327122,-0.280328,0.094972,0.630165,0.171936,0.331221
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.14516,-0.521151,-0.072225,-0.823366,-0.448846,-0.039712,-0.269536,-0.41529,0.61307,0.131326
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.013234,0.742557,0.107804,0.345109,1.494548,-0.652461,0.129092,0.963577,0.989961,0.114925


In [9]:
# Standardise the training features with StandardScaler (fitted on X_train
# itself) and wrap the resulting array back into a DataFrame that carries the
# original feature names.
# NOTE(review): the model cells visible in this notebook fit on the unscaled
# X_train, and the fitted scaler is not kept for the test set -- confirm
# whether X_train_scaled is meant to be used.
X_train_scaled = pd.DataFrame(
    data=preprocessing.StandardScaler().fit_transform(X_train),
    columns=x_cols,
)

In [10]:
# Preview the first five rows of the standardised training features.
X_train_scaled.head(n=5)

Unnamed: 0,X..1.year,X0.years,X1.year,X10..years,X2.years,X3.years,X4.years,X5.years,X6.years,X7.years,...,pc_x_28,pc_x_29,pc_x_30,pc_x_31,pc_x_32,pc_x_33,pc_x_34,pc_x_35,pc_x_36,pc_x_37
0,-0.264115,-0.246882,-0.238206,-0.64392,-0.279332,3.840565,-0.221891,-0.224074,-0.184335,-0.165416,...,0.070776,-0.328861,-0.045426,-0.202683,0.288362,0.062679,0.171166,0.458669,0.528036,-0.07789
1,-0.264115,-0.246882,4.19805,-0.64392,-0.279332,-0.260378,-0.221891,-0.224074,-0.184335,-0.165416,...,0.412325,-0.597735,-0.206357,-0.656185,-0.234829,-0.932502,-0.899674,0.065034,0.422712,0.660446
2,-0.264115,-0.246882,-0.238206,-0.64392,3.579976,-0.260378,-0.221891,-0.224074,-0.184335,-0.165416,...,0.511616,0.685016,0.059484,1.552833,-0.420996,-0.38601,0.129484,0.959951,0.267362,0.557762
3,-0.264115,-0.246882,-0.238206,-0.64392,-0.279332,-0.260378,4.506714,-0.224074,-0.184335,-0.165416,...,-0.181813,-0.645305,-0.097012,-1.088091,-0.580946,-0.054958,-0.386861,-0.614608,0.943625,0.228914
4,-0.264115,-0.246882,-0.238206,-0.64392,-0.279332,-0.260378,-0.221891,4.462815,-0.184335,-0.165416,...,-1.30081,0.954954,0.150038,0.47426,1.972741,-0.898009,0.177818,1.462102,1.521403,0.201933


In [11]:
# Reset the training classes: merge class 4 into class 3 and shift class 5
# down to class 4.
# Both masks are computed from the untouched y_train, so a row moved from 5 to 4
# can never be picked up by the 4 -> 3 rule. Boolean-mask assignment replaces
# the per-row Python loop and does not rely on y_train having a default
# 0..n-1 index.
y_trainc = y_train.copy()
y_trainc.loc[y_train == 4] = 3
y_trainc.loc[y_train == 5] = 4

In [None]:
# Reset the test classes with the same rule used for the training labels:
# merge class 4 into class 3 and shift class 5 down to class 4.
# Both masks are computed from the untouched y_test, so a row moved from 5 to 4
# can never be picked up by the 4 -> 3 rule. Boolean-mask assignment replaces
# the per-row Python loop and does not rely on y_test having a default
# 0..n-1 index.
y_testc = y_test.copy()
y_testc.loc[y_test == 4] = 3
y_testc.loc[y_test == 5] = 4

In [None]:
# # caching (disabled: .cache() is a Spark DataFrame method; the objects below are pandas)
# X_train.cache()
# # test_x.cache()
# # y_testc.cache()
# # y_trainc.cache()

In [23]:
# basic logistic regression model
# basic logistic regression model
# Multinomial logistic regression fitted with the SAGA solver. SAGA shuffles the
# samples, so random_state is pinned to make the fitted coefficients and the
# reported metrics repeatable across kernel restarts.
# NOTE(review): the fit uses the unscaled X_train although X_train_scaled is
# computed earlier -- confirm that is intentional.
logreg = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_trainc)



In [24]:
# Predicted class for every test row (kept for the per-class metrics cell),
# followed by the overall test accuracy as returned by the estimator's score().
y_pred = logreg.predict(test_x)
print(
    'Accuracy of logistic regression classifier on test set: {:.2f}'.format(
        logreg.score(test_x, y_testc)
    )
)

Accuracy of logistic regression classifier on test set: 0.97


In [25]:
# Per-class precision, recall, F-score and support for the baseline model,
# returned as four arrays with one entry per class.
precision_recall_fscore_support(y_true=y_testc, y_pred=y_pred)


(array([0.98623591, 0.99622882, 0.65916685, 0.74898035]),
 array([0.99010102, 0.99783594, 0.66256158, 0.70914516]),
 array([0.98816468, 0.99703174, 0.66085985, 0.72851862]),
 array([ 29599, 101661,   4466,   5697]))

In [26]:
# Fitted coefficients of the baseline model as a table: one row per class,
# one column per feature in x_cols.
pd.DataFrame(data=logreg.coef_, columns=x_cols)


Unnamed: 0,X..1.year,X0.years,X1.year,X10..years,X2.years,X3.years,X4.years,X5.years,X6.years,X7.years,...,pc_x_28,pc_x_29,pc_x_30,pc_x_31,pc_x_32,pc_x_33,pc_x_34,pc_x_35,pc_x_36,pc_x_37
0,0.749112,1.357469,0.724833,0.658781,0.729314,0.784794,0.680845,0.67611,0.562808,0.551673,...,-0.042882,0.129459,-0.039552,0.157678,0.44057,-0.005026,-0.33217,-0.010597,-0.057949,0.164371
1,0.501202,-0.043874,0.544941,0.418886,0.591251,0.562336,0.531292,0.50536,0.428898,0.300153,...,0.067125,0.473298,-0.034696,-0.685047,-0.924553,0.146547,0.435691,0.167585,0.037649,0.02372
2,-0.884489,-1.176977,-0.819953,-0.607735,-0.960172,-0.918115,-0.803922,-0.806388,-0.75678,-0.649008,...,0.019299,-0.37142,0.201225,0.300085,0.40071,-0.113993,-0.172495,-0.103675,-0.10781,-0.262234
3,-0.365825,-0.136618,-0.449821,-0.469932,-0.360393,-0.429015,-0.408215,-0.375082,-0.234925,-0.202817,...,-0.043541,-0.231337,-0.126977,0.227283,0.083273,-0.027529,0.068974,-0.053313,0.12811,0.074143


In [28]:
# logistic regression model via L1 regularization - for feature selection over large number of features
# SAGA is a stochastic solver that shuffles the samples, so random_state is
# pinned to make the fit and the reported accuracy repeatable across kernel
# restarts.
logregL1 = LogisticRegression(
    penalty='l1',
    solver='saga',
    multi_class='multinomial',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_trainc)
# Predictions are kept for the per-class metrics cell.
y_predL1 = logregL1.predict(test_x)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logregL1.score(test_x, y_testc)))




Accuracy of logistic regression classifier on test set: 0.98


In [29]:
# Per-class precision, recall, F-score and support for the L1-penalised model,
# returned as four arrays with one entry per class.
precision_recall_fscore_support(y_true=y_testc, y_pred=y_predL1)

(array([0.98954821, 0.99680822, 0.66448851, 0.7469551 ]),
 array([0.99158755, 0.99840647, 0.66032244, 0.7212568 ]),
 array([0.99056683, 0.9976067 , 0.66239892, 0.73388105]),
 array([ 29599, 101661,   4466,   5697]))

In [30]:
# logistic regression model via L2 regularization
# SAG is a stochastic solver that shuffles the samples, so random_state is
# pinned to make the fit and the reported accuracy repeatable across kernel
# restarts.
logregL2 = LogisticRegression(
    penalty='l2',
    solver='sag',
    multi_class='multinomial',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_trainc)
# Predictions are kept for the per-class metrics cell.
y_predL2 = logregL2.predict(test_x)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logregL2.score(test_x, y_testc)))




Accuracy of logistic regression classifier on test set: 0.98


In [31]:
# Per-class precision, recall, F-score and support for the L2-penalised model,
# returned as four arrays with one entry per class.
precision_recall_fscore_support(y_true=y_testc, y_pred=y_predL2)

(array([0.98984891, 0.99682795, 0.66418751, 0.74632286]),
 array([0.99162134, 0.99845565, 0.65987461, 0.72143233]),
 array([0.99073433, 0.99764114, 0.66202404, 0.73366655]),
 array([ 29599, 101661,   4466,   5697]))

In [None]:
# Linear-kernel support vector classifier (C=10) with probability estimates
# enabled, fitted on the relabelled training data.
# NOTE(review): probability=True adds an internal cross-validated calibration
# step, which can be slow on a large training set -- confirm it is needed.
clf_linear = svm.SVC(
    kernel='linear',
    C=10,
    probability=True,
).fit(X_train, y_trainc)

In [None]:
# RBF-kernel support vector classifier with a one-vs-one decision function and
# a 3000 MB kernel cache, fitted on the relabelled training data.
# NOTE(review): max_iter=10 stops the solver after 10 iterations, which will
# normally be before convergence -- confirm this is a deliberate smoke-test
# setting.
clf_rbf = svm.SVC(
    kernel='rbf',
    decision_function_shape='ovo',
    cache_size=3000,
    max_iter=10,
).fit(X_train, y_trainc)

In [None]:
# Degree-3 polynomial-kernel support vector classifier with a one-vs-one
# decision function and a 3000 MB kernel cache, fitted on the relabelled
# training data.
# NOTE(review): max_iter=10 stops the solver after 10 iterations, which will
# normally be before convergence -- confirm this is a deliberate smoke-test
# setting.
clf_poly = svm.SVC(
    kernel='poly',
    degree=3,
    decision_function_shape='ovo',
    cache_size=3000,
    max_iter=10,
).fit(X_train, y_trainc)