# Baseline model

Perform feature selection and baseline modeling!

In [12]:
import pickle 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
def save(clf, name):
    pickle_out = open(name,"wb")
    pickle.dump(clf, pickle_out)
    pickle_out.close()
    print ('Model ',name,' saved')
    
def load(clf_file):
    pickle_in = open(clf_file,"rb")
    clf = pickle.load(pickle_in)
    return clf

new_data = load('/Users/shiehan/Desktop/kaplan stuff/project5/new_data.pkl')

In [4]:
new_data

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22
0,No,"Yes, they all did",Other,I don't know,Male,Other,No,No,Some did,Other,...,26-100,25,Maybe,TRUE,I don't know,Yes,Other,Yes,TRUE,Other
1,Maybe,Some did,Other,"Yes, always",Male,Maybe,No,Yes,Some did,Other,...,26-100,51,Maybe,TRUE,Yes,No,Other,No,TRUE,Other
2,No,None did,Other,I don't know,Male,No,No,I don't know,None did,Other,...,26-100,27,No,TRUE,I don't know,No,Other,Maybe,TRUE,Other
3,No,Some did,Other,"Yes, always",Male,No,No,Yes,Some did,Other,...,100-500,37,Maybe,TRUE,Yes,Yes,Other,No,TRUE,Other
4,No,None did,Other,I don't know,Male,No,No,No,None did,Other,...,26-100,46,No,TRUE,I don't know,Other,Other,No,TRUE,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,No,Some did,Somewhat open,I don't know,Other,No,Other,Yes,Some did,Maybe,...,Other,34,Other,Other,Other,Other,Not applicable to me,No,Other,Other
1453,No,Other,Somewhat not open,Other,Other,No,Other,Yes,Other,Other,...,Other,56,Other,Other,Other,Other,Other,Maybe,Other,Other
1454,Yes,Some did,Somewhat open,Sometimes,Male,Maybe,Yes,Yes,None did,"Yes, it has",...,100-500,52,Yes,TRUE,I don't know,Yes,Other,Maybe,Other,Other
1455,Maybe,None did,Somewhat open,I don't know,Female,Yes,I am not sure,Yes,None did,"No, I don't think it would",...,100-500,30,Yes,FALSE,I don't know,No,Other,Maybe,TRUE,Other


In [5]:
# Dummy all of my categorial features
dummy_data = pd.get_dummies(new_data.drop(['Q6'], axis = 1))

In [6]:
dummy_data = pd.concat([dummy_data, new_data['Q6']], axis = 1)

In [7]:
X = dummy_data.drop(['Q6'], axis = 1)
y = dummy_data['Q6']
seed = 123
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [8]:
scaler = StandardScaler()
xtrain_val_scaled = scaler.fit_transform(X_train_val)
xtest_scaled = scaler.transform(X_test)

In [9]:
# increased max_iter because defualt is 1000 and fucking lbfgs wouldn't converge
model = LogisticRegression(solver = 'lbfgs', random_state = seed, max_iter=2000)
model.fit(xtrain_val_scaled, y_train_val)
print("The score for logistic regression is")
print("Training: {:6.2f}%".format(100*model.score(xtrain_val_scaled, y_train_val)))
print("Test set: {:6.2f}%".format(100*model.score(xtest_scaled, y_test)))

The score for logistic regression is
Training:  77.99%
Test set:  77.84%


In [10]:
y_prec = model.predict(xtest_scaled)

print('f1_score:',f1_score(y_test, y_prec,average='micro'))
print('recall:',recall_score(y_test, y_prec,average='micro'))
print('precision:',precision_score(y_test, y_prec,average='micro'))

f1_score: 0.7783783783783784
recall: 0.7783783783783784
precision: 0.7783783783783784


In [13]:
# use random forest to do feature selection
rf = RandomForestClassifier(random_state = seed)
rf.fit(xtrain_val_scaled, y_train_val)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [14]:
# Build a forest and compute the feature importances
features = dummy_data.drop(['Q6'],axis=1).columns
importances = rf.feature_importances_
importance_lst = list(zip(features,importances))


In [15]:
# only get features with an importance larger than 0.01
important_lst = []
for question, importance in importance_lst:
    if (importance > 0.01):
        print(question,importance)
        important_lst.append(question)

Q14 0.056589611120085925
Q1_Maybe 0.028686084540334694
Q1_No 0.07202929182903542
Q1_Other 0.012860996074425209
Q1_Yes 0.050952889480922414
Q3_Other 0.012256175415815663
Q4_I don't know 0.01133609190247351
Q5_Male 0.010826329941618622
Q7_No 0.010946749989785804
Q8_I don't know 0.011922544498124951
Q8_No 0.01938285406699657
Q8_Yes 0.0167881491320174
Q10_Yes, I think it would 0.01018238196105771
Q11_No 0.02280024341429061
Q11_Other 0.06137055246219633
Q11_Yes 0.10626478562355259
Q12_No 0.034648721205519616
Q12_Yes 0.03907613787118227
Q13_100-500 0.010056584026963168
Q15_Maybe 0.010488579606266374
Q20_Maybe 0.013011496475652567
Q20_No 0.01183722287977208
Q20_Yes 0.011685645887935876


In [16]:
important_lst = important_lst+ ['Q6']
data1 = dummy_data[important_lst].copy()

In [21]:
#save(data1, 'data1.pkl')

In [17]:
X = dummy_data.drop(['Q6'], axis = 1)
y = dummy_data['Q6']
seed = 123
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [18]:
scaler = StandardScaler()
xtrain_val_scaled = scaler.fit_transform(X_train_val)
xtest_scaled = scaler.transform(X_test)

In [19]:
# chose logistic regression as my baseline model
model = LogisticRegression(solver = 'lbfgs', random_state = seed)
model.fit(xtrain_val_scaled, y_train_val)
print("The score for logistic regression is")
print("Training: {:6.2f}%".format(100*model.score(xtrain_val_scaled, y_train_val)))
print("Test set: {:6.2f}%".format(100*model.score(xtest_scaled, y_test)))

The score for logistic regression is
Training:  77.94%
Test set:  77.66%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
y_prec = model.predict(xtest_scaled)

print('f1_score:',f1_score(y_test, y_prec,average='micro'))
print('recall:',recall_score(y_test, y_prec,average='micro'))
print('precision:',precision_score(y_test, y_prec,average='micro'))
print('balanced_accuracy_score', balanced_accuracy_score(y_test, y_prec))

f1_score: 0.7765765765765765
recall: 0.7765765765765765
precision: 0.7765765765765765
balanced_accuracy_score 0.6334043803285246
