In [1]:
import pandas as pd 
import numpy as np
import aquire as aq
import prepare as pr
import warnings 
warnings.filterwarnings('ignore')

import graphviz
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
#get and prepare titanic data
t_df = aq.new_titanic_data()
t_df[:1]

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0


In [3]:
#prep and create our train, validate and test dataframes/ creates dummies for string catagoricals
train, validate, test = pr.prep_titanic(t_df)
train[:1]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S,male
583,0,1,36.0,0,0,40.125,1,0,0,1


In [4]:
#create x and y values for train x = df with the target missing, y = target variable
X_train = train.drop(columns=['survived']) 
y_train = train.survived
X_validate = validate.drop(columns=['survived']) 
y_validate = validate.survived
X_test = test.drop(columns=['survived']) 
y_test = test.survived


In [5]:
#1
#What is your baseline prediction? What is your baseline accuracy? 
#remember: your baseline prediction for a classification problem is predicting 
#the most prevelant class in the training dataset (the mode). 
#When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [6]:
#find best base case
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [7]:
#set base case to 0
base_model = DummyClassifier(strategy='constant', constant=0)

In [8]:
#fit our model
base_model.fit(X_train, y_train)

DummyClassifier(constant=0, strategy='constant')

In [9]:
baseline = base_model.score(X_train , y_train)
print('Baseline accuracy: {:.3f} %'.format(baseline * 100))

Baseline accuracy: 61.771 %


In [10]:
#2 Fit the decision tree classifier to your 
#training sample and transform (i.e. make predictions on the training sample)
train.sample(25)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S,male
495,0,3,29.916875,0,0,14.4583,1,0,0,1
77,0,3,29.916875,0,0,8.05,1,0,1,1
189,0,3,36.0,0,0,7.8958,1,0,1,1
155,0,1,51.0,0,1,61.3792,0,0,0,1
658,0,2,23.0,0,0,13.0,1,0,1,1
706,1,2,45.0,0,0,13.5,1,0,1,0
65,1,3,29.916875,1,1,15.2458,0,0,0,1
145,0,2,19.0,1,1,36.75,0,0,1,1
584,0,3,29.916875,0,0,8.7125,1,0,0,1
507,1,1,29.916875,0,0,26.55,1,0,1,1


In [11]:
#create our trees
clf_1 = DecisionTreeClassifier(max_depth=4)
clf_2 = DecisionTreeClassifier(max_depth=2, random_state=123)
clf_3 = DecisionTreeClassifier(max_depth=5, random_state=980)

In [12]:
#train our trees
clf_1.fit(X_train, y_train)
clf_2.fit(X_train, y_train)
clf_3.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=980)

In [13]:
#visualise data

In [14]:
dot_data_1 = export_graphviz(clf_1, feature_names= X_train.columns ,rounded=True, filled=True)
dot_data_2 = export_graphviz(clf_2, feature_names= X_train.columns ,rounded=True, filled=True)
dot_data_3 = export_graphviz(clf_3, feature_names= X_train.columns ,rounded=True, filled=True)
graph_1 = graphviz.Source(dot_data_1)
graph_2 = graphviz.Source(dot_data_2)
graph_3 = graphviz.Source(dot_data_3)
graph_1.render('titanic_tree_Model_1', view=True)
graph_2.render('titanic_tree_Model_2', view=True)
graph_3.render('titanic_tree_Model_3', view=True)

'titanic_tree_Model_3.pdf'

In [15]:
model_1_score = clf_1.score(X_train,y_train)
model_2_score = clf_2.score(X_train,y_train)
model_3_score = clf_3.score(X_train,y_train)

validate_model_1 = clf_1.score(X_validate,y_validate)
validate_model_2 = clf_2.score(X_validate,y_validate)
validate_model_3 = clf_3.score(X_validate,y_validate)

print('Model 1 accuracy: {:.3f} % | Model 1 accuracy on validate: {:.3f} %'.format(model_1_score * 100 ,validate_model_1 * 100))
print('Model 2 accuracy: {:.3f} % | Model 2 accuracy on validate: {:.3f} %'.format(model_2_score * 100 ,validate_model_2 * 100))
print('Model 3 accuracy: {:.3f} % | Model 3 accuracy on validate: {:.3f} %'.format(model_3_score * 100 ,validate_model_3 * 100))

Model 1 accuracy: 83.501 % | Model 1 accuracy on validate: 78.505 %
Model 2 accuracy: 79.074 % | Model 2 accuracy on validate: 78.037 %
Model 3 accuracy: 85.714 % | Model 3 accuracy on validate: 79.439 %


In [16]:
#in terms of training, accuracy, model 3 is the most accurate but has overfitting.
#model 2 preformed reletivly the same for the train and validate. use model 2

In [17]:
#compare validate to actual classification report for models
y_pred_1 = clf_1.predict(X_validate)
y_pred_2 = clf_2.predict(X_validate)
y_pred_3 = clf_3.predict(X_validate)
report_1 = classification_report(y_validate, y_pred_1,output_dict=True)
report_2 = classification_report(y_validate, y_pred_2,output_dict=True)
report_3 = classification_report(y_validate, y_pred_3,output_dict=True)

In [18]:
df_1, df_2 , df_3 = pd.DataFrame(report_1).transpose(), pd.DataFrame(report_2).transpose(), pd.DataFrame(report_3).transpose()

In [19]:
df_1, df_2, df_3

(              precision    recall  f1-score     support
 0              0.762195  0.946970  0.844595  132.000000
 1              0.860000  0.524390  0.651515   82.000000
 accuracy       0.785047  0.785047  0.785047    0.785047
 macro avg      0.811098  0.735680  0.748055  214.000000
 weighted avg   0.799672  0.785047  0.770611  214.000000,
               precision    recall  f1-score     support
 0              0.748538  0.969697  0.844884  132.000000
 1              0.906977  0.475610  0.624000   82.000000
 accuracy       0.780374  0.780374  0.780374    0.780374
 macro avg      0.827757  0.722653  0.734442  214.000000
 weighted avg   0.809248  0.780374  0.760247  214.000000,
               precision    recall  f1-score     support
 0              0.805556  0.878788  0.840580  132.000000
 1              0.771429  0.658537  0.710526   82.000000
 accuracy       0.794393  0.794393  0.794393    0.794393
 macro avg      0.788492  0.768662  0.775553  214.000000
 weighted avg   0.792479  0.7

In [20]:
#compare train to actual classification report for models
y_pred_1 = clf_1.predict(X_train)
y_pred_2 = clf_2.predict(X_train)
y_pred_3 = clf_3.predict(X_train)
report_1 = classification_report(y_train, y_pred_1,output_dict=True)
report_2 = classification_report(y_train, y_pred_2,output_dict=True)
report_3 = classification_report(y_train, y_pred_3,output_dict=True)
df_1, df_2 , df_3 = pd.DataFrame(report_1).transpose(), pd.DataFrame(report_2).transpose(), pd.DataFrame(report_3).transpose()

In [21]:
df_1, df_2, df_3

(              precision    recall  f1-score    support
 0              0.798408  0.980456  0.880117  307.00000
 1              0.950000  0.600000  0.735484  190.00000
 accuracy       0.835010  0.835010  0.835010    0.83501
 macro avg      0.874204  0.790228  0.807800  497.00000
 weighted avg   0.856361  0.835010  0.824825  497.00000,
               precision    recall  f1-score     support
 0              0.751861  0.986971  0.853521  307.000000
 1              0.957447  0.473684  0.633803  190.000000
 accuracy       0.790744  0.790744  0.790744    0.790744
 macro avg      0.854654  0.730327  0.743662  497.000000
 weighted avg   0.830455  0.790744  0.769524  497.000000,
               precision    recall  f1-score     support
 0              0.853293  0.928339  0.889236  307.000000
 1              0.865031  0.742105  0.798867  190.000000
 accuracy       0.857143  0.857143  0.857143    0.857143
 macro avg      0.859162  0.835222  0.844051  497.000000
 weighted avg   0.857780  0.857143 

In [22]:
#confusion matrix for models for train
labels = ["Dead", "Survived"]
y_pred_1 = clf_1.predict(X_train)
y_pred_2 = clf_2.predict(X_train)
y_pred_3 = clf_3.predict(X_train)

cm_1 = pd.DataFrame(confusion_matrix(y_train, y_pred_1),index=labels,columns=labels)
cm_2 = pd.DataFrame(confusion_matrix(y_train, y_pred_2),index=labels,columns=labels)
cm_3 = pd.DataFrame(confusion_matrix(y_train, y_pred_3),index=labels,columns=labels)

In [23]:
cm_1, cm_2, cm_3

(          Dead  Survived
 Dead       301         6
 Survived    76       114,
           Dead  Survived
 Dead       303         4
 Survived   100        90,
           Dead  Survived
 Dead       285        22
 Survived    49       141)

In [24]:
#confusion matrix for models for validate
labels = ["Dead", "Survived"]
y_pred_1 = clf_1.predict(X_validate)
y_pred_2 = clf_2.predict(X_validate)
y_pred_3 = clf_3.predict(X_validate)

cm_1 = pd.DataFrame(confusion_matrix(y_validate, y_pred_1),index=labels,columns=labels)
cm_2 = pd.DataFrame(confusion_matrix(y_validate, y_pred_2),index=labels,columns=labels)
cm_3 = pd.DataFrame(confusion_matrix(y_validate, y_pred_3),index=labels,columns=labels)

In [25]:
cm_1, cm_2, cm_3

(          Dead  Survived
 Dead       125         7
 Survived    39        43,
           Dead  Survived
 Dead       128         4
 Survived    43        39,
           Dead  Survived
 Dead       116        16
 Survived    28        54)

### #6 Which model performs better on your in-sample data?
#### for Positive = Dead

- Best for accuracy: Model 3
- Best for Recall : Model 2
- Best for Precision: Model  3


### #7 Which model performs better on your out-sample data?
#### for Positive = Dead

- Best for accuracy: Model 3
- Best for Recall : Model 2
- Best for Precision: Model  3



In [26]:
########################## RANDOM FOREST ##############################################


In [27]:
#1 Fit the Random Forest classifier to your training sample and transform 
#(i.e. make predictions on the training sample) setting the random_state accordingly 
# and setting min_samples_leaf = 1 and max_depth = 10.


rf_1 = RandomForestClassifier(max_depth= 10, min_samples_leaf=1)
rf_2 = RandomForestClassifier(max_depth= 4, min_samples_leaf= 6)

In [28]:
#evaluate your results using the model score, confusion matrix, and classification report.
#train

In [29]:
rf_1.fit(X_train,y_train)
rf_2.fit(X_train,y_train)

RandomForestClassifier(max_depth=4, min_samples_leaf=6)

In [30]:
rf_1_score = rf_1.score(X_train,y_train)
rf_2_score = rf_2.score(X_train,y_train)

validate_rf_1 = clf_1.score(X_validate,y_validate)
validate_rf_2 = clf_2.score(X_validate,y_validate)

print('Model 1 accuracy: {:.3f} % | Model 1 accuracy on validate: {:.3f} %'.format(rf_1_score * 100 ,validate_rf_1 * 100))
print('Model 2 accuracy: {:.3f} % | Model 2 accuracy on validate: {:.3f} %'.format(rf_2_score * 100 ,validate_rf_2 * 100))

Model 1 accuracy: 96.579 % | Model 1 accuracy on validate: 78.505 %
Model 2 accuracy: 84.708 % | Model 2 accuracy on validate: 78.037 %


In [31]:
#confusion matrices for train vs validate
labels = ["Dead", "Survived"]
y_pred_1 = rf_1.predict(X_train)
y_pred_2 = rf_2.predict(X_train)

v_pred_1 = rf_1.predict(X_validate)
v_pred_2 = rf_2.predict(X_validate)



rm_1 = pd.DataFrame(confusion_matrix(y_train, y_pred_1),index=labels,columns=labels)
rm_2 = pd.DataFrame(confusion_matrix(y_train, y_pred_2),index=labels,columns=labels)

rmv_1 = pd.DataFrame(confusion_matrix(y_validate, v_pred_1),index=labels,columns=labels)
rmv_2 = pd.DataFrame(confusion_matrix(y_validate, v_pred_2),index=labels,columns=labels)

In [32]:
rm_1, rm_2

(          Dead  Survived
 Dead       307         0
 Survived    17       173,
           Dead  Survived
 Dead       292        15
 Survived    61       129)

In [33]:
rmv_1, rmv_2

(          Dead  Survived
 Dead       118        14
 Survived    21        61,
           Dead  Survived
 Dead       122        10
 Survived    30        52)

In [34]:
#Classification reports for train vs validate
x_pred_1 = rf_1.predict(X_train)
x_pred_2 = rf_2.predict(X_train)

v_pred_1 = rf_1.predict(X_validate)
v_pred_2 = rf_2.predict(X_validate)

report_1 = classification_report(y_train, x_pred_1,output_dict=True)
report_2 = classification_report(y_train, x_pred_2,output_dict=True)

vreport_1 = classification_report(y_validate, v_pred_1,output_dict=True)
vreport_2 = classification_report(y_validate, v_pred_2,output_dict=True)

df_1, df_2 = pd.DataFrame(report_1).transpose(), pd.DataFrame(report_2).transpose()

dfv_1, dfv_2 = pd.DataFrame(vreport_1).transpose(), pd.DataFrame(vreport_2).transpose()

In [35]:
df_1, df_2

(              precision    recall  f1-score     support
 0              0.947531  1.000000  0.973059  307.000000
 1              1.000000  0.910526  0.953168  190.000000
 accuracy       0.965795  0.965795  0.965795    0.965795
 macro avg      0.973765  0.955263  0.963113  497.000000
 weighted avg   0.967589  0.965795  0.965455  497.000000,
               precision    recall  f1-score     support
 0              0.827195  0.951140  0.884848  307.000000
 1              0.895833  0.678947  0.772455  190.000000
 accuracy       0.847082  0.847082  0.847082    0.847082
 macro avg      0.861514  0.815044  0.828652  497.000000
 weighted avg   0.853435  0.847082  0.841881  497.000000)

In [36]:
dfv_1, dfv_2

(              precision    recall  f1-score     support
 0              0.848921  0.893939  0.870849  132.000000
 1              0.813333  0.743902  0.777070   82.000000
 accuracy       0.836449  0.836449  0.836449    0.836449
 macro avg      0.831127  0.818921  0.823959  214.000000
 weighted avg   0.835285  0.836449  0.834915  214.000000,
               precision    recall  f1-score     support
 0              0.802632  0.924242  0.859155  132.000000
 1              0.838710  0.634146  0.722222   82.000000
 accuracy       0.813084  0.813084  0.813084    0.813084
 macro avg      0.820671  0.779194  0.790689  214.000000
 weighted avg   0.816456  0.813084  0.806685  214.000000)

In [37]:
#model 2 has a closer metric on both validate and train 
#but model 1 accuracy on train was vastly greater in percent compared to model 2

In [38]:
######################### K- NEAREST NEIGHBOR #####################################|

In [39]:
#1.Fit a K-Nearest Neighbors classifier to your training sample and transform 
#(i.e. make predictions on the training sample)
t_scores=[]
v_scores=[]
for i in range(1,21):
    knn = KNeighborsClassifier(n_neighbors= i)
    knn.fit(X_train, y_train)
    t_scores.append({'k-value for Train':i, 'Train accuracy': '{:.2f} %'.format(knn.score(X_train,y_train) * 100)})
    v_scores.append({'k-value for Validate':i, 'Validate accuracy' : '{:2f}%'.format(knn.score(X_validate,y_validate) * 100)})
    
    

In [40]:
t_scores

[{'k-value for Train': 1, 'Train accuracy': '98.79 %'},
 {'k-value for Train': 2, 'Train accuracy': '83.10 %'},
 {'k-value for Train': 3, 'Train accuracy': '84.71 %'},
 {'k-value for Train': 4, 'Train accuracy': '79.28 %'},
 {'k-value for Train': 5, 'Train accuracy': '77.46 %'},
 {'k-value for Train': 6, 'Train accuracy': '76.66 %'},
 {'k-value for Train': 7, 'Train accuracy': '76.46 %'},
 {'k-value for Train': 8, 'Train accuracy': '74.65 %'},
 {'k-value for Train': 9, 'Train accuracy': '75.86 %'},
 {'k-value for Train': 10, 'Train accuracy': '76.06 %'},
 {'k-value for Train': 11, 'Train accuracy': '76.26 %'},
 {'k-value for Train': 12, 'Train accuracy': '75.45 %'},
 {'k-value for Train': 13, 'Train accuracy': '75.25 %'},
 {'k-value for Train': 14, 'Train accuracy': '72.23 %'},
 {'k-value for Train': 15, 'Train accuracy': '73.04 %'},
 {'k-value for Train': 16, 'Train accuracy': '72.43 %'},
 {'k-value for Train': 17, 'Train accuracy': '73.64 %'},
 {'k-value for Train': 18, 'Train accura

In [41]:
v_scores

[{'k-value for Validate': 1, 'Validate accuracy': '71.495327%'},
 {'k-value for Validate': 2, 'Validate accuracy': '67.289720%'},
 {'k-value for Validate': 3, 'Validate accuracy': '72.897196%'},
 {'k-value for Validate': 4, 'Validate accuracy': '68.224299%'},
 {'k-value for Validate': 5, 'Validate accuracy': '73.831776%'},
 {'k-value for Validate': 6, 'Validate accuracy': '70.560748%'},
 {'k-value for Validate': 7, 'Validate accuracy': '71.495327%'},
 {'k-value for Validate': 8, 'Validate accuracy': '71.028037%'},
 {'k-value for Validate': 9, 'Validate accuracy': '73.831776%'},
 {'k-value for Validate': 10, 'Validate accuracy': '68.691589%'},
 {'k-value for Validate': 11, 'Validate accuracy': '71.962617%'},
 {'k-value for Validate': 12, 'Validate accuracy': '71.495327%'},
 {'k-value for Validate': 13, 'Validate accuracy': '74.766355%'},
 {'k-value for Validate': 14, 'Validate accuracy': '71.962617%'},
 {'k-value for Validate': 15, 'Validate accuracy': '72.429907%'},
 {'k-value for Vali

In [42]:
###Confusion matrix for validate when n = 5, our best accuracy with smallest n 
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [43]:
knn_best = knn.predict(X_validate)
knnm_best = pd.DataFrame(confusion_matrix(y_validate, knn_best),index=labels,columns=labels)
labels = ["Dead", "Survived"]

In [44]:
knnm_best

Unnamed: 0,Dead,Survived
Dead,106,26
Survived,30,52


In [45]:
TP , FP, FN , TN = confusion_matrix(y_validate, knn_best).ravel()

In [46]:
accuracy = (TP+TN)/(TP + FP + FN+ TN)
recall = TP/(TP+FN)
precision = TP/(TP+FP)
f1_score = 2 *(precision * recall)/(precision + recall)

In [47]:
print('Accuracy is {:.4}'.format(accuracy * 100))
print('Recall is {:.4}'.format(recall * 100) )
print('Precision is {:.4}'.format(precision *100))
print('f1_score is {:.4}'.format(f1_score * 100))

Accuracy is 73.83
Recall is 77.94
Precision is 80.3
f1_score is 79.1


In [48]:
#Run through steps 2-4 setting k to 10
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10.fit(X_train, y_train)

#Run through setps 2-4 setting k to 20
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_20.fit(X_train, y_train)



KNeighborsClassifier(n_neighbors=20)

In [49]:
pred_10 = knn_10.predict(X_validate)
knnm_10 = pd.DataFrame(confusion_matrix(y_validate, pred_10),index=labels,columns=labels)

pred_20 = knn_20.predict(X_validate)
knnm_20 = pd.DataFrame(confusion_matrix(y_validate, pred_20),index=labels,columns=labels)

In [50]:
knnm_10

Unnamed: 0,Dead,Survived
Dead,111,21
Survived,46,36


In [51]:
knnm_20

Unnamed: 0,Dead,Survived
Dead,114,18
Survived,46,36


In [52]:
from sklearn.metrics import precision_recall_fscore_support

In [53]:
stats_10 = precision_recall_fscore_support(y_validate, pred_10)
stats_20 = precision_recall_fscore_support(y_validate, pred_20)

In [54]:
print('---------------------Dead----------Alive')
print('n= 10 Accuracy is {}'.format(stats_10[0]))
print('n= 10 Recall is {}'.format(stats_10[1]))
print('n= 10 Precision is {}'.format(stats_10[2]))
print('n= 10 f1_score is {}'.format(stats_10[3]))
print('n= 20 Accuracy is {}'.format(stats_20[0]))
print('n= 20 Recall is {}'.format(stats_20[1]))
print('n= 20 Precision is {}'.format(stats_20[2]))
print('n= 20 f1_score is {}'.format(stats_20[3]))

---------------------Dead----------Alive
n= 10 Accuracy is [0.70700637 0.63157895]
n= 10 Recall is [0.84090909 0.43902439]
n= 10 Precision is [0.76816609 0.51798561]
n= 10 f1_score is [132  82]
n= 20 Accuracy is [0.7125     0.66666667]
n= 20 Recall is [0.86363636 0.43902439]
n= 20 Precision is [0.78082192 0.52941176]
n= 20 f1_score is [132  82]


In [55]:
report_10 = classification_report(y_validate, pred_10,output_dict=True)
report_20 = classification_report(y_validate, pred_20,output_dict=True)

In [56]:
pd.DataFrame(report_10).T

Unnamed: 0,precision,recall,f1-score,support
0,0.707006,0.840909,0.768166,132.0
1,0.631579,0.439024,0.517986,82.0
accuracy,0.686916,0.686916,0.686916,0.686916
macro avg,0.669293,0.639967,0.643076,214.0
weighted avg,0.678104,0.686916,0.672303,214.0


In [57]:
pd.DataFrame(report_20).T

Unnamed: 0,precision,recall,f1-score,support
0,0.7125,0.863636,0.780822,132.0
1,0.666667,0.439024,0.529412,82.0
accuracy,0.700935,0.700935,0.700935,0.700935
macro avg,0.689583,0.65133,0.655117,214.0
weighted avg,0.694938,0.700935,0.684487,214.0


In [58]:
################ LINEAR REGRESSION #########################


In [59]:
#1.Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [67]:
#2.Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.
train[:1]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S,male
583,0,1,36.0,0,0,40.125,1,0,0,1


In [60]:
#create x and y values for train x = df with the target missing, y = target variable
X_train = train.drop(columns=['survived']) 
y_train = train.survived
X_validate = validate.drop(columns=['survived']) 
y_validate = validate.survived
X_test = test.drop(columns=['survived']) 
y_test = test.survived

In [61]:
base_model.fit(X_train, y_train)
baseline = base_model.score(X_validate,y_validate)

In [62]:
print('Baseline accuracy: {:.3f} %'.format(baseline * 100))

Baseline accuracy: 61.682 %


In [63]:
logit = LogisticRegression(C=1, random_state=123)

In [64]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [65]:
print('Linear regression Train accuracy: {:.2f} %'.format(logit.score(X_train,y_train) * 100))

Linear regression Train accuracy: 80.28 %


In [66]:
print('Linear regression Validate accuracy: {:.2f} %'.format(logit.score(X_validate,y_validate) * 100))

Linear regression Validate accuracy: 78.97 %


In [68]:
# regression model > baseline

In [69]:
# try out other combinations of features

In [100]:
logit2 = LogisticRegression(C=4)
logit3 = LogisticRegression(C=.15, max_iter=250)

In [101]:
logit2.fit(X_train, y_train)
logit3.fit(X_train, y_train)

LogisticRegression(C=0.15, max_iter=250)

In [102]:
print('Linear regression #2 Train accuracy: {:.2f} %'.format(logit2.score(X_train,y_train) * 100))
print('Linear regression #2 validate accuracy: {:.2f} %'.format(logit2.score(X_validate,y_validate) * 100))

Linear regression #2 Train accuracy: 81.09 %
Linear regression #2 validate accuracy: 80.37 %


In [103]:
print('Linear regression #3 Train accuracy: {:.2f} %'.format(logit3.score(X_train,y_train) * 100))
print('Linear regression #3 Validate accuracy: {:.2f} %'.format(logit3.score(X_validate,y_validate) * 100))

Linear regression #3 Train accuracy: 80.68 %
Linear regression #3 Validate accuracy: 79.91 %


In [105]:
#use model 2 on test
print('Linear regression #2 test accuracy: {:.2f} %'.format(logit2.score(X_test,y_test) * 100))

Linear regression #2 test accuracy: 79.21 %
