In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import acquire
import prepare


from pydataset import data

In [4]:
df = acquire.new_titanic_data()

In [5]:
train, validate, test = prepare.prep_titanic(df)

In [6]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [7]:
# Baseline determination - 0 or deaths
train.survived.value_counts()
# Baseline accuracy = 61.6% (307/491)

0    307
1    190
Name: survived, dtype: int64

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [46]:
# set baseline
X_train['baseline'] = 0

In [47]:
X_train.head()

Unnamed: 0,pclass,fare,baseline
583,1,40.125,0
337,1,134.5,0
50,3,39.6875,0
218,1,76.2917,0
31,1,146.5208,0


In [57]:
X_cols = ['pclass', 'fare']
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [49]:
model1 = DecisionTreeClassifier(max_depth=5, random_state=123)
model1 = model1.fit(X_train, y_train)

In [50]:
model1_pred = model1.predict(X_train)

In [54]:
# create a new column on the train dataframe that contains the models predictions
train['prediction'] = model1_pred

In [58]:

print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

training score: 72.84%
validate score: 72.43%


In [65]:
# use the column you just created and the actual values in the survived column
# to generate a classification report
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.71      0.95      0.81       307
           1       0.82      0.37      0.51       190

    accuracy                           0.73       497
   macro avg       0.76      0.66      0.66       497
weighted avg       0.75      0.73      0.70       497



In [56]:
pd.DataFrame(classification_report(train.survived, train.prediction, output_dict=True)).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.709756,0.947883,0.811715,307.0
1,0.816092,0.373684,0.512635,190.0
accuracy,0.72837,0.72837,0.72837,0.72837
macro avg,0.762924,0.660783,0.662175,497.0
weighted avg,0.750408,0.72837,0.697379,497.0


## Random Forest Exercises ##

In [57]:
# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
model_rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=10,
                            max_depth=10, 
                            random_state=123)
model_rf.fit(X_train, y_train)


RandomForestClassifier(max_depth=10, n_estimators=10, random_state=123)

In [58]:
# use model to run predictive values
model_rf.predict(X_train)

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [59]:
# create a new column on the train dataframe that contains the models predictions
mrf_pred = train['prediction']

In [31]:
# Evaluate your results usIng the model score, confusion matrix, and classification report.

# Model score
print(f'training score: {model_rf.score(X_train, y_train):.2%}')
print(f'validate score: {model_rf.score(X_validate, y_validate):.2%}')

training score: 82.90%
validate score: 72.43%


In [32]:
model_rf.predict(X_train)

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [33]:
# create a new column on the train dataframe that contains the models predictions
train['prediction'] = model_rf.predict(X_train)

In [37]:
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       307
           1       0.84      0.68      0.75       190

    accuracy                           0.83       497
   macro avg       0.83      0.80      0.81       497
weighted avg       0.83      0.83      0.82       497



In [121]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.


## KNN Exercises ##

In [1]:
import knn_lesson_util as util
from sklearn.neighbors import KNeighborsClassifier

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [33]:
model_knn = KNeighborsClassifier(n_neighbors=22)

model_knn.fit(X_train, y_train)
model_knn.score(X_train, y_train)

0.6981891348088531

In [31]:
#Choosing k for model
for k in range(1, 25):
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, y_train)
    accuracy = model_knn.score(X_test, y_test)
    print(f'{k:2d}: {accuracy:.2%}')

 1: 61.24%
 2: 62.36%
 3: 61.24%
 4: 60.11%
 5: 61.24%
 6: 61.80%
 7: 62.36%
 8: 64.61%
 9: 64.04%
10: 66.29%
11: 65.73%
12: 66.29%
13: 67.42%
14: 66.29%
15: 68.54%
16: 66.29%
17: 66.85%
18: 66.29%
19: 66.29%
20: 65.17%
21: 67.42%
22: 66.29%
23: 67.98%
24: 66.85%


In [63]:
train['prediction'] = model_knn.predict(X_train)
train['correct'] = train.prediction == train.survived

Evaluate your results using the model score, confusion matrix, and classification report.

In [69]:
# model score
# ... on valdiate data
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(model_knn.score(X_validate, y_validate)))

# ... on test data
model_score = model_knn.score(X_test, y_test)
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(model_score))

Accuracy of KNN classifier on validate set: 0.69
Accuracy of KNN classifier on test set: 0.66


In [65]:
# Confusion matrix
confusion_matrix(train.survived, train.prediction)

array([[260,  47],
       [103,  87]])

In [66]:
# Classification report
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       307
           1       0.65      0.46      0.54       190

    accuracy                           0.70       497
   macro avg       0.68      0.65      0.66       497
weighted avg       0.69      0.70      0.68       497



Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [70]:
# Accuracy
print('Accuracy of KNN classifier on model set: {:.2f}'
     .format(model_score))

Accuracy of KNN classifier on model set: 0.66


In [78]:
# Precision, recall & f1-score
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       307
           1       0.65      0.46      0.54       190

    accuracy                           0.70       497
   macro avg       0.68      0.65      0.66       497
weighted avg       0.69      0.70      0.68       497



In [82]:
# True positive, False positive, True Negative & False Negative
confusion_matrix(train.survived, train.prediction)

# true positve rate = sensitivity = TP/TP+FN = 87/(87+103) = 45.79%
# false postive rate = FP/FP+TN = 47/(47+260) = 15.3%
# true negative rate = specificity = TN/(TN+FP) = 260/(260+47) = 84.7%
# false negative rate = miss rate= FN/(FN+TP) = 103/(103+87) = 54.2%

array([[260,  47],
       [103,  87]])

## Logistic Regression ##

In [177]:
from sklearn.linear_model import LogisticRegression
# Set the model hyperparameters
# logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

1) Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [178]:
X_cols = ['pclass', 'fare', 'pclass']
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [179]:
# fit the model
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [180]:
# run the predictions
y1_pred = logit1.predict(X_train1)

In [157]:
y1_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [158]:
# compute the accuracy of logit1 model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.38


In [None]:
# logit1 does not perform better than the baseline

2) Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [159]:
# select variables and create train, validate, test

X_cols = ['pclass', 'fare', 'pclass', 'sex']
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [94]:
X_train

Unnamed: 0,pclass,fare,pclass.1,sex
583,1,40.1250,1,male
337,1,134.5000,1,female
50,3,39.6875,3,male
218,1,76.2917,1,female
31,1,146.5208,1,female
...,...,...,...,...
313,3,7.8958,3,male
636,3,7.9250,3,male
222,3,8.0500,3,male
485,3,25.4667,3,female


In [98]:
# Generate dummy data for sex 
dummy_df = (pd.get_dummies(X_train.sex, drop_first=True))
dummy_df

Unnamed: 0,male
583,1
337,0
50,1
218,0
31,0
...,...
313,1
636,1
222,1
485,0


In [127]:
# Concate the dummy_df with the X_train to get dataframe
X_train1 = pd.concat([X_train, dummy_df], axis=1)

In [160]:
# fit the model
logit2 = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')
logit2.fit(X_train1, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [163]:
# run the predictions
y2_pred = logit2.predict(X_train1)

In [164]:
y2_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [167]:
# compute the accuracy of logit2 model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train1, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.38


In [None]:
# logit2 does not perform better than the baseline

3) Try out other combinations of features and models.

In [171]:
#Re-weighting the survive scenario
logit3 = LogisticRegression(C=1, class_weight={0:1, 1:5}, random_state=123, intercept_scaling=1, solver='lbfgs')
logit3.fit(X_train1, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 5}, random_state=123)

In [172]:
# run the predictions
y3_pred = logit3.predict(X_train1)

In [173]:
y3_pred

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,

In [174]:
# compute the accuracy of logit3 model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train1, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.67


In [None]:
# After re-weighting the survive value from 99 to 5, accuracy improves leading to a better model.  Lofit3 performs better than the baeline


In [182]:
# Logit4
#No class weighting
logit4 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')
logit4.fit(X_train1, y_train)

LogisticRegression(C=1, random_state=123)

In [183]:
# run the predictions
y4_pred = logit4.predict(X_train1)

In [185]:
y4_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

In [186]:
# compute the accuracy of logit4 model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train1, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.78


In [None]:
# After eliminating the class weighting, accuracy improves leading to a better model.  Logit4 performs better than the baeline