In [31]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from env import user, password, hostname, get_db_url
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

# acquire
import acquire
import prepare

# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")

## Logistic Regression Exercises

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [3]:
titanic = acquire.get_titanic_data()
titanic

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [4]:
clean_df = prepare.prep_titanic(titanic)
clean_df

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


In [5]:
train, validate, test = prepare.my_train_test_split(clean_df, target= 'survived')
train.shape, validate.shape, test.shape

((534, 12), (178, 12), (179, 12))

In [7]:
baseline_accuracy = (train.survived == 0).mean()
baseline_accuracy

0.6161048689138576

In [8]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_test = test.survived

In [9]:
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit.fit(x_train, y_train)

In [10]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.75650364  0.00396956]]
Intercept: 
 [5.72105266]


In [11]:
y_pred = logit.predict(x_train)
y_predict_proba = logit.predict_proba(x_train)

In [12]:

prepare.evaluate_clf(logit, x_train, y_train, y_pred)

(0.3838951310861423,
           Pred 0  Pred 1
 Actual 0       0     329
 Actual 1       0     205,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.383895  0.383895    0.191948      0.147375
 recall       0.0    1.000000  0.383895    0.500000      0.383895
 f1-score     0.0    0.554804  0.383895    0.277402      0.212986
 support    329.0  205.000000  0.383895  534.000000    534.000000,
                 metric       score
 0             accuracy    0.383895
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.383895
 6               recall    1.000000
 7             f1_score    0.554804
 8          support_pos  205.000000
 9          support_neg  329.000000)

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [13]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_test = test.survived

In [14]:
logit2 = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit2.fit(x_train, y_train)

In [15]:
y_pred = logit2.predict(x_train)
y_predict_proba = logit2.predict_proba(x_train)

In [16]:
prepare.evaluate_clf(logit2, x_train, y_train, y_pred)

(0.3838951310861423,
           Pred 0  Pred 1
 Actual 0       0     329
 Actual 1       0     205,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.383895  0.383895    0.191948      0.147375
 recall       0.0    1.000000  0.383895    0.500000      0.383895
 f1-score     0.0    0.554804  0.383895    0.277402      0.212986
 support    329.0  205.000000  0.383895  534.000000    534.000000,
                 metric       score
 0             accuracy    0.383895
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.383895
 6               recall    1.000000
 7             f1_score    0.554804
 8          support_pos  205.000000
 9          support_neg  329.000000)

3. Try out other combinations of features and models.

In [17]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_test = test.survived

In [18]:

logit3 = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit3.fit(x_train, y_train)

In [26]:
y_pred = logit3.predict(x_train)
y_predict_proba = logit3.predict_proba(x_train)

In [27]:

prepare.evaluate_clf(logit3, x_train, y_train, y_pred)

(0.3838951310861423,
           Pred 0  Pred 1
 Actual 0       0     329
 Actual 1       0     205,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.383895  0.383895    0.191948      0.147375
 recall       0.0    1.000000  0.383895    0.500000      0.383895
 f1-score     0.0    0.554804  0.383895    0.277402      0.212986
 support    329.0  205.000000  0.383895  534.000000    534.000000,
                 metric       score
 0             accuracy    0.383895
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.383895
 6               recall    1.000000
 7             f1_score    0.554804
 8          support_pos  205.000000
 9          support_neg  329.000000)