In [1]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import acquire as ac
import prepare as pr

import warnings
warnings.filterwarnings("ignore")

---
# Logistic Regression Lesson Notes + Exercise 

In [2]:
df = ac.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
encoder, scaler, train, test = pr.prep_titanic(df)
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
212,212,0,3,male,0.271174,0,0,0.014151,2,Third,Southampton,1
222,222,0,3,male,0.635587,0,0,0.015713,2,Third,Southampton,1
775,775,0,3,male,0.22091,0,0,0.015127,2,Third,Southampton,1
229,229,0,3,female,0.346569,3,1,0.049708,2,Third,Southampton,0
751,751,1,3,male,0.070118,0,1,0.02435,2,Third,Southampton,0


In [4]:
train.shape

(712, 12)

In [5]:
test.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
784,784,0,3,male,0.308872,0,0,0.013761,2,Third,Southampton,1
382,382,0,3,male,0.396833,0,0,0.015469,2,Third,Southampton,1
263,263,0,1,male,0.497361,0,0,0.0,2,First,Southampton,1
647,647,1,1,male,0.698417,0,0,0.069291,0,First,Cherbourg,1
238,238,0,2,male,0.233476,0,0,0.020495,2,Second,Southampton,1


In [6]:
test.shape

(179, 12)

In [7]:
X_train = train[["fare", "pclass"]]
y_train = train[["survived"]]
X_test = test[["fare", "pclass"]]
y_test = test[["survived"]]

In [8]:
LogisticRegression?

In [9]:
# make the object
logit = LogisticRegression()

In [10]:
# fit the object
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
logit.predict(X_train)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,

In [12]:
logit.predict_proba(X_train)

array([[0.74409806, 0.25590194],
       [0.74378116, 0.25621884],
       [0.74390003, 0.25609997],
       ...,
       [0.74397925, 0.25602075],
       [0.74390003, 0.25609997],
       [0.38363301, 0.61636699]])

> The probability for survival is shaped this way when using predict_proba because the LogisticRegression model is predicting the probabily of both classes (0 and 1; non-survival or survival) for each entry

In [13]:
logit.classes_

array([0, 1])

In [14]:
logit.score(X_train, y_train)

0.672752808988764

In [15]:
logit.score(X_test, y_test)

0.7039106145251397

In [16]:
y_pred = logit.predict(X_test)
y_pred

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.88      0.79       110
           1       0.69      0.42      0.52        69

    accuracy                           0.70       179
   macro avg       0.70      0.65      0.65       179
weighted avg       0.70      0.70      0.68       179



---
# Exercises

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

In [18]:
df = ac.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [19]:
encoder, scaler, train, test = pr.prep_titanic(df)

---
### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [20]:
print(train.shape)
train.head()

(712, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
212,212,0,3,male,0.271174,0,0,0.014151,2,Third,Southampton,1
222,222,0,3,male,0.635587,0,0,0.015713,2,Third,Southampton,1
775,775,0,3,male,0.22091,0,0,0.015127,2,Third,Southampton,1
229,229,0,3,female,0.346569,3,1,0.049708,2,Third,Southampton,0
751,751,1,3,male,0.070118,0,1,0.02435,2,Third,Southampton,0


In [21]:
print(test.shape)
test.head()

(179, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
784,784,0,3,male,0.308872,0,0,0.013761,2,Third,Southampton,1
382,382,0,3,male,0.396833,0,0,0.015469,2,Third,Southampton,1
263,263,0,1,male,0.497361,0,0,0.0,2,First,Southampton,1
647,647,1,1,male,0.698417,0,0,0.069291,0,First,Cherbourg,1
238,238,0,2,male,0.233476,0,0,0.020495,2,Second,Southampton,1


In [22]:
train, validate = train_test_split(train, random_state=74, train_size=.75)
print(train.shape)
train.head()

(534, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
478,478,0,3,male,0.271174,0,0,0.01468,2,Third,Southampton,1
183,183,1,2,male,0.007288,2,1,0.076123,2,Second,Southampton,0
494,494,0,3,male,0.258608,0,0,0.015713,2,Third,Southampton,1
360,360,0,3,male,0.497361,1,4,0.054457,2,Third,Southampton,0
403,403,0,3,male,0.346569,1,0,0.030937,2,Third,Southampton,0


In [23]:
print(validate.shape)
validate.head()

(178, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
232,232,0,2,male,0.736115,0,0,0.02635,2,Second,Southampton,1
443,443,1,2,female,0.346569,0,0,0.025374,2,Second,Southampton,1
862,862,1,1,female,0.597889,0,0,0.05061,2,First,Southampton,1
561,561,0,3,male,0.497361,0,0,0.015412,2,Third,Southampton,1
872,872,0,1,male,0.409399,0,0,0.009759,2,First,Southampton,1


In [24]:
X_train = train[["pclass", "age", "fare"]]
y_train = train.survived
X_validate = validate[["pclass", "age", "fare"]]
y_validate = validate.survived

In [25]:
# make the object
logit = LogisticRegression()

In [26]:
# fit the object
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
predictions = pd.DataFrame({
    "actual_survived": train.survived,
})
print(predictions.shape)
predictions.head()

(534, 1)


Unnamed: 0,actual_survived
478,0
183,1
494,0
360,0
403,0


In [28]:
# use the object (to predict survival)
predictions["survived ~ pclass + age + fare"] = logit.predict(X_train)

In [29]:
logit.score(X_train, y_train)

0.7078651685393258

In [30]:
print(classification_report(y_train, predictions["survived ~ pclass + age + fare"]))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       326
           1       0.69      0.45      0.55       208

    accuracy                           0.71       534
   macro avg       0.70      0.66      0.67       534
weighted avg       0.70      0.71      0.69       534



In [31]:
# validate model
X_validate["survived ~ pclass + age + fare"] = logit.predict(X_validate)

In [32]:
# predictions.apply(lambda c: logit.score(X_validate.drop(columns="survived ~ pclass + age + fare"), y_validate), c)

In [33]:
# accuracy = pd.DataFrame({
#     "survived ~ pclass + age + fare": logit.score(X_validate.drop(columns="survived ~ pclass + age + fare"), y_validate),
# })
# accuracy

In [34]:
logit.score(X_validate.drop(columns="survived ~ pclass + age + fare"), y_validate)

0.6853932584269663

In [35]:
print(classification_report(y_validate, X_validate["survived ~ pclass + age + fare"]))

              precision    recall  f1-score   support

           0       0.72      0.81      0.77       113
           1       0.59      0.46      0.52        65

    accuracy                           0.69       178
   macro avg       0.66      0.64      0.64       178
weighted avg       0.67      0.69      0.68       178



In [36]:
# test model
X_test = test[["pclass", "fare", "age"]]
y_test = test.survived

# predict survival using logistic regression model
X_test["survived ~ pclass + age + fare"] = logit.predict(X_test)

# calculate accuracy
print("Accuracy of Logistic Regression classifier model on test set {:.2f}".format(logit.score(X_test.drop(columns="survived ~ pclass + age + fare"), y_test)))
print()
# classification report
print(classification_report(y_test, X_test["survived ~ pclass + age + fare"]))

Accuracy of Logistic Regression classifier model on test set 0.65

              precision    recall  f1-score   support

           0       0.76      0.65      0.70       110
           1       0.54      0.67      0.60        69

    accuracy                           0.65       179
   macro avg       0.65      0.66      0.65       179
weighted avg       0.67      0.65      0.66       179



---
### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [37]:
df = ac.get_titanic_data()
encoder, scaler, train, test = pr.prep_titanic(df)

# encode sex
encoder = LabelEncoder()
encoder.fit(train[["sex"]])
train.sex = encoder.transform(train[["sex"]])
test.sex = encoder.transform(test[["sex"]])

In [38]:
train, validate = train_test_split(train, random_state=74, train_size=.75)

In [39]:
print(train.shape)
train.head()

(534, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
478,478,0,3,1,0.271174,0,0,0.01468,2,Third,Southampton,1
183,183,1,2,1,0.007288,2,1,0.076123,2,Second,Southampton,0
494,494,0,3,1,0.258608,0,0,0.015713,2,Third,Southampton,1
360,360,0,3,1,0.497361,1,4,0.054457,2,Third,Southampton,0
403,403,0,3,1,0.346569,1,0,0.030937,2,Third,Southampton,0


In [40]:
print(validate.shape)
validate.head()

(178, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
232,232,0,2,1,0.736115,0,0,0.02635,2,Second,Southampton,1
443,443,1,2,0,0.346569,0,0,0.025374,2,Second,Southampton,1
862,862,1,1,0,0.597889,0,0,0.05061,2,First,Southampton,1
561,561,0,3,1,0.497361,0,0,0.015412,2,Third,Southampton,1
872,872,0,1,1,0.409399,0,0,0.009759,2,First,Southampton,1


In [41]:
print(test.shape)
test.head()

(179, 12)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
784,784,0,3,1,0.308872,0,0,0.013761,2,Third,Southampton,1
382,382,0,3,1,0.396833,0,0,0.015469,2,Third,Southampton,1
263,263,0,1,1,0.497361,0,0,0.0,2,First,Southampton,1
647,647,1,1,1,0.698417,0,0,0.069291,0,First,Cherbourg,1
238,238,0,2,1,0.233476,0,0,0.020495,2,Second,Southampton,1


In [42]:
X_train = train[["pclass", "age", "fare", "sex"]]
y_train = train.survived
X_validate = validate[["pclass", "age", "fare", "sex"]]
y_validate = validate.survived

In [43]:
logit = LogisticRegression()

In [44]:
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
predictions["survived ~ pclass + age + fare + sex"] = logit.predict(X_train)
predictions

Unnamed: 0,actual_survived,survived ~ pclass + age + fare,survived ~ pclass + age + fare + sex
478,0,0,0
183,1,1,0
494,0,0,0
360,0,0,0
403,0,0,0
...,...,...,...
187,1,1,0
729,0,0,1
819,0,0,0
39,1,0,1


In [46]:
logit.score(X_train, y_train)

0.7827715355805244

In [47]:
print(classification_report(predictions.actual_survived, predictions["survived ~ pclass + age + fare + sex"]))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       326
           1       0.74      0.67      0.71       208

    accuracy                           0.78       534
   macro avg       0.77      0.76      0.77       534
weighted avg       0.78      0.78      0.78       534



---
### 3. Try out other combinations of features and models.

In [48]:
df = ac.get_titanic_data()
encoder, scaler, train, test = pr.prep_titanic(df)

# encode sex
encoder = LabelEncoder()
encoder.fit(train[["sex"]])
train.sex = encoder.transform(train[["sex"]])
test.sex = encoder.transform(test[["sex"]])

train, validate = train_test_split(train, random_state=74, train_size=.75)

In [49]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
478,478,0,3,1,0.271174,0,0,0.01468,2,Third,Southampton,1
183,183,1,2,1,0.007288,2,1,0.076123,2,Second,Southampton,0
494,494,0,3,1,0.258608,0,0,0.015713,2,Third,Southampton,1
360,360,0,3,1,0.497361,1,4,0.054457,2,Third,Southampton,0
403,403,0,3,1,0.346569,1,0,0.030937,2,Third,Southampton,0


In [50]:
X_train = train[["pclass", "age", "fare", "sex", "embarked"]]
y_train = train.survived
X_validate = validate[["pclass", "age", "fare", "sex", "embarked"]]
y_validate = validate.survived

In [51]:
logit = LogisticRegression().fit(X_train, y_train)

In [52]:
predictions["survived ~ pclass + age + fare + sex + embarked"] = logit.predict(X_train)

In [53]:
logit.score(X_train, y_train)

0.7865168539325843

In [54]:
print(classification_report(predictions.actual_survived, predictions["survived ~ pclass + age + fare + sex + embarked"]))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       326
           1       0.75      0.68      0.71       208

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.77       534
weighted avg       0.78      0.79      0.78       534



---
### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [55]:
validate["survived ~ pclass + age + fare + sex + embarked"] = logit.predict(X_validate)

In [56]:
logit.score(X_validate, y_validate)

0.8426966292134831

In [57]:
print(classification_report(validate.survived, validate["survived ~ pclass + age + fare + sex + embarked"]))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       113
           1       0.80      0.75      0.78        65

    accuracy                           0.84       178
   macro avg       0.83      0.82      0.83       178
weighted avg       0.84      0.84      0.84       178



In [58]:
X_test = test[["pclass", "age", "fare", "sex", "embarked"]]
y_test = test.survived

# test model
X_test["survived ~ pclass + age + fare + sex + embarked"] = logit.predict(X_test)

# calculate accuracy
print("Accuracy of Logistic Regression classifier model on test set {:.2f}".format(logit.score(X_test.drop(columns="survived ~ pclass + age + fare + sex + embarked"), y_test)))
print()
# classification report
print(classification_report(y_test, X_test["survived ~ pclass + age + fare + sex + embarked"]))

Accuracy of Logistic Regression classifier model on test set 0.74

              precision    recall  f1-score   support

           0       0.78      0.79      0.79       110
           1       0.66      0.65      0.66        69

    accuracy                           0.74       179
   macro avg       0.72      0.72      0.72       179
weighted avg       0.74      0.74      0.74       179

