# **Logistic Regression: Exercises**
<hr style="border:2px solid gray">

# `Titanic` Dataset
<hr style="border:2px solid gray">

>For all of the models you create, choose a threshold that optimizes for accuracy.



# Imports:

In [126]:
## Imports:

#standard DS imports
import pandas as pd
import numpy as np

#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

#metrics import
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


#custom modules
import acquire
import prepare
from evaluate import print_classification_metrics

### Acquire
Plan --> **Acquire** --> Prepare --> Explore --> Model --> Deliver

In [127]:
# Acquire: load the raw Titanic data through the project's `acquire` module.
# NOTE(review): 'titanic_db' is presumably the source database name -- confirm in acquire.py.
df = acquire.get_titanic_data('titanic_db')

## 1.) Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?
**ANSWER: Yes. The model's accuracy on the train sample is 69.10%, compared with a baseline accuracy of 61.61%.**


### Prepare
Plan --> Acquire --> **Prepare** --> Explore --> Model --> Deliver

In [128]:
# Count the missing values in every column.
# Plan for this model: `deck` is dropped, and missing `age` values are
# mean-imputed after the train/validate/test split.
df.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [129]:
# Model 1 uses only age, fare and pclass as features, so every other
# non-target column is removed once duplicate rows have been dropped.
unused_columns = [
    'passenger_id', 'sex', 'sibsp', 'parch',
    'embarked', 'class', 'deck', 'embark_town', 'alone',
]
df = df.drop_duplicates().drop(columns=unused_columns)

In [130]:
# Split into train / validate / test with the project's helper, passing the
# name of the target column.
# NOTE(review): presumably stratified on 'survived' with a fixed seed -- confirm in prepare.py.
train, validate, test = prepare.split_function(df, 'survived')

In [131]:
# Mean-impute missing ages. The imputer is fitted on train only, and the value
# it learned is then used to fill the gaps in train, validate and test alike.
imputer = SimpleImputer(missing_values=pd.NA, strategy='mean').fit(train[['age']])

for sample in (train, validate, test):
    sample[['age']] = imputer.transform(sample[['age']])
  

In [132]:
# Separate the features (x) from the target (y) for each sample.
# Only the target column is removed here; the feature set was already
# narrowed when the unused columns were dropped from df.
target = 'survived'

x_train, y_train = train.drop(columns=[target]), train[target]
x_validate, y_validate = validate.drop(columns=[target]), validate[target]
x_test, y_test = test.drop(columns=[target]), test[target]

### Model
Plan --> Acquire --> Prepare --> Explore --> **Model** --> Deliver
#### Create Object:

In [133]:
# Create the logistic regression estimator for model 1. Only random_state is
# set (pinned to a fixed seed); every other hyperparameter keeps its default.
logit = LogisticRegression(random_state=666)

#### Fit Object:

In [134]:
# Fit model 1 on the train sample only; validate and test stay unseen.
logit.fit(x_train, y_train)

#### Transform/Predict:

In [135]:
# Predict classes for the train sample (in-sample predictions, used for the
# train metrics printed in the Evaluate section).
y_pred = logit.predict(x_train)

In [136]:
# Accuracy of model 1 on the train sample (in-sample, not validate).
logit_accuracy = logit.score(x_train, y_train)
print(f'Model accuracy score: {logit_accuracy: .2%}')

Model accuracy score:  69.10%


In [137]:
# Baseline: the accuracy obtained by predicting "did not survive" (0) for
# every passenger in train, i.e. the share of rows where survived is 0.
baseline_accuracy = train['survived'].eq(0).mean()
print(f'Baseline accuracy score: {baseline_accuracy: .2%}')

Baseline accuracy score:  61.61%


## 2.) Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [138]:
# Model 2 data prep: start again from the raw data and keep `sex` this time.
df = acquire.get_titanic_data('titanic_db')

# De-duplicate, then remove every column that is not a feature of this model.
unused_columns = [
    'passenger_id', 'sibsp', 'parch',
    'embarked', 'class', 'deck', 'embark_town', 'alone',
]
df = df.drop_duplicates().drop(columns=unused_columns)

# Encode `sex` as an integer dummy column (first level dropped) and append it
# in place of the original `sex` column.
dummy_df = pd.get_dummies(df[['sex']], drop_first=True, dtype=int)
df = pd.concat([df.drop(columns=['sex']), dummy_df], axis=1)

In [139]:
# Split the model 2 frame into train / validate / test with the project's
# helper, passing the name of the target column.
# NOTE(review): presumably yields the same row partition as the model 1 split -- confirm in prepare.py.
train, validate, test = prepare.split_function(df, 'survived')

In [140]:
# Mean-impute missing ages for the model 2 samples. The imputer is fitted on
# train only and then applied unchanged to train, validate and test.
imputer = SimpleImputer(missing_values=pd.NA, strategy='mean').fit(train[['age']])

for sample in (train, validate, test):
    sample[['age']] = imputer.transform(sample[['age']])

In [141]:
# Separate features (x2) from the target (y2) for model 2.
# Only the target column is removed here, so the encoded sex column stays
# in the feature frames together with the remaining columns.
target = 'survived'

x2_train, y2_train = train.drop(columns=[target]), train[target]
x2_validate, y2_validate = validate.drop(columns=[target]), validate[target]
x2_test, y2_test = test.drop(columns=[target]), test[target]

In [142]:
#### Model 2: the model that includes 'sex'

# Build the estimator and fit it on train in a single step
# (fit returns the fitted estimator itself).
logit_sex = LogisticRegression(random_state=666).fit(x2_train, y2_train)

# In-sample predictions, used for the train metrics.
y_pred_sex = logit_sex.predict(x2_train)

## 3.) Try out other combinations of features and models.

> Let's add 'alone' to our second model.

In [143]:
# Model 3 data prep: start again from the raw data, keeping both `sex` and
# `alone` this time.
df = acquire.get_titanic_data('titanic_db')

# De-duplicate, then remove every column that is not a feature of this model.
unused_columns = [
    'passenger_id', 'sibsp', 'parch',
    'embarked', 'class', 'deck', 'embark_town',
]
df = df.drop_duplicates().drop(columns=unused_columns)

# Encode `sex` as an integer dummy column (first level dropped) and append it
# in place of the original `sex` column.
dummy_df = pd.get_dummies(df[['sex']], drop_first=True, dtype=int)
df = pd.concat([df.drop(columns=['sex']), dummy_df], axis=1)

In [144]:
# Split the model 3 frame into train / validate / test with the project's
# helper, passing the name of the target column.
# NOTE(review): presumably yields the same row partition as the earlier splits -- confirm in prepare.py.
train, validate, test = prepare.split_function(df, 'survived')

In [145]:
# Mean-impute missing ages for the model 3 samples. The imputer is fitted on
# train only and then applied unchanged to train, validate and test.
imputer = SimpleImputer(missing_values=pd.NA, strategy='mean').fit(train[['age']])

for sample in (train, validate, test):
    sample[['age']] = imputer.transform(sample[['age']])

In [146]:
# Separate features (x3) from the target (y3) for model 3.
# Only the target column is removed here, so `alone` and the encoded sex
# column stay in the feature frames together with the remaining columns.
target = 'survived'

x3_train, y3_train = train.drop(columns=[target]), train[target]
x3_validate, y3_validate = validate.drop(columns=[target]), validate[target]
x3_test, y3_test = test.drop(columns=[target]), test[target]

In [147]:
#### Model 3: the model that includes 'sex' and 'alone'

# Build the estimator and fit it on train in a single step
# (fit returns the fitted estimator itself).
logit_sex_alone = LogisticRegression(random_state=666).fit(x3_train, y3_train)

# In-sample predictions, used for the train metrics.
y_pred_sex_alone = logit_sex_alone.predict(x3_train)

In [148]:
# Accuracy of model 3 ('sex' + 'alone') on the train sample.
# NOTE: this reuses the name `logit_accuracy`, replacing the model 1 value
# stored under it earlier in the notebook.
logit_accuracy = logit_sex_alone.score(x3_train, y3_train)
print(f'Model accuracy score: {logit_accuracy: .2%}')

Model accuracy score:  80.90%


> Model 2 looks good. Let's try it with some different hyperparameters.

In [149]:
#### Model 4: 'sex' included, C=5
# Data prep repeats the model 2 recipe: raw data, `sex` kept, `alone` dropped.
df = acquire.get_titanic_data('titanic_db')

# De-duplicate, then remove every column that is not a feature of this model.
unused_columns = [
    'passenger_id', 'sibsp', 'parch',
    'embarked', 'class', 'deck', 'embark_town', 'alone',
]
df = df.drop_duplicates().drop(columns=unused_columns)

# Encode `sex` as an integer dummy column (first level dropped) and append it
# in place of the original `sex` column.
dummy_df = pd.get_dummies(df[['sex']], drop_first=True, dtype=int)
df = pd.concat([df.drop(columns=['sex']), dummy_df], axis=1)

In [150]:
# Split the model 4 frame into train / validate / test with the project's
# helper, passing the name of the target column.
# NOTE(review): presumably yields the same row partition as the earlier splits -- confirm in prepare.py.
train, validate, test = prepare.split_function(df, 'survived')

In [151]:
# Mean-impute missing ages for the re-split samples. The imputer is fitted on
# train only and then applied unchanged to train, validate and test.
imputer = SimpleImputer(missing_values=pd.NA, strategy='mean').fit(train[['age']])

for sample in (train, validate, test):
    sample[['age']] = imputer.transform(sample[['age']])

In [152]:
# Model 4: same features as model 2, but with C=5 instead of the default.
# NOTE(review): this fits on x2_train / y2_train built for model 2 earlier in
# the notebook, not on the data re-acquired and re-split in the cells just
# above -- confirm that is intended.
logit_sex2 = LogisticRegression(C=5, random_state=666).fit(x2_train, y2_train)

# In-sample predictions, used for the train metrics.
y_pred_sex2 = logit_sex2.predict(x2_train)

#### Evaluate:

In [153]:
# Print the train-sample classification metrics for all four models, with a
# separator line between consecutive reports.
separator = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
train_results = [
    (y_train, y_pred),             # model 1: age, fare, pclass
    (y2_train, y_pred_sex),        # model 2: + sex
    (y3_train, y_pred_sex_alone),  # model 3: + sex, alone
    (y2_train, y_pred_sex2),       # model 4: model 2 features, C=5
]
for position, (actual, predicted) in enumerate(train_results):
    if position > 0:
        print(separator)
    print_classification_metrics(actual, predicted)

Accuracy: 0.6910112359550562
True Positive Rate: 0.424390243902439
False Positive Rate: 0.14285714285714285
True Negative Rate: 0.8571428571428571
False Negative Rate: 0.5756097560975609
Precision: 0.6492537313432836
Recall: 0.424390243902439
F1 Score: 0.5132743362831858
Support (0): 205
Support (1): 329
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: 0.8089887640449438
True Positive Rate: 0.7414634146341463
False Positive Rate: 0.14893617021276595
True Negative Rate: 0.851063829787234
False Negative Rate: 0.25853658536585367
Precision: 0.7562189054726368
Recall: 0.7414634146341463
F1 Score: 0.748768472906404
Support (0): 205
Support (1): 329
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: 0.8089887640449438
True Positive Rate: 0.7414634146341463
False Positive Rate: 0.14893617021276595
True Negative Rate: 0.851063829787234
False Negative Rate: 0.25853658536585367
Precision: 0.7562189054726368
Recall: 0.7414634146341463
F1 Score: 0.748768472906404
Support (0):

In [154]:
# Train-sample accuracy of every model, followed by the baseline.

# Model 1: age, fare, pclass
logit_accuracy = logit.score(x_train, y_train)
print(f'Model accuracy score: {logit_accuracy: .2%}')

# Model 2: model 1 features + sex
logit_sex_accuracy = logit_sex.score(x2_train, y2_train)
print(f'Model 2 accuracy score: {logit_sex_accuracy: .2%}')

# Model 3: model 2 features + alone
logit_sex_alone_accuracy = logit_sex_alone.score(x3_train, y3_train)
print(f'Model 3 accuracy score: {logit_sex_alone_accuracy: .2%}')

# Model 4: model 2 features with C=5. Report this model's own score; the
# earlier version printed the model 3 value again under a "Model 3" label.
logit_sex2_accuracy = logit_sex2.score(x2_train, y2_train)
print(f'Model 4 accuracy score: {logit_sex2_accuracy: .2%}')

# Baseline: share of train passengers who did not survive (always predict 0).
baseline_accuracy = (train['survived'] == 0).mean()
print(f'Baseline accuracy score: {baseline_accuracy: .2%}')

Model accuracy score:  69.10%
Model 2 accuracy score:  80.90%
Model 3 accuracy score:  80.90%
Model 3 accuracy score:  80.90%
Baseline accuracy score:  61.61%


## 4.) Use your best 3 models to predict and evaluate on your validate sample.

In [155]:
# Predict on the validate sample with models 2, 3 and 4. Each model gets the
# validate features that match its own training columns.
# NOTE: these assignments replace the train-sample predictions that were
# stored under the same names earlier in the notebook.
y_pred_sex = logit_sex.predict(x2_validate)
y_pred_sex_alone = logit_sex_alone.predict(x3_validate)
y_pred_sex2 = logit_sex2.predict(x2_validate)

In [156]:
# Validate-sample metrics for model 2 (includes 'sex').
print_classification_metrics(y2_validate, y_pred_sex)

Accuracy: 0.7247191011235955
True Positive Rate: 0.5882352941176471
False Positive Rate: 0.19090909090909092
True Negative Rate: 0.8090909090909091
False Negative Rate: 0.4117647058823529
Precision: 0.6557377049180327
Recall: 0.5882352941176471
F1 Score: 0.6201550387596899
Support (0): 68
Support (1): 110


In [157]:
# Validate-sample metrics for model 3 (includes 'sex' and 'alone').
print_classification_metrics(y3_validate, y_pred_sex_alone)

Accuracy: 0.7247191011235955
True Positive Rate: 0.5882352941176471
False Positive Rate: 0.19090909090909092
True Negative Rate: 0.8090909090909091
False Negative Rate: 0.4117647058823529
Precision: 0.6557377049180327
Recall: 0.5882352941176471
F1 Score: 0.6201550387596899
Support (0): 68
Support (1): 110


In [158]:
# Validate-sample metrics for model 4 (model 2 features, C=5).
print_classification_metrics(y2_validate, y_pred_sex2)

Accuracy: 0.7247191011235955
True Positive Rate: 0.5882352941176471
False Positive Rate: 0.19090909090909092
True Negative Rate: 0.8090909090909091
False Negative Rate: 0.4117647058823529
Precision: 0.6557377049180327
Recall: 0.5882352941176471
F1 Score: 0.6201550387596899
Support (0): 68
Support (1): 110
