# Logistic Regression Exercises

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from env import user, password, host
from acquire import get_titanic_data
from prepare import prep_titanic

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [2]:
df = prep_titanic()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,0,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,0,1


In [3]:
df.shape

(889, 14)

In [4]:
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,female
0,0,0,3,22.0,1,0,7.25,S,Third,Southampton,0,0,1,0
1,1,1,1,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0,1
2,2,1,3,26.0,0,0,7.925,S,Third,Southampton,1,0,1,1
3,3,1,1,35.0,1,0,53.1,S,First,Southampton,0,0,1,1
4,4,0,3,35.0,0,0,8.05,S,Third,Southampton,1,0,1,0


In [6]:
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)

In [7]:
X = df[['first_class', 'second_class','age','fare']]
y = df[['survived']]
print(X)
print(y)

     first_class  second_class   age     fare
0              0             0  22.0   7.2500
1              1             0  38.0  71.2833
2              0             0  26.0   7.9250
3              1             0  35.0  53.1000
4              0             0  35.0   8.0500
..           ...           ...   ...      ...
886            0             1  27.0  13.0000
887            1             0  19.0  30.0000
888            0             0   NaN  23.4500
889            1             0  26.0  30.0000
890            0             0  32.0   7.7500

[889 rows x 4 columns]
     survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[889 rows x 1 columns]


In [8]:
# Creating train, validate, test sets
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)

X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (497, 4) , validate:  (214, 4) , test:  (178, 4)
train:  (497, 1) , validate:  (214, 1) , test:  (178, 1)


In [9]:
# Imputing missing age values
print(X_train.age.mean()) # We will use the train set mean age for our imputed mean 
print(X_validate.age.mean())
print(X_test.age.mean())

29.166666666666668
29.74525714285714
30.849290780141843


In [10]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_train.head(10)

Unnamed: 0,first_class,second_class,age,fare
689,1.0,0.0,15.0,211.3375
191,0.0,1.0,19.0,13.0
634,0.0,0.0,9.0,27.9
623,0.0,0.0,21.0,7.8542
244,0.0,0.0,30.0,7.225
264,0.0,0.0,29.166667,7.75
29,0.0,0.0,29.166667,7.8958
412,1.0,0.0,33.0,90.0
7,0.0,0.0,2.0,21.075
375,1.0,0.0,29.166667,82.1708


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 689 to 282
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   first_class   497 non-null    float64
 1   second_class  497 non-null    float64
 2   age           497 non-null    float64
 3   fare          497 non-null    float64
dtypes: float64(4)
memory usage: 19.4 KB


In [12]:
X_train.shape

(497, 4)

In [13]:
X_validate.head()

Unnamed: 0,first_class,second_class,age,fare
875,0,0,15.0,7.225
612,0,0,,15.5
53,0,1,29.0,26.0
335,0,0,,7.8958
393,1,0,23.0,113.275


In [14]:
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_validate.head()

Unnamed: 0,first_class,second_class,age,fare
875,0,0,15.0,7.225
612,0,0,29.166667,15.5
53,0,1,29.0,26.0
335,0,0,29.166667,7.8958
393,1,0,23.0,113.275


In [15]:
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test.head()

Unnamed: 0,first_class,second_class,age,fare
173,0,0,21.0,7.925
525,0,0,40.5,7.75
453,1,0,49.0,89.1042
171,0,0,4.0,29.125
883,0,1,28.0,10.5


The reasoning for using the train set mean age in all sets is that our model is built on the inherent assumption that our average age is 29.17. When we receive data we have never seen before (which is what the validate and test set represent) we would have no way of knowing what the mean age for future data would be. We can only use what we know for sure, which is our train data. 

### Q1. Start by defining your baseline model.

In [16]:
y_train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

The majority of passengers on the titanic appeared to have died. We will use a model of always predicting "not survived" as our baseline to compare future models to. 

In [17]:
# If that baseline model was applied to the train data
baseline_accuracy_train = round(1 - y_train.survived.mean(),2)
baseline_accuracy_train

0.62

Our baseline model has a 62% accuracy on the training data.

### Q2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

We previously optimized our data to create the model this question is asking.

In [18]:
X_train.head()

Unnamed: 0,first_class,second_class,age,fare
689,1.0,0.0,15.0,211.3375
191,0.0,1.0,19.0,13.0
634,0.0,0.0,9.0,27.9
623,0.0,0.0,21.0,7.8542
244,0.0,0.0,30.0,7.225


In [19]:
X_train.describe()

Unnamed: 0,first_class,second_class,age,fare
count,497.0,497.0,497.0,497.0
mean,0.229376,0.207243,29.166667,31.946201
std,0.420855,0.40574,12.9645,53.919408
min,0.0,0.0,0.42,0.0
25%,0.0,0.0,22.0,7.925
50%,0.0,0.0,29.166667,13.8583
75%,0.0,0.0,35.0,30.5
max,1.0,1.0,80.0,512.3292


In [20]:
y_train.describe()

Unnamed: 0,survived
count,497.0
mean,0.382294
std,0.486437
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [22]:
model1 = LogisticRegression(C = 1)
model1.fit(X_train, y_train)
print('Coefficient: ', model1.coef_)
print('Intercept: ', model1.intercept_)

Coefficient:  [[0.09468532 0.0836866  0.02836107 0.01257167]]
Intercept:  [-0.48158488]


In [23]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model1.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model1.predict_proba(X_train)

In [24]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model1.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.62


In [25]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [190   0]]


In [26]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       0.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.31      0.50      0.38       497
weighted avg       0.38      0.62      0.47       497



#### Our Model is identical to baseline. It predicted that each passenger would die. 

#### VALIDATE MODEL ONE

In [27]:
y_pred1 = model1.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model1.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred1))

print(classification_report(y_validate, y_pred1))

Model 4
Accuracy: 0.62
[[132   0]
 [ 82   0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76       132
           1       0.00      0.00      0.00        82

    accuracy                           0.62       214
   macro avg       0.31      0.50      0.38       214
weighted avg       0.38      0.62      0.47       214



### Q3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [28]:
# Recreating our X_train DataFrame with 'female' added
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['first_class', 'second_class','age','fare', 'female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())

In [29]:
X_train.head()

Unnamed: 0,first_class,second_class,age,fare,female
689,1.0,0.0,15.0,211.3375,1.0
191,0.0,1.0,19.0,13.0,0.0
634,0.0,0.0,9.0,27.9,1.0
623,0.0,0.0,21.0,7.8542,0.0
244,0.0,0.0,30.0,7.225,0.0


In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [31]:
model2 = LogisticRegression(C = 1)
model2.fit(X_train, y_train)
print('Coefficient: ', model2.coef_)
print('Intercept: ', model2.intercept_)

Coefficient:  [[0.09164702 0.08169136 0.03254294 0.00943394 0.02611587]]
Intercept:  [-0.48165814]


In [32]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model2.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model2.predict_proba(X_train)

In [33]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model2.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.62


In [34]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [190   0]]


In [35]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       0.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.31      0.50      0.38       497
weighted avg       0.38      0.62      0.47       497



#### Even after adding in sex, our Model is identical to baseline. It predicted that each passenger would die. 

#### VALIDATE MODEL 2

In [36]:
y_pred2 = model2.predict(X_validate)
print("Model 2")

print('Accuracy: {:.2f}'.format(model2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

Model 2
Accuracy: 0.62
[[132   0]
 [ 82   0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76       132
           1       0.00      0.00      0.00        82

    accuracy                           0.62       214
   macro avg       0.31      0.50      0.38       214
weighted avg       0.38      0.62      0.47       214



### Q4. Try out other combinations of features and models.

We will run a model that only has class, sex, and age as features

In [37]:
# Recreating our X_train DataFrame with only class, sex, and age as features
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['first_class', 'second_class','age', 'female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())

In [38]:
X_train.head()

Unnamed: 0,first_class,second_class,age,female
689,1.0,0.0,15.0,1.0
191,0.0,1.0,19.0,0.0
634,0.0,0.0,9.0,1.0
623,0.0,0.0,21.0,0.0
244,0.0,0.0,30.0,0.0


In [39]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [40]:
model3 = LogisticRegression(C = 1)
model3.fit(X_train, y_train)
print('Coefficient: ', model3.coef_)
print('Intercept: ', model3.intercept_)

Coefficient:  [[0.097213   0.08216342 0.03174524 0.02694864]]
Intercept:  [-0.48165352]


In [41]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model3.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model3.predict_proba(X_train)

In [42]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model3.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.62


In [43]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [190   0]]


In [44]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       0.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.31      0.50      0.38       497
weighted avg       0.38      0.62      0.47       497



#### VALIDATE MODEL 3

In [45]:
y_pred3 = model3.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model3.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred3))

print(classification_report(y_validate, y_pred3))

Model 4
Accuracy: 0.62
[[132   0]
 [ 82   0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76       132
           1       0.00      0.00      0.00        82

    accuracy                           0.62       214
   macro avg       0.31      0.50      0.38       214
weighted avg       0.38      0.62      0.47       214



### CREATING A ONE FEATURE MODEL (SEX ONLY)

In [46]:
# Recreating our X_train DataFrame with only class, sex, and age as features
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)

In [47]:
X_train.head()

Unnamed: 0,female
689,1
191,0
634,1
623,0
244,0


In [48]:
X_train.shape

(497, 1)

In [49]:
y_train.head()

Unnamed: 0,survived
583,0
337,1
50,0
218,1
31,1


In [50]:
y_train.shape

(497, 1)

In [51]:
model4 = LogisticRegression()
model4.fit(X_train, y_train)
print('Coefficient: ', model4.coef_)
print('Intercept: ', model4.intercept_)

Coefficient:  [[0.07537975]]
Intercept:  [-0.50651636]


In [52]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model4.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model4.predict_proba(X_train)

In [53]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model4.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.62


In [54]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [190   0]]


In [55]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       0.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.31      0.50      0.38       497
weighted avg       0.38      0.62      0.47       497



#### Unsure why these model coefficients are being calculated so low. It is clear that sex has an incredible influence on survival rate, yet the model is giving a tiny coefficent. Something is not being calculated correctly in the model. 

### Q5. Use your best 3 models to predict and evaluate on your validate sample

Considering that no model exceeded baseline, we could use any model. 

For the sake of the kernel flow, this question was retroactively added to earlier cells.

#### VALIDATE MODEL 4

In [56]:
y_pred4 = model4.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

Model 4
Accuracy: 0.62
[[132   0]
 [ 82   0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76       132
           1       0.00      0.00      0.00        82

    accuracy                           0.62       214
   macro avg       0.31      0.50      0.38       214
weighted avg       0.38      0.62      0.47       214



### Q6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

#### TEST MODEL 4

In [57]:
y_pred4 = model4.predict(X_test)
y_pred4_proba = model4.predict_proba(X_test)

print("Model 4")

print('Accuracy: {:.2f}'.format(model4.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred4))

print(classification_report(y_test, y_pred4))

Model 4
Accuracy: 0.62
[[110   0]
 [ 68   0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.76       110
           1       0.00      0.00      0.00        68

    accuracy                           0.62       178
   macro avg       0.31      0.50      0.38       178
weighted avg       0.38      0.62      0.47       178



All performance measures are identical to validate, train, and baseline. No features were identified to have any impact on survival. This is obviously wrong, but I do not know why these models are not producing large enough coefficients. 

I will confer with the Data Science cohort.