# Logistic Regression Exercises

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from env import user, password, host
from acquire import get_titanic_data
from prepare import prep_titanic

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [2]:
df = prep_titanic()
df.head()

AttributeError: 'tuple' object has no attribute 'head'

In [None]:
df.shape

In [None]:
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])

In [None]:
df.head()

In [None]:
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)

In [None]:
X = df[['first_class', 'second_class','age','fare']]
y = df[['survived']]
print(X)
print(y)

In [None]:
# Creating train, validate, test sets
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)

X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

In [None]:
# Imputing missing age values
print(X_train.age.mean()) # We will use the train set mean age for our imputed mean 
print(X_validate.age.mean())
print(X_test.age.mean())

In [None]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_train.head(10)

In [None]:
X_train.info()

In [None]:
X_train.shape

In [None]:
X_validate.head()

In [None]:
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_validate.head()

In [None]:
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test.head()

The reasoning for using the train set mean age in all sets is that our model is built on the inherent assumption that our average age is 29.17. When we receive data we have never seen before (which is what the validate and test set represent) we would have no way of knowing what the mean age for future data would be. We can only use what we know for sure, which is our train data. 

### Q1. Start by defining your baseline model.

In [None]:
y_train.survived.value_counts()

The majority of passengers on the titanic appeared to have died. We will use a model of always predicting "not survived" as our baseline to compare future models to. 

In [None]:
# If that baseline model was applied to the train data
baseline_accuracy_train = round(1 - y_train.survived.mean(),2)
baseline_accuracy_train

Our baseline model has a 62% accuracy on the training data.

### Q2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

We previously optimized our data to create the model this question is asking.

In [None]:
X_train.head()

In [None]:
X_train.describe()

In [None]:
y_train.describe()

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [None]:
model1 = LogisticRegression(C = 1)
model1.fit(X_train, y_train)
print('Coefficient: ', model1.coef_)
print('Intercept: ', model1.intercept_)

In [None]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model1.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model1.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model1.score(X_train, y_train)))

In [None]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

#### Our Model is identical to baseline. It predicted that each passenger would die. 

#### VALIDATE MODEL ONE

In [None]:
y_pred1 = model1.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model1.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred1))

print(classification_report(y_validate, y_pred1))

### Q3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [None]:
# Recreating our X_train DataFrame with 'female' added
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['first_class', 'second_class','age','fare', 'female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())

In [None]:
X_train.head()

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [None]:
model2 = LogisticRegression(C = 1)
model2.fit(X_train, y_train)
print('Coefficient: ', model2.coef_)
print('Intercept: ', model2.intercept_)

In [None]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model2.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model2.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model2.score(X_train, y_train)))

In [None]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

#### Even after adding in sex, our Model is identical to baseline. It predicted that each passenger would die. 

#### VALIDATE MODEL 2

In [None]:
y_pred2 = model2.predict(X_validate)
print("Model 2")

print('Accuracy: {:.2f}'.format(model2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

### Q4. Try out other combinations of features and models.

We will run a model that only has class, sex, and age as features

In [None]:
# Recreating our X_train DataFrame with only class, sex, and age as features
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['first_class', 'second_class','age', 'female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
X_train.iloc[:,:] = mean_imputer.fit_transform(X_train)
X_validate['age'].fillna(inplace=True, value=X_train['age'].mean())
X_test['age'].fillna(inplace=True, value=X_train['age'].mean())

In [None]:
X_train.head()

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

In [None]:
model3 = LogisticRegression(C = 1)
model3.fit(X_train, y_train)
print('Coefficient: ', model3.coef_)
print('Intercept: ', model3.intercept_)

In [None]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model3.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model3.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model3.score(X_train, y_train)))

In [None]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

#### VALIDATE MODEL 3

In [None]:
y_pred3 = model3.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model3.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred3))

print(classification_report(y_validate, y_pred3))

### CREATING A ONE FEATURE MODEL (SEX ONLY)

In [None]:
# Recreating our X_train DataFrame with only class, sex, and age as features
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
df['first_class'] = df['pclass'].apply(lambda p: 1 if p == 1 else 0)
df['second_class'] = df['pclass'].apply(lambda p: 1 if p == 2 else 0)
X = df[['female']]
y = df[['survived']]
X_train_validate, X_test  = train_test_split(X, test_size = .20, random_state = 123)
y_train_validate, y_test = train_test_split(y, test_size = .20, random_state = 123, stratify = y.survived)
X_train, X_validate  = train_test_split(X_train_validate, test_size = .30, random_state = 123)
y_train, y_validate = train_test_split(y_train_validate, test_size = .30, random_state = 123, stratify = y_train_validate.survived)

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
y_train.head()

In [None]:
y_train.shape

In [None]:
model4 = LogisticRegression()
model4.fit(X_train, y_train)
print('Coefficient: ', model4.coef_)
print('Intercept: ', model4.intercept_)

In [None]:
#Estimate whether or not a passenger would survive, using the training data
y_pred = model4.predict(X_train)
#Estimate the probability of a passenger surviving, using the training data
y_pred_proba = model4.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model4.score(X_train, y_train)))

In [None]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

#### Unsure why these model coefficients are being calculated so low. It is clear that sex has an incredible influence on survival rate, yet the model is giving a tiny coefficent. Something is not being calculated correctly in the model. 

### Q5. Use your best 3 models to predict and evaluate on your validate sample

Considering that no model exceeded baseline, we could use any model. 

For the sake of the kernel flow, this question was retroactively added to earlier cells.

#### VALIDATE MODEL 4

In [None]:
y_pred4 = model4.predict(X_validate)
print("Model 4")

print('Accuracy: {:.2f}'.format(model4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

### Q6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

#### TEST MODEL 4

In [None]:
y_pred4 = model4.predict(X_test)
y_pred4_proba = model4.predict_proba(X_test)

print("Model 4")

print('Accuracy: {:.2f}'.format(model4.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred4))

print(classification_report(y_test, y_pred4))

All performance measures are identical to validate, train, and baseline. No features were identified to have any impact on survival. This is obviously wrong, but I do not know why these models are not producing large enough coefficients. 

I will confer with the Data Science cohort.