In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from pydataset import data
import os
import env
import acquire 
import prepare

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression


import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides any pandas or scikit-learn warnings raised by
# later cells (e.g. chained-assignment or convergence warnings) — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


In [2]:
titan = acquire.get_titanic_data()

In [3]:
titan.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [4]:
# Keep only the target and the three features used by the first model.
# Selecting the columns (rather than dropping the others in place) makes the
# cell safe to re-run: a second in-place drop raises KeyError because the
# dropped columns are already gone.
titan = titan[['survived', 'pclass', 'age', 'fare']].copy()

In [5]:
titan.head(3)

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925


In [6]:
titan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   age       714 non-null    float64
 3   fare      891 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 34.8 KB


In [7]:
titan.isna().sum()

survived      0
pclass        0
age         177
fare          0
dtype: int64

In [8]:
# 177 of the 891 ages are missing (almost 20% of the rows) - too many to drop,
# so the nulls will be filled with the mean age instead.
round(titan['age'].mean())

30

In [9]:
# Impute missing ages with the rounded mean age. The fill is restricted to the
# 'age' column so an age value can never be written into another column that
# happens to contain nulls.
# NOTE(review): the mean is computed before the train/validate/test split, so
# it is informed by rows that later land in validate/test — consider computing
# it from the training rows only.
titan = titan.fillna({'age': round(titan['age'].mean())})

In [10]:
# Null values have been eliminated.
titan.isna().sum()

survived    0
pclass      0
age         0
fare        0
dtype: int64

In [11]:
# Hold out 20% of the rows as the test set, stratified on the target.
train, test = train_test_split(
    titan,
    train_size=0.8,
    stratify=titan['survived'],
    random_state=123,
)


In [12]:
train.shape

(712, 4)

In [13]:
# Carve the validate set out of the remaining rows (70% stay in train).
# NOTE(review): this rebinds `train`, so re-running only this cell splits the
# already-reduced train set a second time.
train, validate = train_test_split(
    train,
    train_size=0.7,
    stratify=train['survived'],
    random_state=123,
)

In [14]:
train.shape, validate.shape, test.shape

((498, 4), (214, 4), (179, 4))

In [15]:
# Separate the feature columns from the 'survived' target for each split.
X_train, y_train = train.drop(columns=['survived']), train['survived']
X_val, y_val = validate.drop(columns=['survived']), validate['survived']
X_test, y_test = test.drop(columns=['survived']), test['survived']

### Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?



In [16]:
# Work on a copy: plain assignment would only alias `train`, so adding a
# column to baseline_df would silently add it to `train` as well.
baseline_df = train.copy()

In [17]:
# Baseline = always predict the most frequent class in the training rows.
# Computed from the data rather than hard-coded, so it stays correct if the
# class balance of the sample changes.
baseline_df['baseline'] = baseline_df['survived'].mode()[0]

In [18]:
# Accuracy of the constant-prediction baseline on train (about 62%).
baseline = accuracy_score(
    y_true=baseline_df['survived'],
    y_pred=baseline_df['baseline'],
)

In [19]:
# First model: L2-penalised logistic regression, at most 100 solver iterations.
logit = LogisticRegression(
    penalty='l2',
    max_iter=100,
    random_state=123,
)

In [20]:
logit.fit(X_train, y_train)

In [21]:
logit.score(X_train, y_train)

0.7028112449799196

In [22]:
logit.score(X_val, y_val)

0.7102803738317757

In [23]:
baseline

0.6164658634538153

In [24]:
# Class predictions from the first model on the train and validate features.
y_pred, y_pred_val = logit.predict(X_train), logit.predict(X_val)

In [25]:
# Print the train report, a divider, then the validate report for model 1.
report_inputs = [
    ('Train Classification Report', '============================', y_train, y_pred),
    ('Validate Classification Report', '===============================', y_val, y_pred_val),
]
for position, (title, underline, actual, predicted) in enumerate(report_inputs):
    if position:
        print('-------------------------------------------------------\n')
    print(title)
    print(underline)
    print(classification_report(actual, predicted))

Train Classification Report
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.44      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498

-------------------------------------------------------

Validate Classification Report
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       132
           1       0.70      0.43      0.53        82

    accuracy                           0.71       214
   macro avg       0.71      0.66      0.66       214
weighted avg       0.71      0.71      0.69       214



<div class="alert alert-block alert-success">
<b>Both the Train and Validate scores outperform the baseline.</b>
</div>


#### Graph of Model 1

### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [26]:
titan1 = acquire.get_titanic_data()

In [27]:
# Keep the target plus pclass, sex, age and fare for the second model.
# Selecting the columns (rather than dropping the others in place) lets the
# cell be re-run without a KeyError on already-removed columns.
titan1 = titan1[['survived', 'pclass', 'sex', 'age', 'fare']].copy()

In [28]:
titan1.head(3)

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925


In [29]:
# Impute missing ages with the rounded mean age of this dataset. The mean is
# taken from titan1 itself, so the cell does not depend on the separately
# imputed `titan` frame, and only the 'age' column is filled, so no other
# column can receive an age value.
titan1 = titan1.fillna({'age': round(titan1['age'].mean())})

In [30]:
# One indicator column per category found in 'sex'; nulls get no column.
the_dummy = pd.get_dummies(titan1['sex'], dummy_na=False)

In [31]:
# Append the indicator columns to titan1, aligned on the row index.
titan1 = pd.concat([titan1, the_dummy], axis='columns')

In [32]:
# The indicator columns replace the original text column, so remove 'sex'.
titan1 = titan1.drop(columns=['sex'])

In [33]:
titan1.head(2)

Unnamed: 0,survived,pclass,age,fare,female,male
0,0,3,22.0,7.25,0,1
1,1,1,38.0,71.2833,1,0


In [34]:
# First split of the sex-inclusive dataset: 80% train+validate, 20% test,
# stratified on the target.
train, test = train_test_split(
    titan1,
    train_size=0.8,
    stratify=titan1['survived'],
    random_state=123,
)

In [35]:
# Second split: 70% of the remaining rows stay in train, the rest validate.
train, validate = train_test_split(
    train,
    train_size=0.7,
    stratify=train['survived'],
    random_state=123,
)

In [36]:
train.shape, validate.shape, test.shape

((498, 6), (214, 6), (179, 6))

In [37]:
# Rebuild the X/y splits from the new train/validate/test frames, which now
# include the encoded sex columns (female, male) in addition to pclass, age
# and fare.
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_val = validate.drop(columns=['survived'])
y_val = validate['survived']

X_test = test.drop(columns=['survived'])
y_test = test['survived']

In [38]:
# Second model: same settings as the first, to be fit on the sex-inclusive features.
logit2 = LogisticRegression(
    penalty='l2',
    max_iter=100,
    random_state=123,
)

In [39]:
logit2.fit(X_train, y_train)

In [40]:
# Train Accuracy
logit2.score(X_train, y_train)

0.8112449799196787

In [41]:
# Validate Accuracy
logit2.score(X_val, y_val)

0.7710280373831776

In [42]:
baseline

0.6164658634538153

In [43]:
# Class predictions from the second model on the train and validate features.
y_pred1, y_pred1_val = logit2.predict(X_train), logit2.predict(X_val)

In [44]:
# Print the train report, a divider, then the validate report for model 2.
report_inputs = [
    ('Train Classification Report', '============================', y_train, y_pred1),
    ('Validate Classification Report', '===============================', y_val, y_pred1_val),
]
for position, (title, underline, actual, predicted) in enumerate(report_inputs):
    if position:
        print('-------------------------------------------------------\n')
    print(title)
    print(underline)
    print(classification_report(actual, predicted))

Train Classification Report
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       307
           1       0.77      0.72      0.75       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.80       498
weighted avg       0.81      0.81      0.81       498

-------------------------------------------------------

Validate Classification Report
              precision    recall  f1-score   support

           0       0.80      0.83      0.82       132
           1       0.71      0.67      0.69        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



#### Graph of Model 2

### Try out other combinations of features and models.

Continuing to use titan1 dataset, which has the sex column included

Switching the penalty to 'l1' (all or nothing) with the 'liblinear' solver, and keeping max_iter = 100. Other hyperparameters will be added or adjusted.

#### Master Sifu

In [45]:
# L1-penalised logistic regression using the liblinear solver.
master_sifu = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=100,
    random_state=123,
)

In [46]:
master_sifu.fit(X_train, y_train)

In [47]:
# Master Sifu's training score
master_sifu.score(X_train, y_train)

0.8012048192771084

In [48]:
# Master Sifu's validating score
master_sifu.score(X_val, y_val)

0.7663551401869159

In [49]:
# Baseline score
baseline

0.6164658634538153

In [50]:
# Gap between Master Sifu's train and validate accuracy.
sifu_train_acc = master_sifu.score(X_train, y_train)
sifu_val_acc = master_sifu.score(X_val, y_val)
difference = sifu_train_acc - sifu_val_acc
difference

0.03484967909019254

#### Master Sensei

In [51]:
# Same as Master Sifu, plus class_weight='balanced'.
master_sensei = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    max_iter=100,
    random_state=123,
)

In [52]:
master_sensei.fit(X_train, y_train)

In [53]:
# Master Sensei's training score
master_sensei.score(X_train, y_train)

0.785140562248996

In [54]:
# Master Sensei's validation score
master_sensei.score(X_val, y_val)

0.7570093457943925

In [55]:
baseline

0.6164658634538153

In [56]:
# Gap between Master Sensei's train and validate accuracy.
sensei_train_acc = master_sensei.score(X_train, y_train)
sensei_val_acc = master_sensei.score(X_val, y_val)
difference1 = sensei_train_acc - sensei_val_acc
difference1

0.02813121645460348

#### Master Meister

In [57]:
# Same as Master Sensei, with C set to 2.
master_meister = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    C=2,
    max_iter=100,
    random_state=123,
)

In [58]:
master_meister.fit(X_train, y_train)

In [59]:
master_meister.score(X_train, y_train)

0.7951807228915663

In [60]:
master_meister.score(X_val, y_val)

0.7523364485981309

In [61]:
baseline

0.6164658634538153

In [62]:
# Gap between Master Meister's train and validate accuracy.
meister_train_acc = master_meister.score(X_train, y_train)
meister_val_acc = master_meister.score(X_val, y_val)
difference2 = meister_train_acc - meister_val_acc
difference2

0.0428442742934354

In [63]:
# Ranked by the smallest gap between train and validate accuracy
# (order shown: Sifu, Sensei, Meister):
# 1st: Master Sensei, 2nd: Master Sifu, 3rd: Master Meister
difference, difference1, difference2

(0.03484967909019254, 0.02813121645460348, 0.0428442742934354)

### Use your best 3 models to predict and evaluate on your validate sample.

In [64]:
master_sifu.score(X_val, y_val)

0.7663551401869159

In [65]:
master_sensei.score(X_val, y_val)

0.7570093457943925

In [66]:
master_meister.score(X_val, y_val)

0.7523364485981309

### Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [67]:
master_sifu.score(X_test, y_test)

0.7988826815642458

In [68]:
master_sifu.score(X_val, y_val)

0.7663551401869159

In [69]:
master_sifu.score(X_train, y_train)

0.8012048192771084

<div class="alert alert-block alert-success">
<b>The train dataset scored the highest, followed by the test dataset. The validate dataset had the lowest accuracy score of the three.</b>
</div>