In [85]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import acquire
import prepare


from pydataset import data

In [26]:
# Load the Titanic data through the project-local acquire module.
# NOTE(review): where new_titanic_data() reads from (database, file, cache)
# is not visible in this notebook -- confirm against acquire.py.
df = acquire.new_titanic_data()

In [35]:
# Unpack the three frames returned by the project-local prepare module as
# the train / validate / test splits.
# NOTE(review): what cleaning prep_titanic applies is not visible here; the
# X_train.head() output further down still shows missing values in age and
# deck and string-typed columns (sex, embarked, class, ...).
train, validate, test = prepare.prep_titanic(df)

In [46]:
# Separate each split into a feature frame (every column except the target)
# and the target series 'survived'.
X_train = train.drop(columns='survived')
y_train = train['survived']

X_validate = validate.drop(columns='survived')
y_validate = validate['survived']

X_test = test.drop(columns='survived')
y_test = test['survived']

What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [47]:
# Baseline determination - the most frequent class is 0 (did not survive)
train.survived.value_counts()
# Baseline accuracy = 61.6% (307/498): 307 rows of class 0 out of 307 + 191 = 498 training rows

0    307
1    191
Name: survived, dtype: int64

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [58]:
# Set the baseline: predict class 0 for every row, stored as a constant
# 'baseline' column.
# NOTE(review): the column is added to the feature frame X_train, and X_train
# is rebuilt from train[X_cols] in a later cell, so when the notebook is run
# in order this column never reaches a model.
X_train['baseline'] = 0

In [59]:
# Preview the first rows of the feature frame, including the constant
# 'baseline' column added in the previous cell.
X_train.head()

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline
583,583,1,male,36.0,0,0,40.125,C,First,A,Cherbourg,1,0
165,165,3,male,9.0,0,2,20.525,S,Third,,Southampton,0,0
50,50,3,male,7.0,4,1,39.6875,S,Third,,Southampton,0,0
259,259,2,female,50.0,0,1,26.0,S,Second,,Southampton,0,0
306,306,1,female,,0,0,110.8833,C,First,,Cherbourg,1,0


In [65]:
# Model inputs are limited to two columns; the target column is named once
# and reused for every split. This replaces the earlier X_*/y_* variables
# (including the X_train frame that carried the 'baseline' column).
X_cols = ['pclass', 'fare']
y_col = 'survived'

X_train = train.loc[:, X_cols]
y_train = train.loc[:, y_col]

X_validate = validate.loc[:, X_cols]
y_validate = validate.loc[:, y_col]

X_test = test.loc[:, X_cols]
y_test = test.loc[:, y_col]

In [67]:
# Fit a decision tree on the selected features and report accuracy on the
# train and validate splits.
# random_state is pinned (same seed as the random forest cell) so the fitted
# tree, its scores and every prediction made from model1 are identical on each
# Restart & Run All; an unseeded DecisionTreeClassifier can differ between
# runs because candidate features are randomly permuted at each split.
# NOTE(review): the recorded run shows a gap between training (83.94%) and
# validate (71.03%) accuracy; no depth limit is set here -- consider trying
# max_depth if the exercise allows it.
model1 = DecisionTreeClassifier(random_state=123)
model1.fit(X_train, y_train)

print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

training score: 83.94%
validate score: 71.03%


In [70]:
# Display the decision tree's predicted class labels for the validate split
# (the full array is shown as the cell output).
model1.predict(X_validate)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [72]:
# Create a new 'prediction' column on the train dataframe that contains the
# decision tree's predictions for the training rows.
# NOTE(review): a later cell writes the random forest predictions into this
# same column, so any report built from train.prediction reflects whichever
# of the two cells ran last.
train['prediction'] = model1.predict(X_train)

In [74]:
# Compare the stored predictions with the actual 'survived' values and print
# the text classification report (per-class precision / recall / f1 plus
# overall accuracy). When the notebook is run in order, the 'prediction'
# column holds the decision tree's training predictions at this point.
print(
    classification_report(
        y_true=train['survived'],
        y_pred=train['prediction'],
        zero_division=True,
    )
)

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       307
           1       0.93      0.63      0.75       191

    accuracy                           0.84       498
   macro avg       0.87      0.80      0.82       498
weighted avg       0.85      0.84      0.83       498



In [80]:
# Same report as a dataframe: request the dict form, then flip it so each
# class / average is a row and each metric is a column.
pd.DataFrame(
    classification_report(
        y_true=train['survived'],
        y_pred=train['prediction'],
        output_dict=True,
    )
).T

Unnamed: 0,precision,recall,f1-score,support
0,0.807588,0.970684,0.881657,307.0
1,0.930233,0.628272,0.75,191.0
accuracy,0.839357,0.839357,0.839357,0.839357
macro avg,0.86891,0.799478,0.815828,498.0
weighted avg,0.854626,0.839357,0.831162,498.0


## Random Forest Exercises ##

In [122]:
# Exercise: fit the Random Forest classifier to the training sample and make
# predictions on it, with a fixed random_state, min_samples_leaf = 1 and
# max_depth = 10.
# NOTE(review): this cell actually uses min_samples_leaf=5 and max_depth=5,
# and the scores recorded below come from those values -- presumably left
# over from the later "increase min_samples_leaf / decrease max_depth" step.
# Confirm which configuration is intended before relying on the numbers.
model_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    min_samples_leaf=5,
    bootstrap=True,
    class_weight=None,
    random_state=123,
).fit(X_train, y_train)

# Accuracy on the data the forest was fit on vs. the held-out validate split.
rf_train_accuracy = model_rf.score(X_train, y_train)
rf_validate_accuracy = model_rf.score(X_validate, y_validate)

print(f'training score: {rf_train_accuracy:.2%}')
print(f'validate score: {rf_validate_accuracy:.2%}')

training score: 74.90%
validate score: 69.63%


In [123]:
# Evaluate your results using the model score, confusion matrix, and classification report.

# Model score: accuracy of the fitted random forest on each split
rf_train_accuracy = model_rf.score(X_train, y_train)
rf_validate_accuracy = model_rf.score(X_validate, y_validate)

print(f'training score: {rf_train_accuracy:.2%}')
print(f'validate score: {rf_validate_accuracy:.2%}')

training score: 74.90%
validate score: 69.63%


In [124]:
# Display the random forest's predicted class labels for the validate split
# (the full array is shown as the cell output).
model_rf.predict(X_validate)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0])

In [125]:
# Store the random forest's predictions for the training rows in the
# 'prediction' column of the train dataframe.
# NOTE(review): this overwrites the decision tree predictions written to the
# same column earlier in the notebook; re-running the earlier report cells
# after this one would describe the random forest, not the decision tree.
train['prediction'] = model_rf.predict(X_train)

In [126]:
# Print the text classification report for the predictions currently stored
# in the 'prediction' column (the random forest's training predictions when
# the notebook is run in order).
print(
    classification_report(
        y_true=train['survived'],
        y_pred=train['prediction'],
        zero_division=True,
    )
)

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       307
           1       0.72      0.56      0.63       191

    accuracy                           0.75       498
   macro avg       0.74      0.71      0.72       498
weighted avg       0.75      0.75      0.74       498



In [121]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.
