In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import acquire
import prepare


from pydataset import data

In [4]:
df = acquire.new_titanic_data()

In [5]:
train, validate, test = prepare.prep_titanic(df)

In [6]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [7]:
# Baseline determination - 0 or deaths
train.survived.value_counts()
# Baseline accuracy = 61.6% (307/491)

0    307
1    190
Name: survived, dtype: int64

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [8]:
# set baseline
X_train['baseline'] = 0

In [9]:
X_train.head()

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,baseline
583,583,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,0
337,337,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,0
50,50,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,0
218,218,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,0
31,31,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,0


In [10]:
X_cols = ['pclass', 'fare']
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [42]:
model1 = DecisionTreeClassifier(max_depth=5, random_state=123)
model1 = model1.fit(X_train, y_train)

training score: 72.84%
validate score: 72.43%


In [43]:
m1_pred = model1.predict(X_train)

In [50]:
# create a new column on the train dataframe that contains the models predictions
train['prediction'] = m1_pred

Unnamed: 0,pclass,fare
583,1,40.1250
337,1,134.5000
50,3,39.6875
218,1,76.2917
31,1,146.5208
...,...,...
313,3,7.8958
636,3,7.9250
222,3,8.0500
485,3,25.4667


In [46]:
print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

training score: 72.84%
validate score: 72.43%


In [45]:
# use the column you just created and the actual values in the survived column
# to generate a classification report
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.71      0.95      0.81       307
           1       0.82      0.37      0.51       190

    accuracy                           0.73       497
   macro avg       0.76      0.66      0.66       497
weighted avg       0.75      0.73      0.70       497



In [15]:
pd.DataFrame(classification_report(train.survived, train.prediction, output_dict=True)).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.804813,0.980456,0.883994,307.0
1,0.95122,0.615789,0.747604,190.0
accuracy,0.841046,0.841046,0.841046,0.841046
macro avg,0.878016,0.798123,0.815799,497.0
weighted avg,0.860783,0.841046,0.831853,497.0


## Random Forest Exercises ##

In [38]:
# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
model_rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=10,
                            max_depth=10, 
                            random_state=123)
model_rf.fit(X_train, y_train)

print(f'training score: {model_rf.score(X_train, y_train):.2%}')
print(f'validate score: {model_rf.score(X_validate, y_validate):.2%}')

training score: 82.90%
validate score: 72.43%


In [31]:
# Evaluate your results usIng the model score, confusion matrix, and classification report.

# Model score
print(f'training score: {model_rf.score(X_train, y_train):.2%}')
print(f'validate score: {model_rf.score(X_validate, y_validate):.2%}')

training score: 82.90%
validate score: 72.43%


In [32]:
model_rf.predict(X_train)

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [33]:
# create a new column on the train dataframe that contains the models predictions
train['prediction'] = model_rf.predict(X_train)

In [37]:
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       307
           1       0.84      0.68      0.75       190

    accuracy                           0.83       497
   macro avg       0.83      0.80      0.81       497
weighted avg       0.83      0.83      0.82       497



In [121]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.
