In [16]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

import acquire
from env import host, user, password 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import graphviz
from graphviz import Graph

### Using titanic data, do the following:

In [17]:
# Load the titanic dataset via the project-local acquire module
# and preview the first rows.
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [18]:
# Clean up the raw data in four steps.

# 1. drop duplicate rows
df = df.drop_duplicates()

# 2. drop the columns this analysis does not use
#    (original note: duplicated information and columns with too many nulls)
df = df.drop(columns=['deck', 'embarked', 'class', 'age'])

# 3. fill missing embark_town with the most common value, "Southampton"
df['embark_town'] = df['embark_town'].fillna('Southampton')

# 4. encode the remaining string columns (first level of each dropped)
#    and append the encoded columns to the frame
dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
df = pd.concat([df, dummy_df], axis=1)

# look at the reconstructed df
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [19]:
#create function to perform these steps when we need to produce this dataset
def clean_titanic(df=None):
    '''
    Clean the raw titanic data and return a single cleaned dataframe.

    Steps:
    drop any duplicate observations,
    drop columns not needed ('deck', 'embarked', 'class', 'age'),
    fill missing embark_town with 'Southampton',
    create dummy vars of sex and embark_town (encoding, first level dropped)
    and append them to the frame.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw titanic data. When omitted, a fresh copy is loaded with
        acquire.get_titanic_data().

    Returns
    -------
    pandas.DataFrame
        A new dataframe; the input frame is never modified, so the function
        can be called repeatedly with the same input.
    '''
    if df is None:
        # Load fresh data instead of relying on whatever a notebook-level
        # `df` currently holds (it may already have had these columns dropped).
        df = acquire.get_titanic_data()

    cleaned = (
        df.drop_duplicates()
          .drop(columns=['deck', 'embarked', 'class', 'age'])
          .assign(embark_town=lambda frame: frame['embark_town'].fillna('Southampton'))
    )
    dummy_df = pd.get_dummies(cleaned[['sex', 'embark_town']], drop_first=True)
    return pd.concat([cleaned, dummy_df], axis=1)

In [35]:
# Further prep: drop the original string columns sex and embark_town.
# Their encoded versions (sex_male, embark_town_*) are already in the frame,
# and the decision tree needs encoded features.
df = df.drop(columns=['sex', 'embark_town'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,1,0,71.2833,0,0,0,0
2,2,1,3,0,0,7.925,1,0,0,1
3,3,1,1,1,0,53.1,0,0,0,1
4,4,0,3,0,0,8.05,1,1,0,1


In [36]:
# Split the data in two passes, stratifying on the target both times:
# first hold out 20% as test, then hold out 30% of the remainder as validate.
train_validate, test = train_test_split(
    df, test_size=.2, random_state=123, stratify=df.survived
)
train, validate = train_test_split(
    train_validate, test_size=.3, random_state=123, stratify=train_validate.survived
)


In [37]:
#prep data 
def prep_titanic_data():
    '''
    Clean the titanic data and split it into train, validate and test samples.

    The cleaned frame comes from clean_titanic(). The raw string columns
    'sex' and 'embark_town' are then dropped (their encoded versions remain)
    because the decision tree needs encoded features. The data is split
    20% test, then 30% of the remainder as validate, stratified on
    'survived' with random_state=123 both times.

    No imputation step is needed: 'age', the column with missing values,
    is dropped during cleaning and embark_town is filled there too.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, validate, test)
    '''
    df = clean_titanic().drop(columns=['sex', 'embark_town'])
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
    train, validate = train_test_split(train_validate,
                                       test_size=.3,
                                       random_state=123,
                                       stratify=train_validate.survived)
    return train, validate, test

In [38]:
# Report the (rows, columns) shape of each split, then preview train.
for split_name, split in (('train', train), ('validate', validate), ('test', test)):
    print(f'{split_name} -> {split.shape}')

train.head()

train -> (498, 10)
validate -> (214, 10)
test -> (179, 10)


Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,0,0,40.125,1,1,0,0
165,165,1,3,0,2,20.525,0,1,0,1
50,50,0,3,4,1,39.6875,0,1,0,1
259,259,1,2,0,1,26.0,0,0,0,1
306,306,1,1,0,0,110.8833,1,0,0,0


In [39]:
# Separate each split into features (X) and the target (y = survived).
# NOTE(review): passenger_id is still among the features here; it is a row
# identifier rather than a passenger attribute -- consider dropping it.
X_train, y_train = train.drop(columns=['survived']), train.survived

X_validate, y_validate = validate.drop(columns=['survived']), validate.survived

X_test, y_test = test.drop(columns=['survived']), test.survived

### 1. What is your baseline prediction? What is your baseline accuracy? *Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.*

In [40]:
#baseline prediction:
#predict the most prevalent class in the training dataset (the mode),
#so start by counting each class of the target
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [41]:
#baseline model: predict the most prevalent class (0 here) for every row
#NOTE: this adds a column to train. X_train was built before this cell so it
#is unaffected, but re-running the X/y setup cell afterwards would pick
#baseline_prediction up as a feature.
train["baseline_prediction"] = train.survived.value_counts().idxmax()
train.head()

#equivalently: train['baseline_prediction'] = 0

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
583,583,0,1,0,0,40.125,1,1,0,0,0
165,165,1,3,0,2,20.525,0,1,0,1,0
50,50,0,3,4,1,39.6875,0,1,0,1,0
259,259,1,2,0,1,26.0,0,0,0,1,0
306,306,1,1,0,0,110.8833,1,0,0,0,0


In [42]:
#baseline accuracy:
#share of training rows where the constant baseline prediction matches the actual class
baseline_accuracy = train.survived.eq(train.baseline_prediction).mean()

print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 61.65%


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [115]:
# Create a new, unfitted decision tree limited to max_depth=3;
# random_state is fixed so the fitted tree is reproducible.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [116]:
# Fit (train) the decision tree classifier on the training sample.
# format: model.fit(X, y)

clf = clf.fit(X_train, y_train)
# show the target labels the fitted model knows about
clf.classes_

array([0, 1])

In [50]:
# Visualize the fitted depth-3 tree: export it as DOT text, wrap it in a
# graphviz Source and render it to a PDF (view=True also opens the file).
dot_data = export_graphviz(
    clf,
    feature_names= X_train.columns,
    rounded=True,
    filled=True,
    out_file=None,
    class_names=['Did not survive', 'Survived'],
)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [117]:
# Make predictions on the training sample with the fitted model
# and peek at the first five.
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [56]:
# Estimate the probability of each class for each observation in train.
# Each row holds one probability per class, in the order of clf.classes_.
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.



In [60]:
# Score the first model on the sample it was trained on (in-sample accuracy).
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.82


In [62]:
# confusion matrix for the training sample
# (rows = actual class, columns = predicted class)

confusion_matrix(y_train, y_pred)

array([[274,  33],
       [ 56, 135]])

In [63]:
# actual class counts in train; each equals the matching row total
# of the confusion matrix
y_train.value_counts()


0    307
1    191
Name: survived, dtype: int64

In [64]:
# Same confusion matrix as a labelled frame: index = actual class,
# columns = predicted class.
labels = sorted(y_train.unique())
train_confusion = confusion_matrix(y_train, y_pred)

pd.DataFrame(train_confusion, index=labels, columns=labels)

Unnamed: 0,0,1
0,274,33
1,56,135


In [121]:
#classification report: per-class precision, recall, f1-score and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       307
           1       0.80      0.71      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



In [123]:
# Easier-to-read classification report: request it as a dict, load it into
# a dataframe and transpose so each class/average is a row.
class_report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.830303,0.892508,0.860283,307.0
1,0.803571,0.706806,0.752089,191.0
accuracy,0.821285,0.821285,0.821285,0.821285
macro avg,0.816937,0.799657,0.806186,498.0
weighted avg,0.820051,0.821285,0.818787,498.0


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [79]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support 

In [73]:
# Unpack the 2x2 confusion matrix into its four cells. With labels [0, 1]
# the matrix is [[tn, fp], [fn, tp]], so ravel() yields them in this order.
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

In [68]:
#accuracy: (tp + tn) / (tp + tn + fp + fn)
accuracy_score(y_train, y_pred)

0.821285140562249

In [74]:
#true positive rate (sensitivity aka recall): tp / (tp + fn)
tp / (tp + fn)

0.7068062827225131

In [75]:
#false positive rate: fp / (fp + tn)
fp / (fp + tn)

0.10749185667752444

In [76]:
#true negative rate (specificity): tn / (tn + fp)
#the denominator must be parenthesized; `tn / tn + fp` evaluates to 1 + fp
tn / (tn + fp)

0.8925081433224755

In [77]:
#false negative rate (miss rate): fn / (fn + tp)
#the denominator is all actual positives (fn + tp), so fnr == 1 - recall
fn / (fn + tp)

0.2931937172774869

In [69]:
#precision (positive predictive value): tp / (tp + fp)
precision_score(y_train, y_pred)

0.8035714285714286

In [70]:
#recall (aka sensitivity): tp / (tp + fn)
recall_score(y_train, y_pred)

0.7068062827225131

In [72]:
#f1-score: 2 * precision * recall / (precision + recall)
f1_score(y_train, y_pred)

0.7520891364902508

In [81]:
#support (number of occurrences of each class in y_true);
#taken as the last element of what precision_recall_fscore_support returns
precision_recall_fscore_support(y_train, y_pred)[-1]

array([307, 191])

### 5. Run through steps 2-4 using a different `max_depth` value.



In [96]:
# Second model: a new, unfitted decision tree with max_depth=4
# (same random_state as the first model).
clf2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [97]:
# Fit (train) the second decision tree on the same training sample.
# format: model.fit(X, y)

clf2 = clf2.fit(X_train, y_train)
# show the target labels the fitted model knows about
clf2.classes_

array([0, 1])

In [98]:
# Visualize the depth-4 tree. It is rendered to its own file name so the
# PDF of the depth-3 tree ('titanic_decision_tree.pdf') is not overwritten.
dot_data = export_graphviz(clf2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None, class_names=['Did not survive', 'Survived'])
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree_depth4', view=True, format="pdf")

'titanic_decision_tree_depth4.pdf'

In [99]:
# Make predictions on the training sample with the second model.
# NOTE: this reassigns y_pred, which until now held the first model's predictions.
y_pred = clf2.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [100]:
# Estimate the probability of each class for each observation in train
# using the second model; columns are in the order of clf2.classes_.
# NOTE: this reassigns y_pred_proba.
y_pred_proba = clf2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.63636364, 0.36363636],
       [0.63636364, 0.36363636],
       [1.        , 0.        ],
       [0.11538462, 0.88461538],
       [0.        , 1.        ]])

In [101]:
# Score the second model on the sample it was trained on (in-sample accuracy).
train_accuracy_depth4 = clf2.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set with max depth of 4: {:.2f}'.format(train_accuracy_depth4))

Accuracy of Decision Tree classifier on training set with max depth of 4: 0.84


In [102]:
# confusion matrix for the second model on the training sample
# (rows = actual class, columns = predicted class)

confusion_matrix(y_train, y_pred)

array([[292,  15],
       [ 66, 125]])

In [103]:
#classification report for the second model on the training sample
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88       307
           1       0.89      0.65      0.76       191

    accuracy                           0.84       498
   macro avg       0.85      0.80      0.82       498
weighted avg       0.85      0.84      0.83       498



In [105]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

#accuracy
accuracy = accuracy_score(y_train, y_pred)

#true positive rate
tpr = tp / (tp + fn)

#false positive rate 
fpr = fp / (fp + tn)

#true negative rate (specificity)
tnr = tn / tn + fp

#false negative rate (miss rate)
fnr = fn / (fn + tn)

#precision (positive predictive value)
precision = precision_score(y_train, y_pred)

#recall (sensitivity)
recall = recall_score(y_train, y_pred)

#f1-score
f1 = f1_score(y_train, y_pred)

#support (number of occurrences of each class in y_true)
support = precision_recall_fscore_support(y_train, y_pred)[-1]


print(f'''
Accuracy: {accuracy}
true positive rate: {tpr}
false positive rate: {fpr}
true negative rate: {tnr}
false negative rate:{fnr}
precision: {precision}
recall: {recall}
f1-score: {f1}
support: {support}
''')


Accuracy: 0.8373493975903614
true positive rate: 0.6544502617801047
false positive rate: 0.048859934853420196
true negative rate: 16.0
false negative rate:0.18435754189944134
precision: 0.8928571428571429
recall: 0.6544502617801047
f1-score: 0.7552870090634441
support: [307 191]



### 6. Which model performs better on your in-sample data?

- Model 2 (`max_depth=4`): 84% accuracy on the training set versus 82% for model 1 (`max_depth=3`).

### 7. Which model performs best on your out-of-sample data, the `validate` set?

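- Model 1 (`max_depth=3`): 78% accuracy on the validate set versus 76% for model 2. Model 2's larger drop from train to validate (84% to 76%) suggests the deeper tree is overfitting.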

In [106]:
# Score the first model (clf) on out-of-sample data: the validate set.
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(validate_accuracy))

Accuracy of Decision Tree classifier on validate set: 0.78


In [107]:
# Use the first model (clf), trained on train, to make predictions on validate.
# NOTE: y_pred is reassigned and from here on holds validate predictions.
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [108]:
# first three actual validate labels, to compare with the predictions
y_validate.head(3)


610    0
424    0
568    0
Name: survived, dtype: int64

In [109]:
# Compare actual y values from validate to the first model's predictions on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.86      0.83       132
           1       0.74      0.63      0.68        82

    accuracy                           0.78       214
   macro avg       0.77      0.75      0.76       214
weighted avg       0.77      0.78      0.77       214



In [110]:
# Score the second model (clf2) on out-of-sample data: the validate set.
validate_accuracy_depth4 = clf2.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(validate_accuracy_depth4))

Accuracy of Decision Tree classifier on validate set: 0.76


In [111]:
# Use the second model (clf2), trained on train, to make predictions on validate.
# NOTE: y_pred is reassigned and now holds the second model's validate predictions.
y_pred = clf2.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [112]:
# first three actual validate labels, to compare with the predictions
y_validate.head(3)


610    0
424    0
568    0
Name: survived, dtype: int64

In [113]:
# Compare actual y values from validate to the second model's predictions on X_validate
print(classification_report(y_validate, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.88      0.82       132
           1       0.75      0.57      0.65        82

    accuracy                           0.76       214
   macro avg       0.76      0.73      0.73       214
weighted avg       0.76      0.76      0.75       214

