In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

# The code for a decision tree model is very similar to the code for a logistic regression model, which makes it easy to build and compare both models on the same dataset

# load the Titanic passenger data and derive a boolean 'Male' column from the 'Sex' column
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'] == 'male'

# features and target selection
X = df[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values


## decision tree using train/test split

# train/test-split
# 'random_state' pins the shuffle, so the same rows land in the train and test sets on every run
# and the metrics printed below are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

# instantiate model
# 'random_state' pins the tree's internal randomness (same value as the cross-validated trees further down)
model = DecisionTreeClassifier(random_state=9)

# fit
model.fit(X_train, y_train) # passing training features and targets

# predict
print(model.predict([[3, True, 22, 1, 0, 7.25]]))
print()
# the row above describes a 3rd class, male passenger, age 22, with 1 sibling/spouse, 0 parents/children, 7.25 fare
# a printed [0] means the model predicts that this passenger did not survive

# evaluate
y_pred = model.predict(X_test) # passing testing features to get predictions
print("Decision Tree - single train/test split")
print("  accuracy:", accuracy_score(y_test, y_pred)) # each metric compares the testing targets with the predictions
print("  precision:", precision_score(y_test, y_pred))
print("  recall:", recall_score(y_test, y_pred))



[0]

Decision Tree - single train/test split
  accuracy: 0.7657657657657657
  precision: 0.7073170731707317
  recall: 0.6744186046511628


In [2]:

## decision tree using k-fold cross validation for more accurate metrics

# 5 shuffled folds; the fixed random_state locks the split
kf = KFold(n_splits=5, shuffle=True, random_state=23)

# the three metrics, computed in this order for every fold
metric_fns = (accuracy_score, precision_score, recall_score)

fold_rows = [] # one [accuracy, precision, recall] row per fold
for fit_rows, held_out_rows in kf.split(X): # kf.split() yields (training indices, testing indices) for each fold
    X_train, y_train = X[fit_rows], y[fit_rows] # training part of this fold
    X_test, y_test = X[held_out_rows], y[held_out_rows] # testing part of this fold
    dt = DecisionTreeClassifier(random_state=9) # a new tree for every fold, with a fixed internal random_state
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    fold_rows.append([score(y_test, y_pred) for score in metric_fns])

# transpose the per-fold rows into one list per metric
accuracy_scores, precision_scores, recall_scores = (list(column) for column in zip(*fold_rows))

# the mean over the folds is the cross-validated value of each metric
print("Decision Tree - k-fold cross validated")
print("  accuracy:", np.mean(accuracy_scores))
print("  precision:", np.mean(precision_scores))
print("  recall:", np.mean(recall_scores))


Decision Tree - k-fold cross validated
  accuracy: 0.7587062781692376
  precision: 0.687061714459847
  recall: 0.7022506633151794


In [3]:
## comparing decision tree to logistic regression, using k-fold cross validation

def _cross_validated_scores(make_model, features, targets, splitter):
    """Collect per-fold accuracy, precision and recall for one kind of model.

    make_model: zero-argument callable returning a new, unfitted classifier.
        It is called once per fold, so no fitted state carries over between folds.
    features, targets: arrays that can be indexed with the index arrays
        yielded by splitter.split().
    splitter: cross-validation splitter object exposing split(features).

    Returns three lists (accuracy, precision, recall), each holding one score per fold.
    """
    accuracy_scores, precision_scores, recall_scores = [], [], []
    for train_index, test_index in splitter.split(features):
        classifier = make_model()
        classifier.fit(features[train_index], targets[train_index])
        predictions = classifier.predict(features[test_index])
        actual = targets[test_index]
        accuracy_scores.append(accuracy_score(actual, predictions))
        precision_scores.append(precision_score(actual, predictions))
        recall_scores.append(recall_score(actual, predictions))
    return accuracy_scores, precision_scores, recall_scores


# both models are scored through the same helper and the same 'kf' splitter, so they are compared on the same folds
# decision tree keeps the same internal random_state as above
dt_accuracy_scores, dt_precision_scores, dt_recall_scores = _cross_validated_scores(
    lambda: DecisionTreeClassifier(random_state=9), X, y, kf)
lr_accuracy_scores, lr_precision_scores, lr_recall_scores = _cross_validated_scores(
    LogisticRegression, X, y, kf)

print("Decision Tree")
print("  accuracy:", np.mean(dt_accuracy_scores)) # mean of each metric list is the cross-validated metric value
print("  precision:", np.mean(dt_precision_scores))
print("  recall:", np.mean(dt_recall_scores))
print()
print("*versus*")
print()
print("Logistic Regression")
print("  accuracy:", np.mean(lr_accuracy_scores))
print("  precision:", np.mean(lr_precision_scores))
print("  recall:", np.mean(lr_recall_scores))

# we see that the logistic regression model performs better, though we still may want to use a decision tree model for its interpretability 


Decision Tree
  accuracy: 0.7587062781692376
  precision: 0.687061714459847
  recall: 0.7022506633151794

*versus*

Logistic Regression
  accuracy: 0.7993461562876912
  precision: 0.7648070841239722
  recall: 0.7003502304147465


In [4]:
## comparing decision tree to logistic regression, using 'cross_val_score' shortcut

# For every fold, 'cross_val_score' creates the train/test split, fits a model on the training part,
# tests it on the held-out part and outputs that model's accuracy score (one score per fold).
# 'cv=kf' passes the KFold object instead of a number of folds: 'kf' was instantiated earlier with
# (n_splits=5, shuffle=True, random_state=23), so the folds match the ones used before.
dt = DecisionTreeClassifier(random_state=9) # same internal 'random_state' as above
lr = LogisticRegression()

dt_accuracy_scores = cross_val_score(dt, X, y, cv=kf)
lr_accuracy_scores = cross_val_score(lr, X, y, cv=kf)

comparison = (
    ("Decision Tree", dt_accuracy_scores),
    ("Logistic Regression", lr_accuracy_scores),
)
for position, (model_label, fold_scores) in enumerate(comparison):
    if position > 0: # separator goes between the models, not before the first one
        print()
        print("*versus*")
        print()
    print(model_label)
    print("  accuracy:", np.mean(fold_scores)) # mean of the 5 accuracy scores


Decision Tree
  accuracy: 0.7587062781692376

*versus*

Logistic Regression
  accuracy: 0.7993461562876912


In [5]:

## comparing decision tree using gini impurity to decision tree using entropy

# The quality of a split (information gain) can be measured via gini impurity or entropy.
# The default is gini impurity; it can be changed with the 'criterion=' parameter, passing 'gini' or 'entropy'.

for criteria in ['gini', 'entropy']: # the whole evaluation runs twice: first for 'gini', then for 'entropy'
    print(f"Decision Tree - {criteria}")
    accuracy, precision, recall = [], [], [] # fresh metric lists for the current criterion
    # each metric list is paired with the function that fills it
    collectors = ((accuracy, accuracy_score), (precision, precision_score), (recall, recall_score))
    for fit_rows, held_out_rows in kf.split(X): # kf.split() yields (training indices, testing indices) for each fold
        X_train, y_train = X[fit_rows], y[fit_rows] # training part of this fold
        X_test, y_test = X[held_out_rows], y[held_out_rows] # testing part of this fold
        dt = DecisionTreeClassifier(random_state=9, criterion=criteria) # new tree per fold, same random_state as above
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        for bucket, metric in collectors:
            bucket.append(metric(y_test, y_pred))
    # the mean of each metric list is the cross-validated metric value
    print("accuracy:", np.mean(accuracy))
    print("precision:", np.mean(precision))
    print("recall:", np.mean(recall), '\n') # trailing '\n' adds a line break after each model's metrics
    if criteria == 'gini':
        print("*versus*") # printed only after the gini tree's metrics


# decision tree using entropy performs better but only very slightly. Rare to find a dataset where the choice would make a big difference
  

Decision Tree - gini
accuracy: 0.7587062781692376
precision: 0.687061714459847
recall: 0.7022506633151794 

*versus*
Decision Tree - entropy
accuracy: 0.772259252205929
precision: 0.7088325436802231
recall: 0.7069822184983476 



In [6]:
## pre-pruning the decision tree to reduce overfitting. comparing potential options using GridSearchCV

# Hyperparameters compared (all are parameters of 'DecisionTreeClassifier()'):
#   'max_depth'        - maximum depth of the tree
#   'min_samples_leaf' - minimum number of samples that must end up in every leaf
#   'max_leaf_nodes'   - maximum total number of leaf nodes allowed on the tree

# Instead of looping over the options by hand as in the cells above, we use scikit-learn's built-in
# GridSearchCV, which automates the process and reports the best parameters

param_grid = {
    'max_depth': [5, 15, 25],
    'min_samples_leaf': [1, 3],
    'max_leaf_nodes': [10, 20, 35, 50]} # values to try for each hyperparameter

# 'random_state' pins the tree's internal randomness so the winning parameters are the same on every run
# (same value as the trees in the earlier cells)
dt = DecisionTreeClassifier(random_state=9)

# 'scoring' selects the f1 score as the metric that decides the best parameters,
# 'cv=5' is the number of folds for cross validation
gs = GridSearchCV(dt, param_grid, scoring='f1', cv=5)

# Grid search builds a model for every combination of parameters:
# 3 'max_depth' values * 2 'min_samples_leaf' values * 4 'max_leaf_nodes' values = 24 combinations,
# each one scored with 5-fold cross validation
gs.fit(X, y)

print("best params:", gs.best_params_) # show which model won
print("best score:", gs.best_score_) # f1 score for winning model

# With an integer 'cv' and a classifier, the folds are unshuffled stratified folds and therefore fixed;
# the run-to-run variation came from the tree's own randomness, which 'random_state' above now pins.
# Generally if we have multiple models with comparable performance, we'd choose the simpler model

# code and comments by github.com/alandavidgrunberg


best params: {'max_depth': 15, 'max_leaf_nodes': 35, 'min_samples_leaf': 1}
best score: 0.7746390877915417
