In [None]:
#Import our standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#suppress warnings
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including any convergence or deprecation warnings raised by the libraries below —
# confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
#Load cleaned dataset
# The path is relative, so the CSV is resolved against the kernel's working directory.
fname = "titanic_cleaned.csv"
df = pd.read_csv(fname) 

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
#Check for multi-collinearity
# Correlation is only defined for numeric data. The frame still holds string columns
# (e.g. Sex, Embarked), and pandas >= 2.0 raises on them instead of silently dropping
# them, so restrict the frame to numeric/bool columns explicitly.
cormat = df.select_dtypes(include=['number', 'bool']).corr()
cormat.round(2)

In [None]:
#Select only the required columns and separate dependent & independent variables
cols_needed = ['Pclass', 'Sex','Age','Embarked', 'Cabin_ind','New_fare','Family_Cnt']
# X holds the predictors, y the target to be predicted
X = df[cols_needed]
y = df['Survived']

In [None]:
#Create dummies for qualitative variables
# Each listed column is replaced by one indicator column per level; the remaining
# columns pass through unchanged. X is rebound to the new frame.
X = pd.get_dummies(X, columns = ['Pclass','Sex','Embarked'])

In [None]:
# Inspect the predictor frame after dummy encoding
X.head()

In [None]:
#Sex_male already carries the gender information, so the Sex_female indicator is redundant
X = X.drop(columns=['Sex_female'])

In [None]:
# Confirm that Sex_female is gone from the predictor frame
X.head()

In [None]:
# Split the data into training and testing set
from sklearn.model_selection import train_test_split
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=428)

In [None]:
#Shapes of the four split pieces, in order: X_train, y_train, X_test, y_test
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
# Share of all rows that landed in the test split (should be close to test_size=0.2).
# Computed from the split itself so it stays correct if the data or test_size changes.
len(X_test) / (len(X_train) + len(X_test))

In [None]:
#load Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
#Fit logistic regression
# Default hyperparameters are used; the model is trained on the training split only.
# NOTE(review): warnings are globally suppressed earlier in this notebook, so a solver
# convergence warning would not be visible here — verify convergence if results look odd.
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)

In [None]:
#Check coefficients
# One fitted weight per column of X_train, in the same column order
lr_model.coef_

In [None]:
#Look at the intercept
# Fitted bias term of the linear predictor
lr_model.intercept_

In [None]:
#predict for test data
# Hard class labels for the held-out rows; reused by the evaluation cells below
predictions = lr_model.predict(X_test)

In [None]:
# Display the predicted labels for the test rows
predictions

In [None]:
#If you want probability
# One row per test sample and one column per class, ordered as in lr_model.classes_
lr_model.predict_proba(X_test)

In [None]:
#Accuracy on both splits; a large gap between the two would point to overfitting
train_accuracy = lr_model.score(X_train, y_train)
test_accuracy = lr_model.score(X_test, y_test)
print('training accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

In [None]:
# Print the confusion matrix for the logistic-regression predictions.
# scikit-learn layout: rows are actual classes, columns are predicted classes,
# so for binary labels the matrix reads [[TN, FP], [FN, TP]].

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix\n\n', cm)

In [None]:
# Wrap the confusion matrix in a labelled frame so the heatmap axes are self-explanatory

predicted_labels = ['Predicted -ve:0', 'Predicted +ve:1']
actual_labels = ['Actual -ve:0', 'Actual +ve:1']
cm_matrix = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Evaluation

In [None]:
#Accuracy
# Read the four counts from the confusion matrix instead of typing them in, so the
# figure stays correct when the data, split or model changes.
# cm is laid out as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = cm.ravel().tolist()
(tn + tp) / (tn + fp + fn + tp)

In [None]:
#Precision
#Out of those predicted as Survived, how many actually survived?
# Counts come from cm ([[TN, FP], [FN, TP]]) rather than hard-coded numbers.
tn, fp, fn, tp = cm.ravel().tolist()
print(tp / (tp + fp))

In [None]:
#Recall
#Out of all who actually survived, how many were predicted as Survived?
# Counts come from cm ([[TN, FP], [FN, TP]]) rather than hard-coded numbers.
tn, fp, fn, tp = cm.ravel().tolist()
print(tp / (tp + fn))

In [None]:
# Accuracy Score
# Fraction of test rows whose predicted label matches the true label
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

In [None]:
# Recompute and reprint the confusion matrix so it sits next to the classification report
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix\n\n', cm)

In [None]:
# Per-class precision, recall and F1 plus support for the test predictions
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

In [None]:
#Precision
# Counts come from cm ([[TN, FP], [FN, TP]]) rather than hard-coded numbers.
tn, fp, fn, tp = cm.ravel().tolist()
print(tn / (tn + fn)) #if Not survived is your relevant class
print(tp / (tp + fp)) #if survived is your relevant class

In [None]:
#Recall
# Counts come from cm ([[TN, FP], [FN, TP]]) rather than hard-coded numbers.
tn, fp, fn, tp = cm.ravel().tolist()
print(tn / (tn + fp)) #if Not survived is your relevant class
print(tp / (tp + fn)) #if survived is your relevant class

In [None]:
#Try different Threshold and check precision and recall
# Classify as Survived only when the class-1 probability exceeds 0.8 (stricter than
# the default 0.5 cut-off). A single comparison replaces the two-step in-place
# overwrite of the probability array.
survival_prob = lr_model.predict_proba(X_test)[:,1]
# Despite its name, prediction_prob holds hard 0/1 labels (as floats) from here on.
prediction_prob = (survival_prob > 0.8).astype(float)

In [None]:
# Confusion matrix for the labels produced by the custom threshold
# (prediction_prob holds 0/1 labels at this point, not probabilities)
cm = confusion_matrix(y_test, prediction_prob)
print('Confusion matrix\n\n', cm)

In [None]:
print(classification_report(y_test, prediction_prob))
#Threshold 0.8 is stricter than the default 0.5: for the Survived class, recall can
#only fall or stay equal, while precision typically goes up

In [None]:
#0.2
# Classify as Survived whenever the class-1 probability exceeds 0.2 (looser than the
# default 0.5 cut-off). A single comparison replaces the two-step in-place overwrite
# of the probability array.
survival_prob = lr_model.predict_proba(X_test)[:,1]
# Despite its name, prediction_prob holds hard 0/1 labels (as floats) from here on.
prediction_prob = (survival_prob > 0.2).astype(float)

In [None]:
# Confusion matrix for the labels produced by the custom threshold
# (prediction_prob holds 0/1 labels at this point, not probabilities)
cm = confusion_matrix(y_test, prediction_prob)
print('Confusion matrix\n\n', cm)

In [None]:
print(classification_report(y_test, prediction_prob))
#Threshold 0.2 is looser than the default 0.5: for the Survived class, recall can
#only rise or stay equal, while precision typically goes down

In [None]:
from sklearn.metrics import precision_recall_curve
# The curve needs continuous scores. `prediction_prob` has been overwritten with 0/1
# labels by the threshold cells, so take fresh class-1 probabilities from the model.
survival_scores = lr_model.predict_proba(X_test)[:, 1]
# Generate precision recall curve values: precision, recall, thresholds
precision, recall, thresholds = precision_recall_curve(y_test, survival_scores)

# Plot Precision Recall curve: recall on x and precision on y, matching the axis labels
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC/AUC are ranking metrics and need continuous scores. `prediction_prob` has been
# overwritten with 0/1 labels by the threshold cells, so take fresh class-1
# probabilities from the model.
survival_scores = lr_model.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, survival_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("AUC - ",format(roc_auc))
# Plot ROC curve
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.show()

## Decision Trees

In [None]:
#Decision Trees
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree



In [None]:
#Instantiate the object
# max_depth=3 caps the tree at three levels of splits; random_state makes the fit repeatable
ctree = DecisionTreeClassifier(random_state = 0, max_depth = 3)

In [None]:
#Train/fit the model
# Uses the same training split as the logistic regression above
ctree.fit(X_train, y_train)

In [None]:
#Get all the rules
# Pass the column names so the rules show real feature names instead of feature_0, feature_1, ...
# A plain list is used because some scikit-learn releases reject a pandas Index here.
text_rep = export_text(ctree, feature_names=list(X.columns))
print(text_rep)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later removed
# the old name; pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig = plt.figure(figsize=(5,3), dpi = 300)
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index
_ = plot_tree(ctree,
              feature_names=list(X.columns),
              impurity = True,
              class_names=['Not Survived', 'Survived'],
              filled=True, rounded = True)
plt.show()

In [None]:
#Predict for test data
# `predictions` is rebound here: from now on it holds the depth-3 tree's labels

predictions = ctree.predict(X_test)

In [None]:
#accuracy score
# Test accuracy of the depth-3 tree
print(accuracy_score(y_test, predictions))

In [None]:
# Confusion matrix of the depth-3 tree on the test split (rows actual, columns predicted)
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix\n\n', cm)

In [None]:
# Per-class precision, recall and F1 for the depth-3 tree
print(classification_report(y_test, predictions))

## Full Grown Tree

In [None]:
#Instantiate the object
# No max_depth is set, so no depth limit is imposed on the tree
fulltree = DecisionTreeClassifier(random_state = 0)

In [None]:
# Fit the unrestricted tree on the training split
fulltree.fit(X_train, y_train)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later removed
# the old name; pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig = plt.figure(figsize=(5,3), dpi = 300)
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index
_ = plot_tree(fulltree,
              feature_names=list(X.columns),
              impurity = True,
              class_names=['Not Survived', 'Survived'],
              filled=True, rounded = True)
plt.show()

In [None]:
#Accuracy on training data
# Compare with the test accuracy in the next cell; a large gap indicates overfitting

train_pred = fulltree.predict(X_train)
print(accuracy_score(y_train, train_pred))

In [None]:
#predict on test data
# Accuracy of the unrestricted tree on rows it has not seen during fitting
test_pred = fulltree.predict(X_test)
print(accuracy_score(y_test, test_pred))

## Small Tree

In [None]:
#Instantiate the object
# max_depth=2 gives a deliberately shallow tree for comparison with the full-grown one
smalltree = DecisionTreeClassifier(random_state = 0, max_depth = 2)

In [None]:
# Fit the shallow tree on the training split
smalltree.fit(X_train, y_train)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later removed
# the old name; pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig = plt.figure(figsize=(5,3), dpi = 300)
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index
_ = plot_tree(smalltree,
              feature_names=list(X.columns),
              impurity = True,
              class_names=['Not Survived', 'Survived'],
              filled=True, rounded = True)
plt.show()

In [None]:
#Accuracy on training data
# train_pred is rebound here to the shallow tree's predictions

train_pred = smalltree.predict(X_train)
print(accuracy_score(y_train, train_pred))

In [None]:
#predict on test data
# test_pred is rebound here to the shallow tree's predictions
test_pred = smalltree.predict(X_test)
print(accuracy_score(y_test, test_pred))

## Hyperparameter Tuning Using GridSearchCV

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
#A function to print gridsearch results

def print_results(results):
    """Print the winning parameter set and a per-candidate CV score summary.

    `results` must expose `best_params_` and `cv_results_` (as a fitted
    GridSearchCV does). Each candidate is reported as its mean test score and
    twice its standard deviation, both rounded to three decimals.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv = results.cv_results_
    candidates = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg_score, score_std, candidate in candidates:
        spread = round(score_std * 2, 3)
        print(f'{round(avg_score, 3)} (+/-{spread}) for {candidate}')

In [None]:
#Start With an initial guess for params
# Coarse first pass: 4 x 4 x 5 x 2 = 160 parameter combinations
param_grid = {
    'max_depth':[10, 20, 30, 40],
    'min_samples_split': [20, 40, 60, 80],
    'min_impurity_decrease':[0, 0.0005, 0.001, 0.005, 0.01],
    'criterion':['gini','entropy']
}

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation on the training split.
# n_jobs=-1 runs the fits on all available cores.
gridsearch = GridSearchCV(DecisionTreeClassifier(random_state = 0),
                          param_grid,
                          cv = 5,
                          n_jobs = -1)
gridsearch.fit(X_train, y_train)

In [None]:
# Mean CV score (+/- two standard deviations) for every candidate of the first search
print_results(gridsearch)

In [None]:
# Best cross-validated score of the first search and the parameters that achieved it.
# The second label used to read 'Initial score:' although it prints the parameters.
print('Initial score:', gridsearch.best_score_)
print('Initial Param:', gridsearch.best_params_)

In [None]:
#Adapt the hyperparameter ranges for a second, refined search
# 4 x 4 x 3 x 2 = 96 parameter combinations; this rebinds param_grid
param_grid = {
    'max_depth':[5, 8, 10, 12],
    'min_samples_split': [10, 15, 20, 40],
    'min_impurity_decrease':[0.001, 0.005, 0.01],
    'criterion':['gini','entropy']
}

In [None]:
# Re-run the 5-fold grid search with the refined grid.
# This rebinds `gridsearch`, so the results of the first search are no longer reachable.
gridsearch = GridSearchCV(DecisionTreeClassifier(random_state = 0),
                          param_grid,
                          cv = 5,
                          n_jobs = -1)
gridsearch.fit(X_train, y_train)

In [None]:
# Mean CV score (+/- two standard deviations) for every candidate of the refined search
print_results(gridsearch)

In [None]:
# Best cross-validated score of the refined search and the parameters that achieved it
print('Final score:', gridsearch.best_score_)
print('Final Param:', gridsearch.best_params_)

In [None]:
#Take the best estimator
# With GridSearchCV's default refit, this is the best parameter setting refitted on all of X_train
bestCtree = gridsearch.best_estimator_

In [None]:
# `predictions` is rebound to the tuned tree's labels for the test split
predictions = bestCtree.predict(X_test)

In [None]:
# Test accuracy of the tuned tree
print(accuracy_score(y_test, predictions))

In [None]:
# Confusion matrix of the tuned tree on the test split (rows actual, columns predicted)
cm = confusion_matrix(y_test,predictions)
print(cm)

In [None]:
# Per-class precision, recall and F1 for the tuned tree
print(classification_report(y_test, predictions))

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later removed
# the old name; pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig = plt.figure(figsize=(5,5), dpi = 300)
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index
_ = plot_tree(bestCtree,
              feature_names=list(X.columns),
              impurity = True,
              class_names=['Not Survived', 'Survived'],
              filled=True, rounded = True)
plt.show()

## Post - Pruning (Cost complexity parameter)

In [None]:
#CCP
# Cost-complexity pruning: search only over ccp_alpha, leaving every other tree
# parameter at its default. Larger alpha values prune more aggressively.
param_grid = {
    "ccp_alpha" : [0.001, 0.005, 0.01, 0.05, 0.1]
}

# Rebinds `gridsearch`; the tuned-tree search results are replaced by this one
gridsearch = GridSearchCV(DecisionTreeClassifier(random_state = 0),
                          param_grid,
                          cv = 5,
                          n_jobs = -1)

gridsearch.fit(X_train, y_train)

In [None]:
# Mean CV score (+/- two standard deviations) for every ccp_alpha candidate
print_results(gridsearch)

In [None]:
# Best cross-validated score of the ccp_alpha search and the alpha that achieved it
print('Final score:', gridsearch.best_score_)
print('Final Param:', gridsearch.best_params_)

In [None]:
# Keep the pruned tree under its own name so it can be compared with bestCtree
bestCtree_cp =gridsearch.best_estimator_

In [None]:
# `predictions` is rebound to the pruned tree's labels for the test split
predictions = bestCtree_cp.predict(X_test)

In [None]:
# Test accuracy of the pruned tree
print(accuracy_score(y_test, predictions))

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later removed
# the old name; pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig = plt.figure(figsize=(5,5), dpi = 300)
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index
_ = plot_tree(bestCtree_cp,
              feature_names=list(X.columns),
              impurity = True,
              class_names=['Not Survived', 'Survived'],
              filled=True, rounded = True)
plt.show()