Coursera Google Advanced Data Analytics

Course 6 The Nuts and Bolts of Machine Learning

Module 4 Tree based modeling

Build and validate a random forest model using a validation data set


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# This lets us see all of the columns, preventing Jupyter from truncating them.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier

# This module lets us save our models once we fit them.
import pickle

In [2]:
# Load the churn dataset from disk and preview its first five rows
file = '/content/Churn_Modelling.csv'
df_original = pd.read_csv(filepath_or_buffer=file)
df_original.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Remove the columns flagged as useless, plus the sensitive Gender attribute
cols_to_drop = ['RowNumber', 'CustomerId', 'Surname', 'Gender']
churn_df = df_original.drop(columns=cols_to_drop)
churn_df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Dummy encode categoricals, dropping the first level of each to avoid a
# redundant indicator column.
# drop_first must be the boolean True: the string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have behaved the same).
churn_df2 = pd.get_dummies(churn_df, drop_first=True)
churn_df2.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True


In [5]:
# Predictors: every encoded column except the target
X = churn_df2.drop(columns="Exited")

# Target: the "Exited" column
y = churn_df2["Exited"]

# Hold out 25% of the rows as a test set, stratified on the target so both
# splits keep the same class balance; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [14]:
%%time

rf = RandomForestClassifier(random_state=0)

'''
cv_params = {'max_depth': [2,3,4,5, None],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'max_features': [2,3,4],
             'n_estimators': [75, 100, 125, 150]
             }
'''

cv_params = {
    'max_depth': [4, 6, None],  # Fewer options
    'min_samples_leaf': [2, 5],  # Fewer options
    'min_samples_split': [2, 4],  # Fewer options
    'max_features': [2, 4],  # Fewer options
    'n_estimators': [100, 150]  # Fewer options
}

# Define scoring as a dictionary for multiple metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# GridSearchCV with corrected scoring parameter
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=cv_params,
    scoring=scoring,
    cv=5,
    refit='f1'  # Refits the model based on the F1 score
)

# Line below is commented out because it was already run
# Results saved in `rf_cv_model.pickle` below
#rf_cv.fit(X_train, y_train)

CPU times: user 115 µs, sys: 1e+03 ns, total: 116 µs
Wall time: 121 µs


In [15]:
# Define a path to the folder where you want to save the model.
# The pickle cells build the file name by plain string concatenation
# (path + 'rf_cv_model.pickle'), so the trailing slash is required.
path = '/content/'

In [16]:
# Pickle the grid search, but only if it has actually been fit in this session.
# The fit call above is commented out by default, so dumping unconditionally
# would overwrite the previously saved, fitted model with an unfitted
# GridSearchCV. `cv_results_` only exists on a search object after fitting.
if hasattr(rf_cv, 'cv_results_'):
    with open(path + 'rf_cv_model.pickle', 'wb') as to_write:
        pickle.dump(rf_cv, to_write)
else:
    print('rf_cv has not been fit; leaving the existing rf_cv_model.pickle untouched')

In [17]:
# Restore the saved grid search object from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you created yourself.
saved_model_file = path + 'rf_cv_model.pickle'
with open(saved_model_file, 'rb') as to_read:
    rf_cv = pickle.load(to_read)

In [19]:
from sklearn.model_selection import RandomizedSearchCV

# cv_params spans only 3 * 2 * 2 * 2 * 2 = 48 combinations, so the previous
# n_iter=50 was larger than the whole grid: nothing was being limited and the
# search was as slow as the exhaustive grid search. Sample a real subset and
# run the cross-validation fits in parallel so the cell finishes in
# reasonable time.
rf_random_cv = RandomizedSearchCV(
    rf,
    cv_params,
    scoring='f1',
    cv=5,
    n_iter=20,        # evaluate 20 of the 48 possible combinations
    n_jobs=-1,        # use every available CPU core
    random_state=42,  # reproducible choice of sampled combinations
)
rf_random_cv.fit(X_train, y_train)
print(rf_random_cv.best_params_)



KeyboardInterrupt: 

In [18]:
# The search object is loaded from `rf_cv_model.pickle` above, so refitting is
# skipped here. Uncomment to refit from scratch (48 combinations x 5 folds).
#rf_cv.fit(X_train, y_train)

# Uncomment to inspect the best hyperparameter combination of a fitted rf_cv.
#rf_cv.best_params_

KeyboardInterrupt: 