In [24]:
import pandas as pd
import numpy as np

from ml_from_scratch.ensemble import RandomForestClassifier

from ml_from_scratch.cross_validation import GridSearchCV
from ml_from_scratch.metrics import precision_score

from sklearn.model_selection import train_test_split

In [2]:
# Widen the column display so long cell values (such as parameter dicts in
# result tables) are not truncated. The fully qualified option name is used
# because abbreviated names rely on pattern matching, which can become
# ambiguous if pandas adds similarly named options.
pd.set_option("display.max_colwidth", 400)

In [3]:
# Read the churn dataset from its CSV file (comma is read_csv's default
# delimiter) and preview the first rows.
filename = "data/churn.csv"
data = pd.read_csv(filename)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Remove the columns that are not used as model features.
data = data.drop(["RowNumber", "CustomerId", "Surname", "Geography"], axis="columns")

# Encode Gender as a binary flag: 1 for "Female", 0 for every other value.
data["Gender"] = data["Gender"].eq("Female").astype("int64")

data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,1,42,2,0.0,1,1,1,101348.88,1
1,608,1,41,1,83807.86,1,0,1,112542.58,0
2,502,1,42,8,159660.8,3,1,0,113931.57,1
3,699,1,39,1,0.0,2,0,0,93826.63,0
4,850,1,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Class balance of the target: normalize=True returns each class's share of
# the rows instead of raw counts.
data["Exited"].value_counts(normalize=True)

Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64

In [6]:
# Separate the label column (y) from the remaining feature columns (X).
target_columns = "Exited"
y = data[target_columns]
X = data.drop(columns=target_columns)

In [7]:
# Hold out 25% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions. random_state pins the shuffle so the
# split (and every score computed from it) is reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [13]:
# Hyperparameter grid handed to the grid search; keys are the estimator's
# keyword arguments and values are the candidates to try for each.
grid_params = dict(
    n_estimators=[10, 50, 200, 500, 800, 1000],
    max_features=["sqrt", "log2"],
    criterion=["gini", "entropy"],
)

In [14]:
# Grid search over grid_params scored by accuracy with cv=10.
# The estimator is passed as the class itself, not as an instance.
clf = GridSearchCV(
    estimator=RandomForestClassifier,
    param_grid=grid_params,
    scoring="accuracy",
    cv=10,
)

In [15]:
# Run the accuracy grid search on the training split. The rendered output is a
# table with one row per parameter combination and the columns parameter,
# score_method, cv_score_train and cv_score_valid.
clf.fit(X=X_train, y=y_train)

Unnamed: 0,parameter,score_method,cv_score_train,cv_score_valid
0,"{'n_estimators': 10, 'max_features': 'sqrt', 'criterion': 'gini'}",accuracy,0.873793,0.807733
1,"{'n_estimators': 10, 'max_features': 'sqrt', 'criterion': 'entropy'}",accuracy,0.875719,0.8084
2,"{'n_estimators': 10, 'max_features': 'log2', 'criterion': 'gini'}",accuracy,0.873793,0.807733
3,"{'n_estimators': 10, 'max_features': 'log2', 'criterion': 'entropy'}",accuracy,0.875719,0.8084
4,"{'n_estimators': 50, 'max_features': 'sqrt', 'criterion': 'gini'}",accuracy,0.873985,0.806533
5,"{'n_estimators': 50, 'max_features': 'sqrt', 'criterion': 'entropy'}",accuracy,0.87723,0.805733
6,"{'n_estimators': 50, 'max_features': 'log2', 'criterion': 'gini'}",accuracy,0.873985,0.806533
7,"{'n_estimators': 50, 'max_features': 'log2', 'criterion': 'entropy'}",accuracy,0.87723,0.805733
8,"{'n_estimators': 200, 'max_features': 'sqrt', 'criterion': 'gini'}",accuracy,0.858296,0.804
9,"{'n_estimators': 200, 'max_features': 'sqrt', 'criterion': 'entropy'}",accuracy,0.860785,0.804133


In [32]:
# Validation score of grid-search entry 11.
# NOTE(review): the index is hard-coded. The score table displayed after the
# fit shows a higher cv_score_valid at index 1 (0.8084) than the value at this
# entry, so confirm why entry 11 was chosen rather than the best-scoring one.
clf.cv_results[11]["cv_score_valid"]

0.8041333333333334

In [37]:
# Parameter combination stored for grid-search entry 11.
# NOTE(review): this reads the underscore-prefixed attribute `_parameters`;
# confirm whether GridSearchCV offers a public way to get the same information.
clf._parameters[11]

{'n_estimators': 200, 'max_features': 'log2', 'criterion': 'entropy'}

In [17]:
# Same grid and fold count as the accuracy search, but scored by precision.
cv_precision = GridSearchCV(
    estimator=RandomForestClassifier,
    param_grid=grid_params,
    scoring="precision",
    cv=10,
)

In [18]:
# Run the precision grid search on the training split. The rendered output is
# the per-combination table of train and validation precision.
cv_precision.fit(X_train, y_train)

Unnamed: 0,parameter,score_method,cv_score_train,cv_score_valid
0,"{'n_estimators': 10, 'max_features': 'sqrt', 'criterion': 'gini'}",precision,0.999241,0.825012
1,"{'n_estimators': 10, 'max_features': 'sqrt', 'criterion': 'entropy'}",precision,0.998888,0.827338
2,"{'n_estimators': 10, 'max_features': 'log2', 'criterion': 'gini'}",precision,0.999241,0.825012
3,"{'n_estimators': 10, 'max_features': 'log2', 'criterion': 'entropy'}",precision,0.998888,0.827338
4,"{'n_estimators': 50, 'max_features': 'sqrt', 'criterion': 'gini'}",precision,1.0,0.974603
5,"{'n_estimators': 50, 'max_features': 'sqrt', 'criterion': 'entropy'}",precision,1.0,0.972222
6,"{'n_estimators': 50, 'max_features': 'log2', 'criterion': 'gini'}",precision,1.0,0.974603
7,"{'n_estimators': 50, 'max_features': 'log2', 'criterion': 'entropy'}",precision,1.0,0.972222
8,"{'n_estimators': 200, 'max_features': 'sqrt', 'criterion': 'gini'}",precision,1.0,1.0
9,"{'n_estimators': 200, 'max_features': 'sqrt', 'criterion': 'entropy'}",precision,1.0,1.0


In [34]:
# Parameter combination stored for entry 6 of the precision search.
# NOTE(review): the index is hard-coded and reads the underscore-prefixed
# attribute `_parameters`. The displayed score table shows rows 8 and 9 with a
# higher cv_score_valid (1.0) than row 6, so confirm the selection rationale.
cv_precision._parameters[6]

{'n_estimators': 50, 'max_features': 'log2', 'criterion': 'gini'}

In [36]:
# Validation precision recorded for entry 6 of the precision search.
cv_precision.cv_results[6]["cv_score_valid"]

0.9746031746031747

In [27]:
# Refit a forest with the hyperparameters of entry 6 from the precision search
# (n_estimators=50, max_features="log2", criterion="gini"), then measure
# precision on the held-out test split.
rforest = RandomForestClassifier(
    n_estimators=50,
    max_features="log2",
    criterion="gini",
)
rforest.fit(X_train, y_train)

y_pred = rforest.predict(X_test)
precision_score(y_test, y_pred)

0.9655172413793104

In [28]:
# Print the test-set precision of rforest's predictions with a text label
# (recomputes the same metric from y_test and y_pred).
print("Precision Score: ", precision_score(y_test, y_pred))

Precision Score:  0.9655172413793104


In [39]:
# Refit a forest with the hyperparameters of entry 11 from the accuracy search
# (n_estimators=200, max_features="log2", criterion="entropy") and score its
# test-set predictions.
# NOTE(review): this model was picked from the accuracy search, but the metric
# computed below is precision; confirm that this is the intended comparison.
rforest_acc = RandomForestClassifier(
    n_estimators=200,
    max_features="log2",
    criterion="entropy",
)
rforest_acc.fit(X_train, y_train)

y_pred_acc = rforest_acc.predict(X_test)
precision_score(y_test, y_pred_acc)

1.0