In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
DF_PATH = '../data/dataframes/telecom_churn_scaled.csv'
df = pd.read_csv(DF_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,AccountWeeks,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,AvgMinPerCall,AvgDataUsagePerWeek,RoamMinsRatio,ComplaintIndex,Churn,ContractRenewal,DataPlan
0,0,0.676489,1.480204,-0.427932,1.566767,0.476643,1.990727,-0.071584,-0.085008,0.735099,0.100484,-0.356334,-0.333213,0,1,1
1,1,0.149065,2.266072,-0.427932,-0.333738,1.124503,1.56451,-0.107082,1.240482,-0.761059,0.269943,0.223148,0.602388,0,1,1
2,2,0.902529,-0.641642,-1.188218,1.168304,0.675985,-0.262133,-1.574346,0.703121,0.359875,-0.164576,-0.20361,-0.848993,0,1,0
3,3,-0.42859,-0.641642,0.332354,2.196596,-1.466936,0.042307,-2.741846,-1.303026,3.201312,-0.164576,-0.5494,-1.938814,0,0,0
4,4,-0.654629,-0.641642,1.092641,-0.24009,0.626149,-0.931902,-1.037927,-0.049184,-0.540767,-0.164576,-0.07474,0.129447,0,0,0


In [3]:
X = df.drop(columns=['Churn'])
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42)

In [4]:
# Instantiating the RandomForest Model with Parametrized Attributes
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    'n_estimators' : [10, 100, 1000],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_features' : ['sqrt', 'log2', None],
    'bootstrap' : [True, False],
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    cv=strat_k_fold,
    verbose=3,
    n_jobs=-1,
    scoring='f1'
)

# Fitting the search
grid_search.fit(X_train, y_train)

# Printing statistic
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1-Score: {grid_search.best_score_:0.3f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=10;, score=0.661 total time=   0.1s
[CV 4/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=10;, score=0.628 total time=   0.1s
[CV 3/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=10;, score=0.612 total time=   0.1s
[CV 2/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=10;, score=0.672 total time=   0.1s
[CV 5/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=10;, score=0.610 total time=   0.1s
[CV 1/5] END bootstrap=True, criterion=gini, max_features=log2, n_estimators=100;, score=0.672 total time=   1.0s
[CV 1/5] END bootstrap=True, criterion=gini, max_features=sqrt, n_estimators=100;, score=0.672 total time=   1.0s
[CV 2/5] END bootstrap=True, criterion=gini, max_features=log2, n_estimators=100;, score=0.730 total time=   0.9s
[CV 3/5] END bootstrap=True, cr

In [5]:
# Instantiating a RM clasifier based on the best output of the grid search
bootstrap = grid_search.best_estimator_.get_params()['bootstrap']
criterion =  grid_search.best_estimator_.get_params()['criterion']
max_features =  grid_search.best_estimator_.get_params()['max_features']
n_estimators =  grid_search.best_estimator_.get_params()['n_estimators']

rf_classifier = RandomForestClassifier(
    bootstrap=bootstrap,
    criterion=criterion,
    max_features= max_features,
    n_estimators=n_estimators,
    class_weight='balanced',
    random_state=42,
    verbose=1
)
rf_classifier.fit(X_train, y_train)

preds = rf_classifier.predict(X_test)
results = classification_report(y_true=y_test, y_pred=preds, output_dict=True)
results

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    4.7s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:   10.7s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   18.9s
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   23.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished


{'0': {'precision': 0.9359190556492412,
  'recall': 0.9736842105263158,
  'f1-score': 0.9544282029234737,
  'support': 570.0},
 '1': {'precision': 0.7972972972972973,
  'recall': 0.6082474226804123,
  'f1-score': 0.6900584795321637,
  'support': 97.0},
 'accuracy': 0.9205397301349325,
 'macro avg': {'precision': 0.8666081764732692,
  'recall': 0.7909658166033641,
  'f1-score': 0.8222433412278187,
  'support': 667.0},
 'weighted avg': {'precision': 0.9157596695021069,
  'recall': 0.9205397301349325,
  'f1-score': 0.9159816314557719,
  'support': 667.0}}