In [4]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score 

import os
import time

In [6]:
# Load the training data from the Excel workbook and preview the first rows
df1 = pd.read_excel(io='train1.xlsx')
df1.head(5)

Unnamed: 0,CustomerID,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,DroppedCalls,...,RVOwner_50-100,RVOwner_100-200,RVOwner_200-300,RVOwner_300-400,RVOwner_400+,RVOwner_11-20,RVOwner_21-30,RVOwner_31-40,RVOwner_40+,RVOwner_Top 10
0,3000002,24.0,219,22,0.25,0,0.0,-157,-19.0,0.7,...,0,0,0,0,0,0,0,0,1,0
1,3000010,16.99,10,17,0.0,0,0.0,-4,0.0,0.3,...,0,0,0,0,0,0,0,0,1,0
2,3000014,38.0,8,38,0.0,0,0.0,-2,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3000022,82.28,1312,75,1.24,0,0.0,157,8.1,52.0,...,0,0,0,0,0,0,0,0,1,0
4,3000026,17.14,0,17,0.0,0,0.0,0,-0.2,0.0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# Drop the CustomerID column from the working frame.
# The result is reassigned (rather than mutated with inplace=True) and
# errors='ignore' is passed so the cell is idempotent: re-running it after
# CustomerID has already been removed no longer raises a KeyError.
df1 = df1.drop(columns=['CustomerID'], errors='ignore')
df1.head()

Unnamed: 0,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,DroppedCalls,BlockedCalls,...,RVOwner_50-100,RVOwner_100-200,RVOwner_200-300,RVOwner_300-400,RVOwner_400+,RVOwner_11-20,RVOwner_21-30,RVOwner_31-40,RVOwner_40+,RVOwner_Top 10
0,24.0,219,22,0.25,0,0.0,-157,-19.0,0.7,0.7,...,0,0,0,0,0,0,0,0,1,0
1,16.99,10,17,0.0,0,0.0,-4,0.0,0.3,0.0,...,0,0,0,0,0,0,0,0,1,0
2,38.0,8,38,0.0,0,0.0,-2,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,82.28,1312,75,1.24,0,0.0,157,8.1,52.0,7.7,...,0,0,0,0,0,0,0,0,1,0
4,17.14,0,17,0.0,0,0.0,0,-0.2,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
# Train / test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve


# Target column (dependent variable) and the remaining predictor columns
Y = df1['Churn']
X = df1.drop(columns='Churn')


# Hold out 30% of the rows for testing; the split is seeded and stratified
# on Y so both partitions keep the same Churn class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=1,
    stratify=Y,
)

 # Model Building

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Hyperparameter search for a random-forest classifier, tracked with MLflow.

# Seed the forest so its internal randomness is repeatable between runs;
# without it the CV scores, the selected n_estimators and the logged model
# change on every execution. The value matches the seed used for the
# train/test split.
rf_model = RandomForestClassifier(random_state=1)

# Candidate values for each hyperparameter being tuned
param_grid = {
    'n_estimators': [100, 200, 300, 400],
}


# Exhaustive search over param_grid, scored on recall with 3-fold CV
# NOTE(review): the recorded run of this cell shows per-fold recall of only
# about 0.05-0.07 for every candidate; tuning n_estimators alone does not
# move it. Worth checking class balance / class_weight -- not changed here.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    verbose=3,
)


# Fit the grid search on the training data inside an MLflow run
with mlflow.start_run() as run:
    grid_search.fit(x_train, y_train)

    # Log the winning parameters, its mean cross-validated recall
    # (best_score_), and the refitted best estimator as a model artifact
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("recall", grid_search.best_score_)
    mlflow.sklearn.log_model(grid_search.best_estimator_, "best_model")

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ..................n_estimators=100;, score=0.066 total time=   4.9s
[CV 2/3] END ..................n_estimators=100;, score=0.065 total time=   4.9s
[CV 3/3] END ..................n_estimators=100;, score=0.063 total time=   4.8s
[CV 1/3] END ..................n_estimators=200;, score=0.059 total time=   9.7s
[CV 2/3] END ..................n_estimators=200;, score=0.058 total time=   9.6s
[CV 3/3] END ..................n_estimators=200;, score=0.054 total time=   9.6s
[CV 1/3] END ..................n_estimators=300;, score=0.057 total time=  14.5s
[CV 2/3] END ..................n_estimators=300;, score=0.053 total time=  14.3s
[CV 3/3] END ..................n_estimators=300;, score=0.055 total time=  14.1s
[CV 1/3] END ..................n_estimators=400;, score=0.060 total time=  19.1s
[CV 2/3] END ..................n_estimators=400;, score=0.052 total time=  18.9s
[CV 3/3] END ..................n_estimators=400;,

