In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Data Reading

In [None]:
# Load the Company A customer data; the path is relative to the notebook's working directory.
company_a_csv = 'Company A - Data.csv'
dfA = pd.read_csv(company_a_csv)

### Defining a function to do some cleaning

In [None]:
def fix_data(x):
    """Return a cleaned copy of the raw customer frame.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely.

    Steps, in order:
      1. Drop the 'Unnamed: 0' column if it is present.
      2. Replace empty / whitespace-only strings with 0 in every column.
      3. Convert 'TotalCharges' to a numeric dtype.
      4. Recode the two-level columns to 1/0 ('Male'/'Yes' -> 1,
         'Female'/'No' -> 0); any other value is kept as it is.
      5. Drop every row that still contains a missing value.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw data. Must contain 'TotalCharges', 'gender', 'Partner',
        'Dependents', 'PhoneService', 'PaperlessBilling' and 'Churn'.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaning applied. The index labels of the
        surviving rows are preserved (the index is not reset).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If 'TotalCharges' holds a value that cannot be parsed as a number.
    """
    yes_no = {'Yes': 1, 'No': 0}
    binary_codes = {
        'gender': {'Male': 1, 'Female': 0},
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        'PaperlessBilling': yes_no,
        'Churn': yes_no,
    }

    # drop()/replace() without inplace return new frames, so the caller's data
    # is never modified; errors='ignore' keeps a second pass from failing.
    cleaned = x.drop(columns=['Unnamed: 0'], errors='ignore')
    cleaned = cleaned.replace(r'^\s*$', 0, regex=True)
    cleaned['TotalCharges'] = pd.to_numeric(cleaned['TotalCharges'])

    for column, codes in binary_codes.items():
        # Assign the recoded column back explicitly: an in-place replace on
        # `frame.column` is a chained assignment and is not guaranteed to
        # update the frame. Values outside the mapping pass through unchanged.
        cleaned[column] = cleaned[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

    return cleaned.dropna(axis=0)

We apply the cleaning function defined above to our data; it mostly replaces binary categories with 1s and 0s for ease of modelling

In the next step we drop TotalCharges, as it is directly correlated with tenure × MonthlyCharges, and customerID, as it is only an identifier and adds no predictive information

We also divide our data into X and y datasets for the modelling

In [None]:
# Clean the raw frame, then separate the churn label from the predictors.
dfA = fix_data(dfA)

non_feature_columns = ['Churn', 'customerID', 'TotalCharges']
y = dfA['Churn']
X = dfA.drop(columns=non_feature_columns)

### One Hot encoding the rest of our category variables

In [None]:
# Multi-level categorical columns that are still stored as text.
categorical_columns = ['InternetService', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'Contract', 'PaymentMethod', 'MultipleLines']

# drop='first' omits the first category of each feature from the dummies.
ohe = OneHotEncoder(categories='auto', drop='first')
feature_arr = ohe.fit_transform(X[categorical_columns]).toarray()

# Reuse X's index so that the column-wise concat below pairs every dummy row
# with the row it was computed from (a default RangeIndex would misalign as
# soon as earlier cleaning dropped rows), and use the encoder's string
# feature names so the frame does not end up with mixed str/int column labels.
features = pd.DataFrame(
    feature_arr,
    index=X.index,
    columns=ohe.get_feature_names_out(categorical_columns),
)

X = pd.concat([X.drop(columns=categorical_columns), features], axis=1)

# Keep the features and the target on exactly the same rows.
X = X.dropna(axis=0)
y = y.loc[X.index]

## Modelling

As we will perform a grid search for the best hyperparameters, cross-validation is carried out automatically within the training data

We also stratify the split, as the numbers of 1s and 0s in the target are unbalanced

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): `y` is rebound to the *training* labels here and later cells
# rely on that, so re-running this cell without re-creating X and y first
# will not work.
X_train, X_test, y, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

We scale the data for better modelling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

We import the necessary libraries to perform the GridSearch and RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

We create our 5 folds for the validation

In [None]:
# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if a seed is given without shuffling.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Displays the number of folds as the cell output.
cv.get_n_splits(train_scaled, y)

We perform the grid search to find the best hyperparameters

In [None]:
# Seed the forest so that repeated searches rank the candidates identically.
model = RandomForestClassifier(random_state=42)

# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' and it has been removed from scikit-learn, where it now
# makes every fit that uses it fail.
parameters = {'n_estimators': [80, 85, 90, 100, 105, 110],
              'criterion': ['gini'],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [1, 2, 4, 5, 6, 7],
              'max_features': ['sqrt', 'log2']
             }

# Exhaustive search over the grid, scored by accuracy on the folds in `cv`;
# n_jobs=-1 uses all available cores.
search = GridSearchCV(model, param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=-1, verbose=5)
result = search.fit(train_scaled, y)

In [None]:
# Report the best mean cross-validated score and the parameter set behind it.
best_score = result.best_score_
best_params = result.best_params_
print(f'Best Score: {best_score!s}')
print(f'Best Hyperparameters: {best_params!s}')

Best hyperparameters for our case:

Best Score: 0.8040957410837188
Best Hyperparameters: {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 100}

Next, we fit our model with the hyperparameters found previously

In [None]:
# Final model, built with the hyperparameters reported by the grid search.
# random_state is fixed so that refitting reproduces the same forest, and
# therefore the same exported model.
RFC = RandomForestClassifier(criterion='gini',
                             max_features='sqrt',
                             min_samples_leaf=4,
                             min_samples_split=4,
                             n_estimators=100,
                             random_state=42)

RFC.fit(train_scaled, y)

We predict on the test data and print metrics to see how well our model performed

In [None]:
# These metric functions are not imported anywhere else in the notebook;
# without this import the cell raises NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Score the fitted model on the held-out, scaled test split.
predictions = RFC.predict(test_scaled)
print('Accuracy: ', accuracy_score(y_test, predictions))
print('F1 Score: ', f1_score(y_test, predictions))
print('\n')
print('Confusion Matrix: \n', confusion_matrix(y_test, predictions))

### Model Export

Finally we save our model into a pickle file to use it on the dashboard

In [None]:
import pickle

filename = 'daip5_V2.pkl'

# NOTE(review): RFC was fitted on StandardScaler-transformed features and only
# the classifier is saved here; whatever loads this file must apply the same
# fitted scaler to its inputs - confirm the scaler is persisted somewhere.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(RFC, model_file)