In [3]:
import warnings 
warnings.filterwarnings('ignore')

# Data handling
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# EDA (pandas-profiling, etc. )
from statsmodels.genmod.families import Binomial
from scipy.stats import pointbiserialr
import scipy.stats as stats

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_score

# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix

# Other packages
import os, pickle

# Feature Processing & Engineering
Here is the section to **clean**, **process** the dataset and **create new features**.

In [4]:
Data_All = pd.read_csv(r'Dataset\Train_Data.csv')
Data_All.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5034 entries, 0 to 5033
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5034 non-null   object 
 1   SeniorCitizen     5034 non-null   int64  
 2   Partner           5034 non-null   object 
 3   Dependents        5034 non-null   object 
 4   tenure            5034 non-null   int64  
 5   PhoneService      5034 non-null   object 
 6   MultipleLines     5034 non-null   object 
 7   InternetService   5034 non-null   object 
 8   OnlineSecurity    5034 non-null   object 
 9   OnlineBackup      5034 non-null   object 
 10  DeviceProtection  5034 non-null   object 
 11  TechSupport       5034 non-null   object 
 12  StreamingTV       5034 non-null   object 
 13  StreamingMovies   5034 non-null   object 
 14  Contract          5034 non-null   object 
 15  PaperlessBilling  5034 non-null   object 
 16  PaymentMethod     5034 non-null   object 


In [5]:
Data_All.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.950001,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.849998,108.150002,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.299999,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.699997,151.649994,Yes


In [6]:
# check unique values of each column
for column in Data_All.columns:
    print('Column: {} - Unique Values: {}'.format(column, Data_All[column].unique()))

Column: gender - Unique Values: ['Female' 'Male']
Column: SeniorCitizen - Unique Values: [0 1]
Column: Partner - Unique Values: ['Yes' 'No']
Column: Dependents - Unique Values: ['No' 'Yes']
Column: tenure - Unique Values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
Column: PhoneService - Unique Values: ['No' 'Yes']
Column: MultipleLines - Unique Values: ['No' 'Yes' 'No phone service']
Column: InternetService - Unique Values: ['DSL' 'Fiber optic' 'No']
Column: OnlineSecurity - Unique Values: ['No' 'Yes' 'No internet service']
Column: OnlineBackup - Unique Values: ['Yes' 'No' 'No internet service']
Column: DeviceProtection - Unique Values: ['No' 'Yes' 'No internet service']
Column: TechSupport - Unique Values: ['No' 'Yes' 'No internet service']
Column: StreamingTV - Unique Values: ['No' 'Yes' 'No internet service']

In [7]:
Data_All['SeniorCitizen'] = Data_All['SeniorCitizen'].astype('object')

In [8]:
Data_All.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5034 entries, 0 to 5033
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5034 non-null   object 
 1   SeniorCitizen     5034 non-null   object 
 2   Partner           5034 non-null   object 
 3   Dependents        5034 non-null   object 
 4   tenure            5034 non-null   int64  
 5   PhoneService      5034 non-null   object 
 6   MultipleLines     5034 non-null   object 
 7   InternetService   5034 non-null   object 
 8   OnlineSecurity    5034 non-null   object 
 9   OnlineBackup      5034 non-null   object 
 10  DeviceProtection  5034 non-null   object 
 11  TechSupport       5034 non-null   object 
 12  StreamingTV       5034 non-null   object 
 13  StreamingMovies   5034 non-null   object 
 14  Contract          5034 non-null   object 
 15  PaperlessBilling  5034 non-null   object 
 16  PaymentMethod     5034 non-null   object 


In [9]:
Data_All.to_csv("Dataset\Final_Train.csv", index=False)

## Dataset Splitting

In [10]:
X = Data_All.drop(columns=['Churn'])
y = Data_All['Churn']

In [11]:
(X.shape, y.shape)

((5034, 19), (5034,))

## Label Encoding

In [12]:
# Encode the target variable (Churn) to have 0 or 1 instead of No or Yes

labelEncoder = LabelEncoder()

y = labelEncoder.fit_transform(y)

## Features encoding

In [13]:
# Identify numeric and non-numeric columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

from sklearn.impute import SimpleImputer

# Creating imputer variables
numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Apply imputers to the data
X_train_cat = categorical_imputer.fit_transform(X[cat_cols])
X_train_num = numerical_imputer.fit_transform(X[num_cols])

In [14]:
encoder=OneHotEncoder(handle_unknown='ignore')

# encoding the xtrain categories and converting to a dataframe
X_train_cat_encoded = encoder.fit(X_train_cat)
X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat).toarray(),
                                   columns=encoder.get_feature_names_out(cat_cols))

## Features Scaling

In [15]:
scaler= StandardScaler()

X_train_num_scaled = scaler.fit_transform(X_train_num)
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = num_cols)

X = pd.concat([X_train_num_scaled,X_train_cat_encoded], axis =1)

In [16]:
X.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       '

## Train set Balancing (SMOTE Algorithm)

SMOTE (Synthetic Minority Over-sampling Technique) is a method used to address class imbalance in a binary classification problem. 

Earlier we realised that our target vaiable has a class imbalance. One class (the minority class) has significantly fewer instances than the other class (the majority class). This imbalance can negatively impact the performance of machine learning models, as they might become biased toward the majority class.

SMOTE will aim to balance the class distribution by generating synthetic samples until the minority class has the same number of instances as the majority class. By creating synthetic samples, SMOTE helps the model better capture the patterns in the minority class and prevents it from favoring the majority class due to the imbalance. 

In [17]:
# apply SMOTE to the training data (oversampling)

smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy='auto')

X_resampled, y_resampled = smote.fit_resample(X, y)

#### Train-test split

In [18]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [19]:
# check shape after resampling

pd.DataFrame(X_train).shape, pd.DataFrame(y_train).shape

((5916, 46), (5916, 1))

In [20]:
# view class distribution

pd.value_counts(pd.Series(y_train))

0    2958
1    2958
Name: count, dtype: int64

Our train dataset is now balaced

## Machine Learning Modeling 
Here is the section to **build**, **train**, **evaluate** and **compare** the models to each others.

### Model 1. Logistic Regression Model

#### Create the Model

In [21]:
LR = LogisticRegression(random_state=42)

#### Train the Model

In [22]:
LR.fit(X_train, y_train)

### Model 2. K-nearest Neighbors

#### Create the Model

In [23]:
knn = KNeighborsClassifier()

#### Train the Model

In [24]:
knn.fit(X_train, y_train)

### Model 3. Random Forest Classifier

#### Create the Model

In [25]:
rfm = RandomForestClassifier(random_state=42)

#### Train the Model

In [26]:
rfm.fit(X_train, y_train)

### Model 4. Support Vector Machines

#### Create the Model

In [27]:
svm = SVC(random_state=42)

#### Train the Model

In [28]:
svm.fit(X_train, y_train)

### Model 5. Gradient Boosting

#### Create the Model

In [29]:
gb = GradientBoostingClassifier(random_state=42)

#### Train the Model

In [30]:
gb.fit(X_train, y_train)

### Model 6. XGBoost

#### Create the Model

In [31]:
xgb = XGBClassifier(random_state=42)

#### Train the Model

In [32]:
xgb.fit(X_train, y_train)

## Model Evaluation
We create a pandas dataframe that will allow us to compare our models.

#### K-Fold Cross-Validation

k-fold cross-validation  estimates the performance on our models across multiple subsets of the data (k-folds), providing a comprehensive evaluation of their generalization ability. The model is trained and evaluated k times, with each fold serving as the validation set once. This process helps estimate the model's performance across different subsets of the data.

It helps estimate how well a model will perform on new, unseen data and provides insights into its stability and consistency.

In [33]:
# Create a dataframe with the K-fold Cross-Validation results

models = [
    ('Logistic Regression', LR),
    ('Random Forest', rfm),
    ('SVM', svm),
    ('Gradient Boosting', gb),
    ('XGBoost', xgb)
]

# number of k-folds
k = 5

results = []

for name, model in models:
    kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Create a KFold object
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')

    # Append results to the list
    results.append((name, scores.mean(), scores.std()))

results_df = pd.DataFrame(results, columns=['Model', 'Mean Accuracy', 'Std Deviation'])

results_df.sort_values(by='Mean Accuracy', ascending=False)

Unnamed: 0,Model,Mean Accuracy,Std Deviation
1,Random Forest,0.853449,0.013452
4,XGBoost,0.848715,0.006835
3,Gradient Boosting,0.847193,0.009548
2,SVM,0.812879,0.009753
0,Logistic Regression,0.782282,0.014983


The output of our k-fold cross-validation is the mean accuracy and std deviation.

1. **Average Accuracy** is the mean across all k folds during the cross-validation process. Higher mean accuracy values indicate better predictive performance.
2. **Standard Deviation** measures the variability or spread of accuracy values across the k folds. A lower standard deviation suggests that the model's performance is consistent across different subsets of the data (folds), while a higher standard deviation indicates that the model's performance varies more widely. Smaller standard deviations are generally desirable because they indicate a more stable model.

The **Random Forest** model has the highest mean accuracy (0.8507) **85%** among the evaluated models. This means that, on average, the model correctly predicted the target variable for about 85% of the data points in each fold. It performs well on average across different folds, and it has a relatively low standard deviation (0.0087), indicating consistent performance.

#### Classification Report

In [34]:
model_names = ['Logistic Regression', 'k-NN', 'Random Forest', 'SVM', 'Gradient Boosting', 'XGBoost']
models = [LR, knn, rfm, svm, gb, xgb] # our trained models
model_names_list = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Loop through each model to calculate metrics and store information
for name, model in zip(model_names, models):
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store model name and metrics
    model_names_list.append(name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Create a DataFrame with the calculated metrics
metrics_df = pd.DataFrame({
    'Model': model_names_list,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores
})

# Display the DataFrame
metrics_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
2,Random Forest,0.868919,0.862069,0.878378,0.870147
4,Gradient Boosting,0.852027,0.836129,0.875676,0.855446
5,XGBoost,0.849324,0.854595,0.841892,0.848196
3,SVM,0.816892,0.798726,0.847297,0.822295
1,k-NN,0.803378,0.741139,0.932432,0.825853
0,Logistic Regression,0.791216,0.769712,0.831081,0.79922


- The random forest model is our highest performing model with an accuracy of 0.869595 / **87%**.

**Accuracy**: Accuracy is a measure of the overall correctness of predictions made by the model. It indicates the proportion of correctly classified instances out of the total number of instances.

**Precision**: Precision is a metric that measures the proportion of true positive predictions (*correctly predicted positive instances*) out of all instances predicted as positive. It assesses the model's ability to avoid false positives.

**Recall**: Recall, also known as *sensitivity* or true positive rate, measures the proportion of true positive predictions out of all actual positive instances. It assesses the model's ability to capture all positive instances.

**F1-Score**: The F1-Score is the harmonic mean of precision and recall. It provides a balanced measure that considers both false positives and false negatives. It is particularly useful when dealing with imbalanced datasets.

- Our top 3 models (Random Forest, XGBoost and Gradient Boosting) are all tree based models, specifically ensemble learning techniques that combine multiple individual trees to improve overall performance and robustness. They reduce overfitting by averaging or boosting the individual trees' predictions. They offer a combination of powerful features that make them robust, accurate, and versatile for classification tasks across a wide range of domains and data characteristics.

## Hyperparameters tuning 

We will Fine-tune our top model using a ` RandomizedSearchCV`  (that is in sklearn.model_selection
) to find the best hyperparameters and achieve the maximum performance of the model

#### 1. Tuning Model 1 (Random Forest)

In [35]:
# Check current model parameters

current_params = rfm.get_params()
current_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Random Forest tuning

# Define the parameter distributions for hyperparameter tuning
param_grid = {
  'n_estimators': [20, 50, 100, 200, 300],
  'max_depth': [None, 10, 15, 20, 25],
  'min_samples_split': [2, 4, 6],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'class_weight': ['balanced', None],
  'max_features': ['auto', 'sqrt', 'log2'],
  'criterion': ['gini', 'entropy']
}


# Create a RandomForestClassifier model with a specific random state
# rf_model = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV with the RandomForestClassifier model and parameter distributions
random_search_rf = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, 
                                      scoring='accuracy', n_iter=150, random_state=42,
                                      cv=5, n_jobs=-1, verbose = 2)

# fit best estimator on train data
random_search_rf.fit(X_train, y_train)

# best parameters
best_params = random_search_rf.best_params_

# mean accuracy score of the best estimator
best_score = random_search_rf.best_score_

best_params

Fitting 5 folds for each of 150 candidates, totalling 750 fits


{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 20,
 'criterion': 'entropy',
 'class_weight': 'balanced'}

In [37]:
# mean accuracy score of the best estimator

best_score

0.8564908843754997

In [38]:
# Fit tuned model on train data

tuned_rf_model = random_search_rf.best_estimator_
tuned_rf_model.fit(X_train, y_train)

# original model
rfm.fit(X_train, y_train)

# make the predictions
random_search_rf_pred = tuned_rf_model.predict(X_test)
original_rf_model = rfm.predict(X_test)

Let's compare the tuned model and the original model performance

In [39]:
# Calculate the classification report

report = classification_report(y_test, original_rf_model, output_dict=True)
report_2 = classification_report(y_test, random_search_rf_pred, output_dict=True)

# Extract precision, recall, f1-score, and accuracy metrics for both classes

precision = ((report['1']['precision'] + report['0']['precision'])) / 2
recall = ((report['1']['recall'] + report['0']['recall'])) / 2
f1 = ((report['1']['f1-score'] + report['0']['f1-score'])) / 2

precision2 = ((report_2['1']['precision'] + report_2['0']['precision'])) / 2
recall2 = ((report_2['1']['recall'] + report_2['0']['recall'])) / 2
f12 = ((report_2['1']['f1-score'] + report_2['0']['f1-score'])) / 2

# Create dictionaries for metrics
metrics_original_Random_Forest = {
    'Total Precision': precision,
    'Total Recall': recall,
    'Total F1-Score': f1,
    'Accuracy': report['accuracy']
}

metrics_Tuned_Random_Forest = {
    'Total Precision': precision2,
    'Total Recall': recall2,
    'Total F1-Score': f12,
    'Accuracy': report_2['accuracy']
}

# Create DataFrames from the metrics dictionaries
metrics_df_original = pd.DataFrame(metrics_original_Random_Forest, index=['Original Random Forest'])
metrics_df_tuned = pd.DataFrame(metrics_Tuned_Random_Forest, index=['Tuned Random Forest'])

# Concatenate the DataFrames vertically to combine the metrics
combined_metrics_df = pd.concat([metrics_df_original, metrics_df_tuned])

combined_metrics_df

Unnamed: 0,Total Precision,Total Recall,Total F1-Score,Accuracy
Original Random Forest,0.869051,0.868919,0.868907,0.868919
Tuned Random Forest,0.86654,0.866216,0.866187,0.866216


- Our Model has barely improved, still at **87%** accuracy. It seems that we had the best parameters before even tuning our model.

In [40]:
# construct the confusion matrix for tuned model
confusion_matrix_rf = confusion_matrix(y_test, random_search_rf_pred)

confusion_matrix_rf

array([[630, 110],
       [ 88, 652]], dtype=int64)

- 630 instances were correctly classified as True Negatives (TN).
- 652 instances were correctly classified as True Positives (TP).
- 110 instances were classified as False Positives (FP).
- 88 instances were classified as False Negatives (FN).

## Export key components
Here is the section to **export** the important ML objects that will be use to develop an app: *Encoder, Scaler, ColumnTransformer, Model, Pipeline, etc*.

In [43]:
import joblib

destination = "toolkit"

# Create a directory if it doesn't exist
if not os.path.exists(destination):
    os.makedirs(destination)

# Create a dictionary to store the objects and their filenames
models = {
    "numerical_imputer": numerical_imputer,
    "categorical_imputer": categorical_imputer,
    "encoder": encoder,
    "scaler": scaler,
    "Final_model": rfm
}

# Loop through the models and save them using joblib.dump()
for name, model in models.items():
    file_path = os.path.join(destination, f"{name}.joblib")
    joblib.dump(model, file_path)


In [44]:
# Exporting the requirements
requirements = "\n".join(f"{m.__name__}=={m.__version__}" for m in globals().values() if getattr(m, "__version__", None))

with open("requirements.txt", "w") as f:
    f.write(requirements)