In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('customer_data.csv')
df

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction,churn
0,30,Female,39,14,5,18,Standard,Annual,932.0,17,1
1,65,Female,49,1,10,8,Basic,Monthly,557.0,6,1
2,55,Female,14,4,6,18,Basic,Quarterly,185.0,3,1
3,58,Male,38,21,7,7,Standard,Monthly,396.0,29,1
4,23,Male,32,20,5,8,Basic,Monthly,617.0,20,1
...,...,...,...,...,...,...,...,...,...,...,...
505201,45,Female,33,12,6,21,Basic,Quarterly,947.0,14,1
505202,37,Male,6,1,5,22,Standard,Annual,923.0,9,1
505203,25,Male,39,14,8,30,Premium,Monthly,327.0,20,1
505204,50,Female,18,19,7,22,Standard,Monthly,540.0,13,1


In [3]:
# train-test spilt 
y = df['churn']
X = df.drop(columns='churn')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)
# Reset the index of the resulting DataFrames

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [4]:
# check 
X_train

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction
0,19,Female,48,7,3,30,Premium,Annual,787.00,29
1,65,Female,11,20,9,14,Standard,Monthly,562.00,13
2,38,Male,8,20,1,4,Basic,Quarterly,961.86,8
3,38,Female,59,25,10,4,Premium,Annual,706.00,14
4,46,Male,38,24,10,16,Standard,Annual,260.00,25
...,...,...,...,...,...,...,...,...,...,...
404159,24,Female,1,4,2,18,Standard,Quarterly,740.72,15
404160,62,Male,29,23,9,24,Standard,Quarterly,327.10,11
404161,34,Male,13,16,1,6,Premium,Quarterly,520.36,23
404162,35,Male,23,18,4,10,Standard,Monthly,420.00,3


In [5]:
X_test

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction
0,58,Male,2,20,6,24,Standard,Quarterly,664.00,20
1,52,Male,14,13,2,13,Standard,Monthly,650.00,9
2,46,Male,38,3,0,9,Basic,Annual,571.47,24
3,29,Female,59,27,5,22,Basic,Quarterly,502.00,2
4,42,Male,29,7,1,20,Basic,Annual,541.34,10
...,...,...,...,...,...,...,...,...,...,...
101037,29,Male,33,9,1,1,Basic,Annual,801.41,29
101038,34,Male,43,28,9,11,Standard,Annual,435.00,30
101039,50,Female,40,13,0,1,Premium,Quarterly,651.28,9
101040,49,Female,46,6,0,20,Standard,Quarterly,897.68,2


In [6]:
# one hot encoding 
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X_train[['gender','subscription_type','contract_length']])

In [7]:
# get feature names after encoder 
feature_names = encoder.get_feature_names_out(['gender','subscription_type','contract_length'])
feature_names

array(['gender_Female', 'gender_Male', 'subscription_type_Basic',
       'subscription_type_Premium', 'subscription_type_Standard',
       'contract_length_Annual', 'contract_length_Monthly',
       'contract_length_Quarterly'], dtype=object)

In [8]:
train_categorical_encoder = encoder.transform(X_train[['gender','subscription_type','contract_length']])
train_encoder_df = pd.DataFrame(train_categorical_encoder, columns=feature_names)

test_categorical_encoder = encoder.transform(X_test[['gender','subscription_type','contract_length']])
test_encoder_df = pd.DataFrame(test_categorical_encoder, columns=feature_names)

In [9]:
test_encoder_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101042 entries, 0 to 101041
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   gender_Female               101042 non-null  float64
 1   gender_Male                 101042 non-null  float64
 2   subscription_type_Basic     101042 non-null  float64
 3   subscription_type_Premium   101042 non-null  float64
 4   subscription_type_Standard  101042 non-null  float64
 5   contract_length_Annual      101042 non-null  float64
 6   contract_length_Monthly     101042 non-null  float64
 7   contract_length_Quarterly   101042 non-null  float64
dtypes: float64(8)
memory usage: 6.2 MB


In [10]:
conv_col = ['gender_Female','gender_Male', 'subscription_type_Basic', 'subscription_type_Premium', 'subscription_type_Standard' ,'contract_length_Annual','contract_length_Monthly', 'contract_length_Quarterly' ]

for col in conv_col:
    train_encoder_df[col] = train_encoder_df[col].astype(int)

In [11]:
train_encoder_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404164 entries, 0 to 404163
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   gender_Female               404164 non-null  int32
 1   gender_Male                 404164 non-null  int32
 2   subscription_type_Basic     404164 non-null  int32
 3   subscription_type_Premium   404164 non-null  int32
 4   subscription_type_Standard  404164 non-null  int32
 5   contract_length_Annual      404164 non-null  int32
 6   contract_length_Monthly     404164 non-null  int32
 7   contract_length_Quarterly   404164 non-null  int32
dtypes: int32(8)
memory usage: 12.3 MB


In [12]:
train_encoder_df

Unnamed: 0,gender_Female,gender_Male,subscription_type_Basic,subscription_type_Premium,subscription_type_Standard,contract_length_Annual,contract_length_Monthly,contract_length_Quarterly
0,1,0,0,1,0,1,0,0
1,1,0,0,0,1,0,1,0
2,0,1,1,0,0,0,0,1
3,1,0,0,1,0,1,0,0
4,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...
404159,1,0,0,0,1,0,0,1
404160,0,1,0,0,1,0,0,1
404161,0,1,0,1,0,0,0,1
404162,0,1,0,0,1,0,1,0


In [13]:
# concat data after transform 
X_train = X_train.drop(columns=['gender', 'subscription_type', 'contract_length'])
X_test = X_test.drop(columns=['gender', 'subscription_type', 'contract_length'])

In [14]:
X_train

Unnamed: 0,age,tenure,usage_frequency,support_calls,payment_delay,total_spend,last_interaction
0,19,48,7,3,30,787.00,29
1,65,11,20,9,14,562.00,13
2,38,8,20,1,4,961.86,8
3,38,59,25,10,4,706.00,14
4,46,38,24,10,16,260.00,25
...,...,...,...,...,...,...,...
404159,24,1,4,2,18,740.72,15
404160,62,29,23,9,24,327.10,11
404161,34,13,16,1,6,520.36,23
404162,35,23,18,4,10,420.00,3


In [15]:
X_test

Unnamed: 0,age,tenure,usage_frequency,support_calls,payment_delay,total_spend,last_interaction
0,58,2,20,6,24,664.00,20
1,52,14,13,2,13,650.00,9
2,46,38,3,0,9,571.47,24
3,29,59,27,5,22,502.00,2
4,42,29,7,1,20,541.34,10
...,...,...,...,...,...,...,...
101037,29,33,9,1,1,801.41,29
101038,34,43,28,9,11,435.00,30
101039,50,40,13,0,1,651.28,9
101040,49,46,6,0,20,897.68,2


In [16]:
X_train = pd.concat([X_train, train_encoder_df], axis=1)
X_test = pd.concat([X_test,test_encoder_df], axis=1)

In [17]:
X_train

Unnamed: 0,age,tenure,usage_frequency,support_calls,payment_delay,total_spend,last_interaction,gender_Female,gender_Male,subscription_type_Basic,subscription_type_Premium,subscription_type_Standard,contract_length_Annual,contract_length_Monthly,contract_length_Quarterly
0,19,48,7,3,30,787.00,29,1,0,0,1,0,1,0,0
1,65,11,20,9,14,562.00,13,1,0,0,0,1,0,1,0
2,38,8,20,1,4,961.86,8,0,1,1,0,0,0,0,1
3,38,59,25,10,4,706.00,14,1,0,0,1,0,1,0,0
4,46,38,24,10,16,260.00,25,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404159,24,1,4,2,18,740.72,15,1,0,0,0,1,0,0,1
404160,62,29,23,9,24,327.10,11,0,1,0,0,1,0,0,1
404161,34,13,16,1,6,520.36,23,0,1,0,1,0,0,0,1
404162,35,23,18,4,10,420.00,3,0,1,0,0,1,0,1,0


In [18]:
X_test

Unnamed: 0,age,tenure,usage_frequency,support_calls,payment_delay,total_spend,last_interaction,gender_Female,gender_Male,subscription_type_Basic,subscription_type_Premium,subscription_type_Standard,contract_length_Annual,contract_length_Monthly,contract_length_Quarterly
0,58,2,20,6,24,664.00,20,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,52,14,13,2,13,650.00,9,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,46,38,3,0,9,571.47,24,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,29,59,27,5,22,502.00,2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,42,29,7,1,20,541.34,10,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101037,29,33,9,1,1,801.41,29,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
101038,34,43,28,9,11,435.00,30,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
101039,50,40,13,0,1,651.28,9,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
101040,49,46,6,0,20,897.68,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [19]:
#saving the encoder file 
import pickle
with open('encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    
    precision = precision_score(y_test, y_pred)
    print(f"Precision: {precision:.2f}")
    
    recall = recall_score(y_test, y_pred)
    print(f"Recall: {recall:.2f}")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)
    print()
    
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

## Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(random_state=42, max_depth=15,min_samples_split=10)
random_forest_classifier.fit(X_train, y_train)

In [51]:
# validating model 
y_pred = random_forest_classifier.predict(X_test)

In [52]:
evaluate_model(y_test, y_pred)

Accuracy: 0.93
Precision: 0.90
Recall: 0.99
Confusion Matrix:
[[38431  6328]
 [  508 55775]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     44759
           1       0.90      0.99      0.94     56283

    accuracy                           0.93    101042
   macro avg       0.94      0.92      0.93    101042
weighted avg       0.94      0.93      0.93    101042



## Xgbost

In [54]:
import xgboost as xgb
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

In [56]:
y_pred = xgb_classifier.predict(X_test)
evaluate_model(y_test, y_pred)

Accuracy: 0.93
Precision: 0.90
Recall: 0.99
Confusion Matrix:
[[38400  6359]
 [  389 55894]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     44759
           1       0.90      0.99      0.94     56283

    accuracy                           0.93    101042
   macro avg       0.94      0.93      0.93    101042
weighted avg       0.94      0.93      0.93    101042



In [58]:
# save model 
with open("customer_churn_random_forest_model.pkl", 'wb') as model_file:
    pickle.dump(random_forest_classifier, model_file)