In [194]:
# Import packages
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Data Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,FunctionTransformer,PowerTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [195]:
df = pd.read_csv("../data/processed/fe_data.csv")

In [196]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,age,married,number_of_dependents,city,number_of_referrals,tenure_in_months,offer,phone_service,...,paperless_billing,payment_method,monthly_charge,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status,churn_category,churn_reason
0,0,Female,37,Yes,0,Frazier Park,2,9,no_offer,Yes,...,Yes,Credit Card,65.6,0.0,0,381.51,974.81,Stayed,,
1,1,Male,46,No,0,Glendale,0,9,no_offer,Yes,...,No,Credit Card,-4.0,38.33,10,96.21,610.28,Stayed,,
2,2,Male,50,No,0,Costa Mesa,0,4,Offer E,Yes,...,Yes,Bank Withdrawal,73.9,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,3,Male,78,Yes,0,Martinez,1,13,Offer D,Yes,...,Yes,Bank Withdrawal,98.0,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,4,Female,75,Yes,0,Camarillo,3,3,no_offer,Yes,...,Yes,Credit Card,83.9,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [197]:
df = df[df['customer_status'] == 'Churned']

In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1869 entries, 2 to 7039
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         1869 non-null   int64  
 1   gender                             1869 non-null   object 
 2   age                                1869 non-null   int64  
 3   married                            1869 non-null   object 
 4   number_of_dependents               1869 non-null   int64  
 5   city                               1869 non-null   object 
 6   number_of_referrals                1869 non-null   int64  
 7   tenure_in_months                   1869 non-null   int64  
 8   offer                              1869 non-null   object 
 9   phone_service                      1869 non-null   object 
 10  avg_monthly_long_distance_charges  1869 non-null   float64
 11  multiple_lines                     1699 non-null   object 
 1

In [199]:
df.isnull().sum()

Unnamed: 0                             0
gender                                 0
age                                    0
married                                0
number_of_dependents                   0
city                                   0
number_of_referrals                    0
tenure_in_months                       0
offer                                  0
phone_service                          0
avg_monthly_long_distance_charges      0
multiple_lines                       170
internet_service                       0
internet_type                          0
avg_monthly_gb_download                0
online_security                        0
online_backup                          0
device_protection_plan                 0
premium_tech_support                   0
streaming_tv                           0
streaming_movies                       0
streaming_music                        0
unlimited_data                         0
contract                               0
paperless_billin

In [200]:
numeric_data = df.select_dtypes(include=['number'])
skweed = []
for i in numeric_data:
    print(f"{i} skewness: {df[i].skew()}")
    if(df[i].skew() >0.5 or df[i].skew()<-0.5):
        skweed.append(i)


Unnamed: 0 skewness: 0.03452934289842404
age skewness: -0.0009241915165487352
number_of_dependents skewness: 5.741516955529797
number_of_referrals skewness: 3.806818270717484
tenure_in_months skewness: 1.1492802882380029
avg_monthly_long_distance_charges skewness: 0.054646043836724056
avg_monthly_gb_download skewness: 1.341455741523856
monthly_charge skewness: -0.8851887950828556
total_refunds skewness: 5.0621730459266985
total_extra_data_charges skewness: 4.015198806174346
total_long_distance_charges skewness: 2.1258898213734767
total_revenue skewness: 1.5454065510949506


In [201]:
skweed

['number_of_dependents',
 'number_of_referrals',
 'tenure_in_months',
 'avg_monthly_gb_download',
 'monthly_charge',
 'total_refunds',
 'total_extra_data_charges',
 'total_long_distance_charges',
 'total_revenue']

In [202]:
x = df.drop(columns=['churn_category','customer_status','churn_reason'])
y = df['churn_category']

In [203]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [204]:
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = list(set(X_train.columns.tolist()) - set(categorical_features))

In [205]:
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [206]:
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [207]:
# outlier data
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [208]:
outlier_detection = numeric_data
for k, v in outlier_detection.items():
        outliers = find_outliers_IQR(v)
        perc = len(find_outliers_IQR(v)) * 100.0 / np.shape(outlier_detection)[0]
        print(f'Column :{k}')
        print(f"Number of Outliers = {len(outliers)}","||",f"Max Outlier Value = {outliers.max()}",
              "||", f"Min Outlier Value = {outliers.min()}", "||", f"Percentage of Outliers = {perc:.2f}%")
        print("\n")

Column :Unnamed: 0


Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :age
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :number_of_dependents
Number of Outliers = 106 || Max Outlier Value = 7 || Min Outlier Value = 1 || Percentage of Outliers = 5.67%


Column :number_of_referrals
Number of Outliers = 91 || Max Outlier Value = 9 || Min Outlier Value = 3 || Percentage of Outliers = 4.87%


Column :tenure_in_months
Number of Outliers = 23 || Max Outlier Value = 72 || Min Outlier Value = 70 || Percentage of Outliers = 1.23%


Column :avg_monthly_long_distance_charges
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_gb_download
Number of Outliers = 147 || Max Outlier Value = 85.0 || Min Outlier Value = 56.0 || Percentage of Outliers = 7.87%


Column :monthly_charge
Number of Outliers 

In [209]:
def handle_outliers(X_train, X_test, columns):
    for col in columns:
        Q1 = np.percentile(X_train[col], 25)
        Q3 = np.percentile(X_train[col], 75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Handling outliers in X_train
        X_train[col] = np.where(X_train[col] < lower_bound, lower_bound, X_train[col])
        X_train[col] = np.where(X_train[col] > upper_bound, upper_bound, X_train[col])

        # Handling outliers in X_test based on X_train IQR
        X_test[col] = np.where(X_test[col] < lower_bound, lower_bound, X_test[col])
        X_test[col] = np.where(X_test[col] > upper_bound, upper_bound, X_test[col])

    return X_train, X_test


# Define the columns to apply outlier handling
columns_to_handle = ['avg_monthly_gb_download', 'total_revenue', 'number_of_dependents',
                     'total_refunds', 'total_extra_data_charges','number_of_referrals']

# Apply the function to the specified columns
X_train, X_test = handle_outliers(X_train, X_test, columns_to_handle)

In [210]:
pt = PowerTransformer(method='yeo-johnson')
X_train[skweed] = pt.fit_transform(X_train[skweed])
X_test[skweed] = pt.transform(X_test[skweed])
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [211]:
numeric_data = df.select_dtypes(include=['number'])
for i in numeric_data:
    print(f"{i} skewness: {X_train[i].skew()}")


Unnamed: 0 skewness: 0.013825724554029303
age skewness: -0.009846519008349208
number_of_dependents skewness: 0.0
number_of_referrals skewness: 0.7225720246443321
tenure_in_months skewness: 0.002693585938929521
avg_monthly_long_distance_charges skewness: 0.08526707349664271
avg_monthly_gb_download skewness: -0.09745135995201601
monthly_charge skewness: -0.5124969126797962
total_refunds skewness: 0.0
total_extra_data_charges skewness: 0.0
total_long_distance_charges skewness: -0.07396066834979624
total_revenue skewness: -0.06380319160036356


In [None]:
oe = OrdinalEncoder()
X_train[categorical_features] = oe.fit_transform(X_train[categorical_features])
X_test[categorical_features] = oe.transform(X_test[categorical_features])


In [None]:
ohe = onehot

In [214]:
sc = StandardScaler()
X_train[numerical_features] = sc.fit_transform(X_train[numerical_features])
X_test[numerical_features] = sc.transform(X_test[numerical_features])

In [215]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [216]:
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train,y_train)

ValueError: could not convert string to float: 'Female'

In [None]:
rf = RandomForestClassifier()

rf.fit(X_train,y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ...existing code...

Accuracy: 0.43315508021390375
              precision    recall  f1-score   support

           0       0.28      0.18      0.22        56
           1       0.50      0.81      0.62       175
           2       0.15      0.06      0.09        62
           3       0.25      0.10      0.14        40
           4       0.15      0.05      0.07        41

    accuracy                           0.43       374
   macro avg       0.27      0.24      0.23       374
weighted avg       0.35      0.43      0.36       374



In [None]:
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred, average="weighted"))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Macro F1: 0.2291308726082783
Weighted F1: 0.36163370249311627

Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.18      0.22        56
           1       0.50      0.81      0.62       175
           2       0.15      0.06      0.09        62
           3       0.25      0.10      0.14        40
           4       0.15      0.05      0.07        41

    accuracy                           0.43       374
   macro avg       0.27      0.24      0.23       374
weighted avg       0.35      0.43      0.36       374


Confusion Matrix:
 [[ 10  39   3   1   3]
 [ 12 142   8   7   6]
 [  7  50   4   1   0]
 [  2  28   4   4   2]
 [  5  23   8   3   2]]
