In [794]:
# Import packages
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Data Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,FunctionTransformer,PowerTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [795]:
df = pd.read_csv('../data/processed/fe_data.csv')
df.drop(columns=['Unnamed: 0'],inplace=True)

In [796]:
df.head()

Unnamed: 0,gender,age,married,number_of_dependents,city,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,...,paperless_billing,payment_method,monthly_charge,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status,churn_category,churn_reason
0,Female,37,Yes,0,Frazier Park,2,9,no_offer,Yes,42.39,...,Yes,Credit Card,65.6,0.0,0,381.51,974.81,Stayed,,
1,Male,46,No,0,Glendale,0,9,no_offer,Yes,10.69,...,No,Credit Card,-4.0,38.33,10,96.21,610.28,Stayed,,
2,Male,50,No,0,Costa Mesa,0,4,Offer E,Yes,33.65,...,Yes,Bank Withdrawal,73.9,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,Male,78,Yes,0,Martinez,1,13,Offer D,Yes,27.82,...,Yes,Bank Withdrawal,98.0,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,Female,75,Yes,0,Camarillo,3,3,no_offer,Yes,7.38,...,Yes,Credit Card,83.9,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [797]:
df = df[df['customer_status'] == 'Churned']

In [798]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1869 entries, 2 to 7039
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             1869 non-null   object 
 1   age                                1869 non-null   int64  
 2   married                            1869 non-null   object 
 3   number_of_dependents               1869 non-null   int64  
 4   city                               1869 non-null   object 
 5   number_of_referrals                1869 non-null   int64  
 6   tenure_in_months                   1869 non-null   int64  
 7   offer                              1869 non-null   object 
 8   phone_service                      1869 non-null   object 
 9   avg_monthly_long_distance_charges  1869 non-null   float64
 10  multiple_lines                     1699 non-null   object 
 11  internet_service                   1869 non-null   object 
 1

In [799]:
df.isnull().sum()

gender                                 0
age                                    0
married                                0
number_of_dependents                   0
city                                   0
number_of_referrals                    0
tenure_in_months                       0
offer                                  0
phone_service                          0
avg_monthly_long_distance_charges      0
multiple_lines                       170
internet_service                       0
internet_type                          0
avg_monthly_gb_download                0
online_security                        0
online_backup                          0
device_protection_plan                 0
premium_tech_support                   0
streaming_tv                           0
streaming_movies                       0
streaming_music                        0
unlimited_data                         0
contract                               0
paperless_billing                      0
payment_method  

In [800]:
numeric_data = df.select_dtypes(include=['number'])
skweed = []
for i in numeric_data:
    print(f"{i} skewness: {df[i].skew()}")
    if(df[i].skew() >0.5 or df[i].skew()<-0.5):
        skweed.append(i)


age skewness: -0.0009241915165487352
number_of_dependents skewness: 5.741516955529797
number_of_referrals skewness: 3.806818270717484
tenure_in_months skewness: 1.1492802882380029
avg_monthly_long_distance_charges skewness: 0.054646043836724056
avg_monthly_gb_download skewness: 1.341455741523856
monthly_charge skewness: -0.8851887950828556
total_refunds skewness: 5.0621730459266985
total_extra_data_charges skewness: 4.015198806174346
total_long_distance_charges skewness: 2.1258898213734767
total_revenue skewness: 1.5454065510949506


In [801]:
skweed

['number_of_dependents',
 'number_of_referrals',
 'tenure_in_months',
 'avg_monthly_gb_download',
 'monthly_charge',
 'total_refunds',
 'total_extra_data_charges',
 'total_long_distance_charges',
 'total_revenue']

In [802]:
x = df.drop(columns=['churn_category','customer_status','churn_reason'])
y = df['churn_category']

In [803]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [804]:
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = list(set(X_train.columns.tolist()) - set(categorical_features))

In [805]:
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [806]:
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [807]:
# outlier data
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [808]:
outlier_detection = numeric_data
for k, v in outlier_detection.items():
        outliers = find_outliers_IQR(v)
        perc = len(find_outliers_IQR(v)) * 100.0 / np.shape(outlier_detection)[0]
        print(f'Column :{k}')
        print(f"Number of Outliers = {len(outliers)}","||",f"Max Outlier Value = {outliers.max()}",
              "||", f"Min Outlier Value = {outliers.min()}", "||", f"Percentage of Outliers = {perc:.2f}%")
        print("\n")

Column :age
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :number_of_dependents
Number of Outliers = 106 || Max Outlier Value = 7 || Min Outlier Value = 1 || Percentage of Outliers = 5.67%


Column :number_of_referrals
Number of Outliers = 91 || Max Outlier Value = 9 || Min Outlier Value = 3 || Percentage of Outliers = 4.87%


Column :tenure_in_months
Number of Outliers = 23 || Max Outlier Value = 72 || Min Outlier Value = 70 || Percentage of Outliers = 1.23%


Column :avg_monthly_long_distance_charges
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_gb_download
Number of Outliers = 147 || Max Outlier Value = 85.0 || Min Outlier Value = 56.0 || Percentage of Outliers = 7.87%


Column :monthly_charge
Number of Outliers = 20 || Max Outlier Value = -4.0 || Min Outlier Value = -10.0 || Percentage of Outliers = 1.07%


Column :total_re

In [809]:
# def handle_outliers(X_train, X_test, columns):
#     for col in columns:
#         Q1 = np.percentile(X_train[col], 25)
#         Q3 = np.percentile(X_train[col], 75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR

#         # Handling outliers in X_train
#         X_train[col] = np.where(X_train[col] < lower_bound, lower_bound, X_train[col])
#         X_train[col] = np.where(X_train[col] > upper_bound, upper_bound, X_train[col])

#         # Handling outliers in X_test based on X_train IQR
#         X_test[col] = np.where(X_test[col] < lower_bound, lower_bound, X_test[col])
#         X_test[col] = np.where(X_test[col] > upper_bound, upper_bound, X_test[col])

#     return X_train, X_test


# # Define the columns to apply outlier handling
# columns_to_handle = ['avg_monthly_gb_download', 'total_revenue', 'number_of_dependents',
#                      'total_refunds', 'total_extra_data_charges','number_of_referrals']

# # Apply the function to the specified columns
# X_train, X_test = handle_outliers(X_train, X_test, columns_to_handle)

In [810]:
# pt = PowerTransformer(method='yeo-johnson')
# X_train[skweed] = pt.fit_transform(X_train[skweed])
# X_test[skweed] = pt.transform(X_test[skweed])
# X_train = pd.DataFrame(X_train, columns=x.columns)
# X_test = pd.DataFrame(X_test, columns=x.columns)

In [811]:
numeric_data = df.select_dtypes(include=['number'])
for i in numeric_data:
    print(f"{i} skewness: {X_train[i].skew()}")


age skewness: 0.014936115662294153
number_of_dependents skewness: 5.594590872824135
number_of_referrals skewness: 3.8077471794907036
tenure_in_months skewness: 1.1502712380069662
avg_monthly_long_distance_charges skewness: 0.061363623216479375
avg_monthly_gb_download skewness: 1.3378788493834264
monthly_charge skewness: -0.8330740944951658
total_refunds skewness: 5.147739497877579
total_extra_data_charges skewness: 4.256261107601844
total_long_distance_charges skewness: 2.1156504779096617
total_revenue skewness: 1.5526651197703631


In [812]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd

# 1) all categorical cols from train set
all_cat_cols = categorical_features

# 2) keep these as ordinal (if present)
ordinal_cols = [c for c in ["contract", "offer"] if c in all_cat_cols]

# 3) all remaining categorical -> onehot
onehot_cols = [c for c in all_cat_cols if c not in ordinal_cols]

print("All categorical:", all_cat_cols)
print("Ordinal:", ordinal_cols)
print("OneHot:", onehot_cols)

# 4) normalize text
for c in all_cat_cols:
    X_train[c] = X_train[c].astype(str).str.strip().str.lower()
    X_test[c] = X_test[c].astype(str).str.strip().str.lower()



All categorical: ['gender', 'married', 'city', 'offer', 'phone_service', 'multiple_lines', 'internet_service', 'internet_type', 'online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data', 'contract', 'paperless_billing', 'payment_method']
Ordinal: ['contract', 'offer']
OneHot: ['gender', 'married', 'city', 'phone_service', 'multiple_lines', 'internet_service', 'internet_type', 'online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data', 'paperless_billing', 'payment_method']


In [813]:
ord_categories = []
for c in ordinal_cols:
    if c == "contract":
        ord_categories.append(["month-to-month", "one year", "two year"])
    elif c == "offer":
        ord_categories.append(["no offer", "offer a", "offer b", "offer c", "offer d", "offer e"])
    else:
        ord_categories.append(sorted(X_train[c].dropna().unique().tolist()))

oe = OrdinalEncoder(
    categories=ord_categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

if ordinal_cols:
    X_train[ordinal_cols] = oe.fit_transform(X_train[ordinal_cols])
    X_test[ordinal_cols] = oe.transform(X_test[ordinal_cols])

In [814]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

if onehot_cols:
    X_train_ohe = ohe.fit_transform(X_train[onehot_cols])
    X_test_ohe = ohe.transform(X_test[onehot_cols])

    ohe_cols = ohe.get_feature_names_out(onehot_cols)

    X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe_cols, index=X_train.index)
    X_test_ohe_df = pd.DataFrame(X_test_ohe, columns=ohe_cols, index=X_test.index)

    X_train = pd.concat([X_train.drop(columns=onehot_cols), X_train_ohe_df], axis=1)
    X_test = pd.concat([X_test.drop(columns=onehot_cols), X_test_ohe_df], axis=1)

In [815]:
# sc = StandardScaler()
# X_train[numerical_features] = sc.fit_transform(X_train[numerical_features])
# X_test[numerical_features] = sc.transform(X_test[numerical_features])

In [816]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [817]:
# sm = SMOTE(random_state=42)
# X_train, y_train = sm.fit_resample(X_train,y_train)

In [818]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score

In [819]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample"   
)


param_dist = {
    "n_estimators": randint(200, 700),
    "max_depth": [None, 8, 12, 16, 20],
    "min_samples_split": randint(2, 12),
    "min_samples_leaf": randint(1, 6),
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True]
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [820]:
y_pred = search.predict(X_test)

In [821]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ...existing code...

Accuracy: 0.35561497326203206
              precision    recall  f1-score   support

           0       0.29      0.08      0.12        63
           1       0.52      0.57      0.54       168
           2       0.19      0.27      0.22        64
           3       0.15      0.16      0.16        37
           4       0.21      0.21      0.21        42

    accuracy                           0.36       374
   macro avg       0.27      0.26      0.25       374
weighted avg       0.35      0.36      0.34       374



In [822]:
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred, average="weighted"))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Macro F1: 0.25206651065259444
Weighted F1: 0.3421540627294776

Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.08      0.12        63
           1       0.52      0.57      0.54       168
           2       0.19      0.27      0.22        64
           3       0.15      0.16      0.16        37
           4       0.21      0.21      0.21        42

    accuracy                           0.36       374
   macro avg       0.27      0.26      0.25       374
weighted avg       0.35      0.36      0.34       374


Confusion Matrix:
 [[ 5 25 15  4 14]
 [ 8 96 37 16 11]
 [ 1 37 17  4  5]
 [ 1 16 11  6  3]
 [ 2 12 10  9  9]]
