In [2]:
# # Import packages
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Data Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,FunctionTransformer,PowerTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/processed/fe_data.csv')
df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
df.drop(columns=['churn_category','churn_reason'],inplace=True)

In [5]:
df.head(5)

Unnamed: 0,gender,age,married,number_of_dependents,city,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,...,unlimited_data,contract,paperless_billing,payment_method,monthly_charge,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status
0,Female,37,Yes,0,Frazier Park,2,9,no_offer,Yes,42.39,...,Yes,One Year,Yes,Credit Card,65.6,0.0,0,381.51,974.81,Stayed
1,Male,46,No,0,Glendale,0,9,no_offer,Yes,10.69,...,No,Month-to-Month,No,Credit Card,-4.0,38.33,10,96.21,610.28,Stayed
2,Male,50,No,0,Costa Mesa,0,4,Offer E,Yes,33.65,...,Yes,Month-to-Month,Yes,Bank Withdrawal,73.9,0.0,0,134.6,415.45,Churned
3,Male,78,Yes,0,Martinez,1,13,Offer D,Yes,27.82,...,Yes,Month-to-Month,Yes,Bank Withdrawal,98.0,0.0,0,361.66,1599.51,Churned
4,Female,75,Yes,0,Camarillo,3,3,no_offer,Yes,7.38,...,Yes,Month-to-Month,Yes,Credit Card,83.9,0.0,0,22.14,289.54,Churned


In [6]:
df.isnull().sum()

gender                                 0
age                                    0
married                                0
number_of_dependents                   0
city                                   0
number_of_referrals                    0
tenure_in_months                       0
offer                                  0
phone_service                          0
avg_monthly_long_distance_charges      0
multiple_lines                       682
internet_service                       0
internet_type                          0
avg_monthly_gb_download                0
online_security                        0
online_backup                          0
device_protection_plan                 0
premium_tech_support                   0
streaming_tv                           0
streaming_movies                       0
streaming_music                        0
unlimited_data                         0
contract                               0
paperless_billing                      0
payment_method  

In [7]:
numeric_data = df.select_dtypes(include=['number'])
skweed = []
for i in numeric_data:
    print(f"{i} skewness: {df[i].skew()}")
    if(df[i].skew() >0.5 or df[i].skew()<-0.5):
        skweed.append(i)


age skewness: 0.1621864486741778
number_of_dependents skewness: 2.1099319807722305
number_of_referrals skewness: 1.4460596247412694
tenure_in_months skewness: 0.24054261407486305
avg_monthly_long_distance_charges skewness: 0.04917589933841267
avg_monthly_gb_download skewness: 1.2165839031452528
monthly_charge skewness: -0.27539383432523457
total_refunds skewness: 4.328516700962804
total_extra_data_charges skewness: 4.091209238564625
total_long_distance_charges skewness: 1.2382819839547003
total_revenue skewness: 0.9194102679721669


In [8]:
skweed

['number_of_dependents',
 'number_of_referrals',
 'avg_monthly_gb_download',
 'total_refunds',
 'total_extra_data_charges',
 'total_long_distance_charges',
 'total_revenue']

In [9]:
x = df.drop(columns=['customer_status'])
y = df['customer_status']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = list(set(X_train.columns.tolist()) - set(categorical_features))

In [12]:
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [13]:
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [14]:
# outlier data
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [15]:
outlier_detection = numeric_data
for k, v in outlier_detection.items():
        outliers = find_outliers_IQR(v)
        perc = len(find_outliers_IQR(v)) * 100.0 / np.shape(outlier_detection)[0]
        print(f'Column :{k}')
        print(f"Number of Outliers = {len(outliers)}","||",f"Max Outlier Value = {outliers.max()}",
              "||", f"Min Outlier Value = {outliers.min()}", "||", f"Percentage of Outliers = {perc:.2f}%")
        print("\n")

Column :age
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :number_of_dependents
Number of Outliers = 1627 || Max Outlier Value = 9 || Min Outlier Value = 1 || Percentage of Outliers = 23.10%


Column :number_of_referrals
Number of Outliers = 676 || Max Outlier Value = 11 || Min Outlier Value = 8 || Percentage of Outliers = 9.60%


Column :tenure_in_months
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_long_distance_charges
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_gb_download
Number of Outliers = 362 || Max Outlier Value = 85.0 || Min Outlier Value = 69.0 || Percentage of Outliers = 5.14%


Column :monthly_charge
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :total_r

In [16]:
def handle_outliers(X_train, X_test, columns):
    for col in columns:
        Q1 = np.percentile(X_train[col], 25)
        Q3 = np.percentile(X_train[col], 75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Handling outliers in X_train
        X_train[col] = np.where(X_train[col] < lower_bound, lower_bound, X_train[col])
        X_train[col] = np.where(X_train[col] > upper_bound, upper_bound, X_train[col])

        # Handling outliers in X_test based on X_train IQR
        X_test[col] = np.where(X_test[col] < lower_bound, lower_bound, X_test[col])
        X_test[col] = np.where(X_test[col] > upper_bound, upper_bound, X_test[col])

    return X_train, X_test


# Define the columns to apply outlier handling
columns_to_handle = ['avg_monthly_gb_download', 'total_revenue', 'number_of_dependents',
                     'total_refunds', 'total_extra_data_charges','number_of_referrals']

# Apply the function to the specified columns
X_train, X_test = handle_outliers(X_train, X_test, columns_to_handle)

In [17]:
pt = PowerTransformer(method='yeo-johnson')
X_train[skweed] = pt.fit_transform(X_train[skweed])
X_test[skweed] = pt.transform(X_test[skweed])
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [18]:
numeric_data = df.select_dtypes(include=['number'])
for i in numeric_data:
    print(f"{i} skewness: {X_train[i].skew()}")


age skewness: 0.17485867050241055
number_of_dependents skewness: 0.0
number_of_referrals skewness: 0.43356660739295155
tenure_in_months skewness: 0.23985561011067047
avg_monthly_long_distance_charges skewness: 0.043319959016210864
avg_monthly_gb_download skewness: -0.18569378227548378
monthly_charge skewness: -0.27145577214589467
total_refunds skewness: 0.0
total_extra_data_charges skewness: 0.0
total_long_distance_charges skewness: -0.21815043949747898
total_revenue skewness: -0.15747997817515846


In [19]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd

# 1) all categorical cols from train set
all_cat_cols = categorical_features

# 2) keep these as ordinal (if present)
ordinal_cols = [c for c in ["contract", "offer"] if c in all_cat_cols]

# 3) all remaining categorical -> onehot
onehot_cols = [c for c in all_cat_cols if c not in ordinal_cols]

print("All categorical:", all_cat_cols)
print("Ordinal:", ordinal_cols)
print("OneHot:", onehot_cols)

# 4) normalize text
for c in all_cat_cols:
    X_train[c] = X_train[c].astype(str).str.strip().str.lower()
    X_test[c] = X_test[c].astype(str).str.strip().str.lower()



All categorical: ['gender', 'married', 'city', 'offer', 'phone_service', 'multiple_lines', 'internet_service', 'internet_type', 'online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data', 'contract', 'paperless_billing', 'payment_method']
Ordinal: ['contract', 'offer']
OneHot: ['gender', 'married', 'city', 'phone_service', 'multiple_lines', 'internet_service', 'internet_type', 'online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data', 'paperless_billing', 'payment_method']


In [20]:
ord_categories = []
for c in ordinal_cols:
    if c == "contract":
        ord_categories.append(["month-to-month", "one year", "two year"])
    elif c == "offer":
        ord_categories.append(["no offer", "offer a", "offer b", "offer c", "offer d", "offer e"])
    else:
        ord_categories.append(sorted(X_train[c].dropna().unique().tolist()))

oe = OrdinalEncoder(
    categories=ord_categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

if ordinal_cols:
    X_train[ordinal_cols] = oe.fit_transform(X_train[ordinal_cols])
    X_test[ordinal_cols] = oe.transform(X_test[ordinal_cols])

In [21]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

if onehot_cols:
    X_train_ohe = ohe.fit_transform(X_train[onehot_cols])
    X_test_ohe = ohe.transform(X_test[onehot_cols])

    ohe_cols = ohe.get_feature_names_out(onehot_cols)

    X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe_cols, index=X_train.index)
    X_test_ohe_df = pd.DataFrame(X_test_ohe, columns=ohe_cols, index=X_test.index)

    X_train = pd.concat([X_train.drop(columns=onehot_cols), X_train_ohe_df], axis=1)
    X_test = pd.concat([X_test.drop(columns=onehot_cols), X_test_ohe_df], axis=1)

In [22]:
sc = StandardScaler()
X_train[numerical_features] = sc.fit_transform(X_train[numerical_features])
X_test[numerical_features] = sc.transform(X_test[numerical_features])

In [23]:
y_train.value_counts()

customer_status
Stayed     3781
Churned    1496
Joined      357
Name: count, dtype: int64

In [24]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [25]:
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train,y_train)

In [26]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)
# ...existing code...

In [27]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ...existing code...

Accuracy: 0.8261178140525195
              precision    recall  f1-score   support

           0       0.70      0.61      0.65       373
           1       0.67      0.63      0.65        97
           2       0.88      0.93      0.91       939

    accuracy                           0.83      1409
   macro avg       0.75      0.72      0.74      1409
weighted avg       0.82      0.83      0.82      1409



In [28]:
print("Accuracy      :", accuracy_score(y_test, y_pred))
print("F1 Macro      :", f1_score(y_test, y_pred, average="macro"))
print("F1 Weighted   :", f1_score(y_test, y_pred, average="weighted"))

# ROC-AUC (binary + multiclass handling)
if hasattr(rf, "predict_proba"):
    y_proba = rf.predict_proba(X_test)
    n_classes = len(np.unique(y_test))

    if n_classes == 2:
        # binary case
        print("ROC-AUC       :", roc_auc_score(y_test, y_proba[:, 1]))
    else:
        # multiclass case
        print("ROC-AUC (OVR, macro):", roc_auc_score(
            y_test, y_proba, multi_class="ovr", average="macro"
        ))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# ...existing code...

Accuracy      : 0.8261178140525195
F1 Macro      : 0.7360094668672899
F1 Weighted   : 0.8212699573215498
ROC-AUC (OVR, macro): 0.9210147881719305

Confusion Matrix:
 [[228  30 115]
 [ 33  61   3]
 [ 64   0 875]]

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.61      0.65       373
           1       0.67      0.63      0.65        97
           2       0.88      0.93      0.91       939

    accuracy                           0.83      1409
   macro avg       0.75      0.72      0.74      1409
weighted avg       0.82      0.83      0.82      1409



In [31]:
import joblib,pickle
from pathlib import Path

MODEL_DIR = Path("../p_models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
feature_names = X_train.columns.tolist()

with open('../p_models/feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

with open(MODEL_DIR / "churn_model.pkl", 'wb') as f:
    pickle.dump(rf, f)

with open(MODEL_DIR / "imputer.pkl", 'wb') as f:
    pickle.dump(imputer, f)

with open(MODEL_DIR / "ordinal_encoder.pkl", 'wb') as f:
    pickle.dump(oe, f)

with open(MODEL_DIR / "onehot_encoder.pkl", 'wb') as f:
    pickle.dump(ohe, f)

with open(MODEL_DIR / "standard_scaler.pkl", 'wb') as f:
    pickle.dump(sc, f)

with open(MODEL_DIR / "label_encoder.pkl", 'wb') as f:
    pickle.dump(le, f)

with open(MODEL_DIR / "power_transformer.pkl", 'wb') as f:
    pickle.dump(pt, f)
print("Saved in:", MODEL_DIR.resolve())

Saved in: D:\Dekstop\Churn Prediction\p_models
