In [20]:
# For Data Manipulation
import numpy as np
import pandas as pd
import pickle

# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.visualization import plot_optimization_history , plot_parallel_coordinate , plot_param_importances , plot_slice

# For ML Model
from sklearn.model_selection import train_test_split , StratifiedKFold, cross_val_predict , cross_val_score
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report , f1_score , confusion_matrix, roc_auc_score

# For ML Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from xgboost import XGBClassifier

In [21]:
telc = pd.read_csv(r"C:\Users\iaman\ML\PROJECTS\Churn Rate Analysis\Dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv")

telc.shape

(7043, 21)

In [22]:
telc.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1313,6599-RCLCJ,Male,0,Yes,No,47,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,109.55,5124.55,Yes
5532,8174-LNWMW,Female,0,No,No,31,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),20.9,689.35,No
1178,0125-LZQXK,Male,0,No,No,15,Yes,No,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,101.35,1553.95,Yes
6146,4009-ALQFH,Female,0,No,No,25,Yes,No,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.5,2369.05,Yes
5724,8042-JVNFH,Male,0,No,No,1,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,71.35,71.35,Yes


In [23]:
telc['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [24]:
print(f'Checking for Missing Values : {telc.isna().sum().sum()}')
print(f'Checking for Duplicated Values : {telc.duplicated().sum()}')

Checking for Missing Values : 0
Checking for Duplicated Values : 0


In [25]:
telc['TotalCharges'] = pd.to_numeric(telc['TotalCharges'] , errors = 'coerce' )

In [26]:
telc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [27]:
num_feature = telc.select_dtypes( include = ['int' , 'float'] ).columns.tolist()
cat_feature = telc.select_dtypes( include = ['object'] ).columns.tolist()

In [28]:
num_feature

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [29]:
cat_feature

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [35]:
le = LabelEncoder()

X = telc.drop( columns = ['Churn' , 'customerID'] )
y = le.fit_transform(telc['Churn'])

X_train , X_test , y_train , y_test = train_test_split( X , y , test_size = 0.2 , random_state= 42 )

In [36]:
num_features = X.select_dtypes(include = ['int' , 'float']).columns.tolist()
cat_features = X.select_dtypes(include = ['object']).columns.tolist()

In [37]:
numeric_transformer = Pipeline([
    ('imputer', IterativeImputer()),
    ('scaler', RobustScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('encoder' , OneHotEncoder(handle_unknown='ignore') , cat_features)
] , remainder='passthrough')

# Objective Function

In [38]:
def objective(trials):

    classifier_name = trials.suggest_categorical('clf' , ['LogisticRegression' , 'SVC' , 'DecisionTreeClassifier' , 'RandomForestClassifier' , 'XGBClassifier'])

    if classifier_name == 'LogisticRegression':
        C = trials.suggest_float('C' , 1e-4 , 10.0 , log = True)
        solver = trials.suggest_categorical(
            'solver' , ['liblinear' , 'lbfgs']
        )

        lr = LogisticRegression( C = C  , solver = solver)

        model = Pipeline(steps=[
            ('preprocessor' , preprocessor),
            ('smote' , SMOTE(random_state=42)),
            ('clf' , lr)
        ])

    elif classifier_name == 'SVC':
        C = trials.suggest_float('C', 1e-3 , 100.0 , log = True)
        kernel = trials.suggest_categorical('kernel' , ['linear' , 'rbf'])

        if kernel == 'rbf':
            gamma = trials.suggest_float('svc_gamma' , 1e-4 , 1.0 , log = True)
        else :
            gamma = 'scale'

        svc = SVC(
            C = C,
            kernel = kernel,
            gamma = gamma,
            probability = True,
            random_state = 42
        )

        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('clf', svc)
        ])

    elif classifier_name == 'DecisionTreeClassifier':
        max_depth = trials.suggest_int("max_depth", 2, 30)
        min_samples_split = trials.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trials.suggest_int("min_samples_leaf", 1, 20)
        criterion = trials.suggest_categorical(
            "criterion", ["gini", "entropy"]
        )

        dt = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            criterion=criterion,
            random_state=42
        )

        model = Pipeline([
            ("preprocessor", preprocessor),
            ("clf", dt)
        ])

    elif classifier_name == 'RandomForestClassifier':
        n_estimators = trials.suggest_int("n_estimators", 100, 500)
        max_depth = trials.suggest_int("max_depth", 3, 30)
        min_samples_split = trials.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trials.suggest_int("min_samples_leaf", 1, 20)
        max_features = trials.suggest_categorical(
            "max_features", ["sqrt", "log2", None]
        )

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42,
            n_jobs=-1
        )

        model = Pipeline([
            ("preprocessor", preprocessor),
            ("clf", rf)
        ])

    elif classifier_name == 'XGBClassifier':
        params = {
            'n_estimators' : trials.suggest_int('n_estimators' , 100 , 500),
            'max_depth' : trials.suggest_int('max_depth' , 3 , 10),
            'learning_rate' : trials.suggest_float('learning_rate' , 0.01 , 0.3 , log = True),
            'subsample' : trials.suggest_float('subsample' , 0.6 , 1.0),
            'colsample_bytree' : trials.suggest_float('colsample_bytree', 0.6 , 1.0),
            'gamma' : trials.suggest_float('xgb_gamma' , 0 , 5),
            'reg_alpha' : trials.suggest_float('reg_alpha' , 0 , 5),
            'reg_lambda' : trials.suggest_float('reg_lambda' , 1e-4 , 10.0 , log = True),
        }

        xgb = XGBClassifier(
            **params,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            random_state = 42,
            n_jobs = -1
        )

        model = Pipeline(steps=[
            ('preprocessor' , preprocessor),
            ('smote' , SMOTE(random_state= 42)),
            ('clf' , xgb)
        ])


    score = cross_val_score( model , X_train , y_train , cv = 5 , scoring = 'f1' ).mean()

    return score


In [39]:
study =  optuna.create_study( direction = 'maximize' , sampler = optuna.samplers.TPESampler())
study.optimize(objective , n_trials = 10 , show_progress_bar=True)

[I 2026-02-04 15:25:12,155] A new study created in memory with name: no-name-dc6f2641-79e6-45df-917e-bcaf95e3ec2f


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-02-04 15:25:18,946] Trial 0 finished with value: 0.5712257930647118 and parameters: {'clf': 'RandomForestClassifier', 'n_estimators': 112, 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5712257930647118.
[I 2026-02-04 15:25:25,131] Trial 1 finished with value: 0.6253847800410791 and parameters: {'clf': 'XGBClassifier', 'n_estimators': 251, 'max_depth': 7, 'learning_rate': 0.10371975661162125, 'subsample': 0.6595879452626963, 'colsample_bytree': 0.6667486411732371, 'xgb_gamma': 4.704459068578785, 'reg_alpha': 3.8794771079178574, 'reg_lambda': 5.447355105840797}. Best is trial 1 with value: 0.6253847800410791.
[I 2026-02-04 15:28:46,230] Trial 2 finished with value: 0.6167318104825517 and parameters: {'clf': 'SVC', 'C': 0.036598320672164626, 'kernel': 'rbf', 'svc_gamma': 0.1487047056109632}. Best is trial 1 with value: 0.6253847800410791.
[I 2026-02-04 15:28:48,174] Trial 3 finished with value: 0.513890350962

In [41]:
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

Best trial parameters: {'clf': 'XGBClassifier', 'n_estimators': 398, 'max_depth': 9, 'learning_rate': 0.03307624592744446, 'subsample': 0.8369969896920051, 'colsample_bytree': 0.6128530082309902, 'xgb_gamma': 4.645413142559386, 'reg_alpha': 4.354163812001783, 'reg_lambda': 0.832490651523361}
Best trial accuracy: 0.6293610550319381


In [42]:
best_params = study.best_trial.params.copy()
best_params.pop('clf')

'XGBClassifier'

In [18]:
model_lr = XGBClassifier(**best_params , random_state = 42)

best_model = Pipeline(steps=[
            ('preprocessor' , preprocessor),
            ('smote' , SMOTE(random_state= 42)),
            ('clf' , model_lr)
        ])

best_model.fit(X_train, y_train)

with open(r"C:\Users\iaman\ML\PROJECTS\Churn Rate Analysis\Webpage\model\churn_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [19]:
y_pred = best_model.predict(X_test)

print(f'Classification Report : \n{classification_report(y_test,y_pred)}')

Classification Report : 
              precision    recall  f1-score   support

           0       0.92      0.73      0.81      1036
           1       0.52      0.83      0.64       373

    accuracy                           0.76      1409
   macro avg       0.72      0.78      0.73      1409
weighted avg       0.82      0.76      0.77      1409



In [40]:
best_model.predict({
  "gender": "Female",
  "SeniorCitizen": 0,
  "Partner": "Yes",
  "Dependents": "Yes",
  "tenure": 1,
  "PhoneService": "Yes",
  "MultipleLines": "Yes",
  "InternetService": "DSL",
  "OnlineSecurity": "Yes",
  "OnlineBackup": "Yes",
  "DeviceProtection": "Yes",
  "TechSupport": "Yes",
  "StreamingTV": "Yes",
  "StreamingMovies": "Yes",
  "Contract": "Month-to-month",
  "PaperlessBilling": "Yes",
  "PaymentMethod": "Electronic check",
  "MonthlyCharges": 28.95,
  "TotalCharges": 28.95
})

ValueError: Expected 2D array, got scalar array instead:
array={'gender': 'Female', 'SeniorCitizen': 0, 'Partner': 'Yes', 'Dependents': 'Yes', 'tenure': 0, 'PhoneService': 'Yes', 'MultipleLines': 'Yes', 'InternetService': 'DSL', 'OnlineSecurity': 'Yes', 'OnlineBackup': 'Yes', 'DeviceProtection': 'Yes', 'TechSupport': 'Yes', 'StreamingTV': 'Yes', 'StreamingMovies': 'Yes', 'Contract': 'Month-to-month', 'PaperlessBilling': 'Yes', 'PaymentMethod': 'Electronic check', 'MonthlyCharges': 0, 'TotalCharges': 0}.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.