In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from scipy.stats import randint, uniform
import math
import warnings
from imblearn.combine import SMOTEENN,SMOTETomek
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder, PowerTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
import json
from datetime import datetime

from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
data_df = pd.read_csv(f'{Path.cwd().parent}/data/processed/processed_data.csv')

In [3]:
data_df['Churn'].value_counts()

Churn
0    5163
1    1869
Name: count, dtype: int64

In [4]:
data_df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,0,1,2,3,4,5,6,7,8,9
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU,9305-CDSKC,1452-KIOVK,6713-OKOMC,7892-POOKP,6388-TABGU
gender,1,0,0,0,1,1,0,1,1,0
SeniorCitizen,1,1,1,1,1,1,1,1,1,1
Partner,1,0,0,0,0,0,0,0,1,0
Dependents,1,1,1,1,1,1,0,1,1,0
tenure,1,34,2,45,2,8,22,10,28,62
PhoneService,1,0,0,1,0,0,0,1,0,0
MultipleLines,No phone service,No,No,No phone service,No,Yes,Yes,No phone service,Yes,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic,Fiber optic,Fiber optic,DSL,Fiber optic,DSL


In [5]:
X = data_df.drop(['customerID', 'Churn','Unnamed: 0'], axis = 1)
y = data_df['Churn']

In [6]:
X_train,  X_test, y_train,y_test = train_test_split(X,y, random_state = 42, test_size =0.2,stratify=y)

In [7]:
multi_class_var = ['MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaymentMethod']

In [8]:
def validation_monitor(pred,test):
    model_test_accuracy = accuracy_score(pred,test)
    model_test_precision = precision_score(pred,test)
    model_test_f1 = f1_score(pred,test)
    model_test_recall = recall_score(pred,test)
    model_test_rocauc_score = roc_auc_score(pred,test)

    return model_test_accuracy,model_test_precision,model_test_f1,model_test_recall,model_test_rocauc_score


In [9]:
preprocessor = ColumnTransformer([
    ('onehotencoding',OneHotEncoder(),multi_class_var)
])

In [10]:
model_output = {}

In [11]:
models = [LogisticRegression(),RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(),KNeighborsClassifier(),
          DecisionTreeClassifier(),XGBClassifier()]

In [12]:
scalar = StandardScaler()

In [13]:
param_distributions = {
    
    'LogisticRegression': {
        'model__C': uniform(0.001, 100),
        'model__penalty': ['l1', 'l2', 'elasticnet'],
        'model__solver': ['liblinear', 'saga'],
        'model__class_weight': ['balanced', None],
        'model__max_iter': [200, 500,1000]
    },
    
    'RandomForestClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__max_depth': [10, 20, 30, None],
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        'model__max_features': ['sqrt', 'log2', None],
        'model__class_weight': ['balanced', 'balanced_subsample', None]
    },
    
    'AdaBoostClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__learning_rate': uniform(0.01, 1.0)
    },
    
    'GradientBoostingClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__max_depth': randint(3, 8),
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        'model__subsample': uniform(0.6, 0.4),
        'model__max_features': ['sqrt', 'log2', None]
    },
    
    'KNeighborsClassifier': {
        'model__n_neighbors': randint(3, 21),
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['euclidean', 'manhattan', 'minkowski'],
        'model__p': [1, 2],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    
    'DecisionTreeClassifier': {
        'model__max_depth': [5, 10, 15, 20, None],
        'model__min_samples_split': randint(2, 50),
        'model__min_samples_leaf': randint(1, 20),
        'model__max_features': ['sqrt', 'log2', None],
        'model__criterion': ['gini', 'entropy'],
        'model__splitter': ['best', 'random'],
        'model__class_weight': ['balanced', None]
    },

    'XGBClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__max_depth': randint(3, 10),                # Random int between 3-10
        'model__learning_rate': uniform(0.01, 0.3),        # Random float 0.01-0.31
        'model__subsample': uniform(0.6, 0.4),             # Random float 0.6-1.0
        'model__colsample_bytree': uniform(0.6, 0.4),      # Random float 0.6-1.0
        'model__scale_pos_weight': [1, 3, 5, 10],           # For imbalanced data
        'model__min_child_weight': randint(1, 10)
    }

}

In [30]:
best_overall_f1 = -1
best_overall_pipeline = None
best_overall_name = ""

for model in models:

    model_name = model.__class__.__name__

    pipeline = ImbPipeline([
        ('pre', preprocessor),
        ('smote', SMOTEENN(random_state=42)),
        ('scalar',scalar),
        ('model',model)
    ])

    search = RandomizedSearchCV(
        n_iter = 100,
        estimator = pipeline,
        param_distributions=param_distributions[model_name],
        cv = 5,
        scoring = 'f1',
        n_jobs=-1,
        random_state = 42
    )    

    search.fit(X_train, y_train)

    print(f"Completed training for {model_name}")

    if search.best_score_ > best_overall_f1:
        best_overall_f1 = search.best_score_
        best_model_overall = search.best_estimator_
        best_model_parameter = search.best_params_
        best_overall_name = model_name

print(f'---------------Best Model Attributes----------------')
print(f'Best Model : {best_model_overall}')
print(f'f1 : {best_model_parameter}')
print(f'recall : {best_overall_f1}')





Completed training for LogisticRegression
Completed training for RandomForestClassifier
Completed training for AdaBoostClassifier
Completed training for GradientBoostingClassifier
Completed training for KNeighborsClassifier
Completed training for DecisionTreeClassifier
Completed training for XGBClassifier
---------------Best Model Attributes----------------
Best Model : Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('onehotencoding',
                                                  OneHotEncoder(),
                                                  ['MultipleLines',
                                                   'InternetService',
                                                   'OnlineSecurity',
                                                   'OnlineBackup',
                                                   'DeviceProtection',
                                                   'TechSupport', 'StreamingTV',
                                          

In [31]:
y_pred = best_model_overall.predict(X_test)

In [45]:
model_test_accuracy = round(validation_monitor(y_pred,y_test)[0],2)
model_test_precision = round(validation_monitor(y_pred,y_test)[1],2)
model_test_f1 = round(validation_monitor(y_pred,y_test)[2],2)
model_test_recall = round(validation_monitor(y_pred,y_test)[3],2)

print(f"Model test accuracy:{model_test_accuracy}")
print(f"Model test precision:{model_test_precision}")
print(f"Model test f1:{model_test_f1}")
print(f"Model test recall:{model_test_recall}")



Model test accuracy:0.73
Model test precision:0.75
Model test f1:0.6
Model test recall:0.5


In [33]:
print("Test Set Performance:")
print(classification_report(y_test, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred):.3f}")


Test Set Performance:
              precision    recall  f1-score   support

           0       0.89      0.73      0.80      1033
           1       0.50      0.75      0.60       374

    accuracy                           0.73      1407
   macro avg       0.69      0.74      0.70      1407
weighted avg       0.79      0.73      0.75      1407

ROC-AUC: 0.738


In [42]:
print(search.best_score_)

0.6097196331012055


In [46]:
model_metadata = {
    "model_name":"Streamflix_Churn_Analysis",
    "model_type": best_model_overall.named_steps['model'].__class__.__name__,
    "training_date":datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "best_parameters":best_model_parameter,
    "f1_score":round(best_overall_f1,2),
    "feature_name":X_train.columns.tolist(),
    "n_features": len(X_train.columns),
    "training_samples": len(X_train),
    "test_samples":len(X_test),
    "class_distribution":{'train': y_train.value_counts().to_dict(),'test': y_test.value_counts().to_dict()},
    'test_performance': {
        'accuracy': model_test_accuracy,
        'precision': model_test_precision,
        'recall':  model_test_recall,
        'f1_score': model_test_f1
    },
    'python_version': '3.14'
}



In [47]:
model_metadata

{'model_name': 'Streamflix_Churn_Analysis',
 'model_type': 'AdaBoostClassifier',
 'training_date': '2026-02-22 14:12:45',
 'best_parameters': {'model__learning_rate': np.float64(0.03306242504141576),
  'model__n_estimators': 158},
 'f1_score': np.float64(0.61),
 'feature_name': ['gender',
  'SeniorCitizen',
  'Partner',
  'Dependents',
  'tenure',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'MonthlyCharges',
  'TotalCharges'],
 'n_features': 19,
 'training_samples': 5625,
 'test_samples': 1407,
 'class_distribution': {'train': {0: 4130, 1: 1495},
  'test': {0: 1033, 1: 374}},
 'test_performance': {'accuracy': 0.73,
  'precision': 0.75,
  'recall': 0.5,
  'f1_score': 0.6},
 'python_version': '3.14'}

In [55]:
model_location = f"{Path.cwd().parent}/models/best_model.pkl"
joblib.dump(best_model_overall, model_location)

model_metadata_location = f"{Path.cwd().parent}/models/best_model_metadata.json"
with open(model_metadata_location, 'w') as f:
    json.dump(model_metadata, f, indent=4)