## Preprocessing

In [18]:
import os

import numpy as np
import pandas as pd

In [19]:
# Location of the churn dataset. Setting the CHURN_DATA_PATH environment
# variable overrides the default, so the notebook can run on machines other
# than the original author's; when it is unset the original path is used.
file_path = os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\Administrator\OneDrive\Desktop\Machine_learning_cs303\Practicals\Experiment_6\Dataset\Customer Churn.csv",
)

# Load the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [20]:
# Dataset summary: index range, column names, non-null counts and dtypes

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call  Failure            3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription  Length     3150 non-null   int64  
 3   Charge  Amount           3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

In [21]:
# Count the missing values in each column
null_counts = data.isnull().sum()
null_counts

Call  Failure              0
Complains                  0
Subscription  Length       0
Charge  Amount             0
Seconds of Use             0
Frequency of use           0
Frequency of SMS           0
Distinct Called Numbers    0
Age Group                  0
Tariff Plan                0
Status                     0
Age                        0
Customer Value             0
Churn                      0
dtype: int64

The data is clean: there are no missing values, and every field is either an integer or a float.

In [22]:
# Feature names: every column except the last one (the target)
features = list(data.columns[:-1])

In [23]:
# Build the preprocessing pipeline for the numeric feature columns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Numeric pipeline: fill missing values with the column mean, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("Standard scaling ", StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Route every column listed in `features` through the numeric pipeline;
# any column not listed is passed through unchanged.
numeric_transformer = ("numeric", numeric_pipe, features)
preprocessing = ColumnTransformer(
    transformers=[numeric_transformer],
    remainder="passthrough",
)

In [24]:
# Split predictors and target, then hold out 15% of the rows as a test set
from sklearn.model_selection import train_test_split

# Predictors are every column but the last; the target is the last column.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# NOTE(review): the split is not stratified; with an imbalanced target,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)


## Baseline Model

In [25]:
from sklearn.linear_model import LogisticRegression

# Baseline: the shared preprocessing step followed by logistic regression.
baseline_steps = [
    ("preprocessing", preprocessing),
    ("model", LogisticRegression(max_iter=500)),
]
lr_model = Pipeline(steps=baseline_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed output.
lr_model.fit(X_train, y_train)

**Evaluation metrics (ROC-AUC, confusion matrix, F1 score)**

In [26]:
from sklearn.metrics import roc_auc_score , f1_score , confusion_matrix , roc_curve

# Hard class predictions of the baseline pipeline on the held-out split
y_pred = lr_model.predict(X_test)

# Compute both metrics first, then report them.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_f1 = f1_score(y_test, y_pred)

print(f"Confusion Metric:\n{baseline_cm}")
print(f"\nF1 score:\n{baseline_f1}")

Confusion Metric:
[[386   7]
 [ 49  31]]

F1 score:
0.5254237288135594


**ROC-AUC Curve**

In [27]:
import plotly.graph_objects as go

# ROC analysis needs continuous scores, not hard 0/1 labels: with labels the
# curve collapses to a single operating point and the reported area is not
# the true AUC. Use the predicted probability of the positive class instead.
y_score = lr_model.predict_proba(X_test)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)

# ROC curve of the baseline logistic regression
trace0 = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'Logistic Regression (Area = {auc:.2f})'
)

# Diagonal reference line: the expected curve of a random classifier
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Named `traces` rather than `data` so the DataFrame loaded at the top of the
# notebook is not overwritten (earlier cells would otherwise fail on re-run).
traces = [trace0, trace2]

# Square layout so both rate axes share the same scale on screen
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Assemble and render the figure
fig = go.Figure(data=traces, layout=layout)
fig.show()


**Cross Validation**

In [28]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the baseline pipeline on the training split
scores = cross_val_score(
    lr_model,
    X_train,
    y_train,
    cv=5,
    scoring="accuracy",
)

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores: [0.8880597  0.88992537 0.90093458 0.89719626 0.90280374]
Mean accuracy: 0.8957839308132236


## Hyperparameter Tuning

In [31]:
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform

# Random forest candidate: same preprocessing as the baseline, followed by the
# forest. The forest is seeded so the tuned scores are reproducible across
# runs (the randomized search below is already seeded with the same value).
rf_model = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("model", RandomForestClassifier(random_state=42))
])

# Search space for the random forest (sampled by the randomized search)
param_dist_rf = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10, 15, None],
    'model__bootstrap': [True, False]
}

# Search space for logistic regression (searched exhaustively: 5 x 2 candidates)
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['liblinear', 'lbfgs']
}

# Exhaustive grid search over the logistic regression pipeline
lr_grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Randomized search: 10 parameter settings drawn from the random forest space
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_rf,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run both searches on the training split only; the test split stays held out.
lr_grid_search.fit(X_train, y_train)
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [38]:
# Held-out accuracy of the best pipeline found by each search
lr_test_accuracy = lr_grid_search.best_estimator_.score(X_test, y_test)
rf_test_accuracy = rf_random_search.best_estimator_.score(X_test, y_test)

# Logistic regression: best parameters, best CV score, test accuracy
lr_best_params = lr_grid_search.best_params_
lr_best_cv = lr_grid_search.best_score_
print(f"Best Parameters For LogisticRegression:-\n{lr_best_params}")
print(f"Best CV score: {lr_best_cv}")
print(f"Test Accuracy: {lr_test_accuracy}")
print()

# Random forest: same three figures
rf_best_params = rf_random_search.best_params_
rf_best_cv = rf_random_search.best_score_
print(f"Best Parameters For RandomForest:-\n{rf_best_params}")
print(f"Best CV score: {rf_best_cv}")
print(f"Test Accuracy: {rf_test_accuracy}")

Best Parameters For LogisticRegression:-
{'model__C': 0.01, 'model__solver': 'liblinear'}
Best CV score: 0.9017589621983539
Test Accuracy: 0.8900634249471459

Best Parameters For RandomForest:-
{'model__n_estimators': 300, 'model__max_depth': None, 'model__bootstrap': True}
Best CV score: 0.9607755614451108
Test Accuracy: 0.9408033826638478


In [45]:
# Get tuned pipelines
best_rf = rf_random_search.best_estimator_
best_lr = lr_grid_search.best_estimator_

# Already fitted during search — ready to predict
y_pred_rf = best_rf.predict(X_test)
y_pred_lr = best_lr.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest F1:", f1_score(y_test, y_pred_rf))
print("Random Forest ROC-AUC:", roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1]))

print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression F1:", f1_score(y_test, y_pred_lr))
print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, best_lr.predict_proba(X_test)[:, 1]))

print("\nClassification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("\nClassification Report (Logistic Regression):\n", classification_report(y_test, y_pred_lr))


Random Forest Accuracy: 0.9408033826638478
Random Forest F1: 0.8133333333333334
Random Forest ROC-AUC: 0.9792461832061069

Logistic Regression Accuracy: 0.8900634249471459
Logistic Regression F1: 0.5357142857142857
Logistic Regression ROC-AUC: 0.9155375318066157

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       393
           1       0.87      0.76      0.81        80

    accuracy                           0.94       473
   macro avg       0.91      0.87      0.89       473
weighted avg       0.94      0.94      0.94       473


Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.89      0.99      0.94       393
           1       0.94      0.38      0.54        80

    accuracy                           0.89       473
   macro avg       0.91      0.68      0.74       473
weighted avg       0.90      0.89      0.87       473

**ROC-AUC Curve**

In [None]:
import plotly.graph_objects as go

# ROC curves must be built from continuous scores. Hard 0/1 predictions give
# a single operating point per model and an area that disagrees with the
# ROC-AUC values reported from predict_proba in the evaluation cell.
lr_score = best_lr.predict_proba(X_test)[:, 1]
rf_score = best_rf.predict_proba(X_test)[:, 1]

lr_fpr, lr_tpr, lr_threshold = roc_curve(y_test, lr_score)
lr_auc = roc_auc_score(y_test, lr_score)

rf_fpr, rf_tpr, rf_threshold = roc_curve(y_test, rf_score)
rf_auc = roc_auc_score(y_test, rf_score)

# ROC curve of the tuned logistic regression
trace0 = go.Scatter(
    x=lr_fpr,
    y=lr_tpr,
    mode='lines',
    name=f'Logistic Regression (Area = {lr_auc:.2f})'
)

# ROC curve of the tuned random forest
trace1 = go.Scatter(
    x=rf_fpr,
    y=rf_tpr,
    mode='lines',
    name=f'RandomForest (Area = {rf_auc:.2f})'
)

# Diagonal reference line: the expected curve of a random classifier
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Named `traces` rather than `data` so the DataFrame loaded at the top of the
# notebook is not overwritten (earlier cells would otherwise fail on re-run).
traces = [trace0, trace1, trace2]

# Square layout so both rate axes share the same scale on screen
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Assemble and render the figure
fig = go.Figure(data=traces, layout=layout)
fig.show()