In [1]:
import numpy as np
import pandas as pd

## Preprocessing

In [2]:
# Read the customer-churn CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific location; the cell only
# runs where exactly this path exists.
file_path = r"C:\Users\Administrator\OneDrive\Desktop\Machine_learning_cs303\Practicals\Experiment_6\Dataset\Customer Churn.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
data.head(n=5)


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [3]:
# Print the frame summary: index range, column names, non-null counts, dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call  Failure            3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription  Length     3150 non-null   int64  
 3   Charge  Amount           3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

In [5]:
# Count the missing entries in each column.
data.isna().sum()

Call  Failure              0
Complains                  0
Subscription  Length       0
Charge  Amount             0
Seconds of Use             0
Frequency of use           0
Frequency of SMS           0
Distinct Called Numbers    0
Age Group                  0
Tariff Plan                0
Status                     0
Age                        0
Customer Value             0
Churn                      0
dtype: int64

In [36]:
# Feature names: every column label except the last one (the `Churn` target).
columns = list(data.columns[:-1])


13

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The last column is the target and every
# column before it is a feature; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("Scaling", StandardScaler()),
])

# Column transformation: run that pipeline on the columns listed in `columns`;
# any column not listed is passed through unchanged.
preprocessing = ColumnTransformer(
    [("Numeric", numerical_pipeline, columns)],
    remainder="passthrough",
)

## Baseline Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

# Baseline: the preprocessing step feeding a LogisticRegression left at its
# default settings.
model = Pipeline([
    ("preprocessing", preprocessing),
    ("model", LogisticRegression()),
])

# Fit on the training split; the fitted pipeline is the cell's output.
model.fit(X_train, y_train)

In [45]:
from sklearn.metrics import confusion_matrix ,accuracy_score
# Score the baseline on the held-out split.
y_pred = model.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {baseline_accuracy}")
print(f"Confusion matric:\n{baseline_confusion}")

Accuracy: 0.8698412698412699
Confusion matric:
[[504  16]
 [ 66  44]]


 **ROC - AUC Curve**

In [53]:
from sklearn.metrics import roc_curve
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt

# ROC curve of the baseline pipeline, rendered with plotly.

# Label-based arrays stay bound to their original names in case other cells
# still read `fpr`, `tpr` or `thresholds`.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# The plotted curve is built from positive-class probabilities. Hard 0/1
# predictions give roc_curve a single operating point, which draws as one
# kinked line instead of a curve.
y_score = model.predict_proba(X_test)[:, 1]
score_fpr, score_tpr, score_thresholds = roc_curve(y_test, y_score)


# Generate a trace for ROC curve
trace0 = go.Scatter(
    x=score_fpr,
    y=score_tpr,
    mode='lines',
    name='ROC curve'
)

# Only label every nth point to avoid cluttering
n = 10
indices = np.arange(len(score_thresholds)) % n == 0  # Choose indices where index mod n is 0

trace1 = go.Scatter(
    x=score_fpr[indices],
    y=score_tpr[indices],
    mode='markers+text',
    name='Threshold points',
    text=[f"Thr={thr:.2f}" for thr in score_thresholds[indices]],
    textposition='top center'
)


# Diagonal line
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Kept in its own name so the `data` DataFrame loaded earlier is not
# overwritten and the earlier cells stay re-runnable.
roc_traces = [trace0, trace1, trace2]

# Define layout with square aspect ratio
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=600,
    height=600,
    showlegend=False
)

# Define figure and add data
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()

<Figure size 1000x500 with 0 Axes>

## Cross Validation

In [54]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline pipeline on the training split.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)

print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", np.mean(scores))


Cross-validation scores: [0.90674603 0.88690476 0.90079365 0.90277778 0.90079365]
Mean CV Accuracy: 0.8996031746031745


## Hyperparameter Tuning

In [79]:
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import time 


# Random-forest counterpart of the baseline pipeline. The forest is seeded like
# the other stochastic steps in this notebook (train/test split, randomized
# search) so that re-running the notebook reproduces the same model.
rfmodel = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("model", RandomForestClassifier(random_state=42))
])

# Parameter grids. Keys carry the `model__` prefix so each value is routed to
# the pipeline step named "model".

# Search space for the logistic-regression pipeline.
lr_param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['liblinear', 'lbfgs'],
    'model__penalty': ['l2'],
    'model__max_iter': [500, 600, 800, 1100]
}

# Search space for the random-forest pipeline.
rf_param_grid = {
    'model__n_estimators': [100, 200, 500, 800, 1000],
    'model__max_depth': [10, 15, 20, 30]
}



In [82]:
# Settings common to both searches: F1 scoring, 20 sampled candidates,
# 5-fold CV, a fixed seed and all available cores.
shared_search_kwargs = dict(
    scoring='f1',
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Randomized search over the logistic-regression pipeline.
lr_random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=lr_param_grid,
    **shared_search_kwargs,
)

# Randomized search over the random-forest pipeline.
rf_random_search = RandomizedSearchCV(
    estimator=rfmodel,
    param_distributions=rf_param_grid,
    **shared_search_kwargs,
)

In [None]:
# Run both randomized searches on the training split.
lr_random_search.fit(X_train, y_train)
rf_random_search.fit(X_train, y_train)



Best Parameters: {'model__solver': 'lbfgs', 'model__penalty': 'l2', 'model__max_iter': 800, 'model__C': 100}
Best CV Score: 0.5994739584034345
Test Accuracy: 0.873015873015873
Best Parameters: {'model__n_estimators': 100, 'model__max_depth': 30}
Best CV Score: 0.8667586499030481
Test Accuracy: 0.9349206349206349


In [85]:
# Results: one report per search. The second heading starts with a newline so
# the two reports are separated by a blank line.
search_reports = (
    ("LogisticRegression", lr_random_search),
    ("\nRandomForestClassifier", rf_random_search),
)

for heading, search in search_reports:
    print(heading)
    print("Best Parameters:", search.best_params_)
    print("Best CV Score:", search.best_score_)
    print("Test Accuracy:", search.best_estimator_.score(X_test, y_test))

LogisticRegression
Best Parameters: {'model__solver': 'lbfgs', 'model__penalty': 'l2', 'model__max_iter': 800, 'model__C': 100}
Best CV Score: 0.5994739584034345
Test Accuracy: 0.873015873015873

RandomForestClassifier
Best Parameters: {'model__n_estimators': 100, 'model__max_depth': 30}
Best CV Score: 0.8667586499030481
Test Accuracy: 0.9349206349206349


In [92]:
from sklearn.metrics import roc_auc_score , confusion_matrix

# The searches already refit their best candidate on the whole training split,
# so `best_estimator_` is used as-is. Fitting it again is redundant and, for a
# randomized model, replaces the estimator the search reported on with a
# different one.
best_lr = lr_random_search.best_estimator_
best_rf = rf_random_search.best_estimator_

# Predict once per model and reuse the labels below and in later cells.
ylr_pred = best_lr.predict(X_test)
yrf_pred = best_rf.predict(X_test)

print(f"Best lr:Confusion Metric\n{confusion_matrix(y_test, ylr_pred)}")
print(f"Best rf:Confusion Metric\n{confusion_matrix(y_test, yrf_pred)}")


Best lr:Confusion Metric
[[503  17]
 [ 63  47]]
Best rf:Confusion Metric
[[508  12]
 [ 30  80]]


In [103]:
def _roc_traces(label, fpr_values, tpr_values, threshold_values, label_every=100):
    """Build the plotly traces for one model's ROC curve.

    Parameters
    ----------
    label : str
        Model name appended to both legend entries.
    fpr_values, tpr_values, threshold_values : array-like
        The three arrays returned by ``roc_curve`` for that model.
    label_every : int
        Annotate every ``label_every``-th threshold to avoid cluttering.

    Returns
    -------
    tuple
        ``(curve, markers)``: the ROC line and the annotated threshold points.
    """
    # The mask is sized from this curve's own thresholds, so it always matches
    # the arrays it indexes.
    marked = np.arange(len(threshold_values)) % label_every == 0

    curve = go.Scatter(
        x=fpr_values,
        y=tpr_values,
        mode='lines',
        name=f'ROC curve {label}'
    )
    markers = go.Scatter(
        x=fpr_values[marked],
        y=tpr_values[marked],
        mode='markers+text',
        name=f'Threshold points {label}',
        text=[f"Thr={thr:.2f}" for thr in threshold_values[marked]],
        textposition='top center'
    )
    return curve, markers


# Curves are computed from positive-class probabilities: hard 0/1 predictions
# would give roc_curve a single operating point per model.
# For RandomForest
rffpr, rftpr, rfthresholds = roc_curve(y_test, best_rf.predict_proba(X_test)[:, 1])
# For Logistic Regression
lrfpr, lrtpr, lrthresholds = roc_curve(y_test, best_lr.predict_proba(X_test)[:, 1])

trace0, trace1 = _roc_traces('RandomForest', rffpr, rftpr, rfthresholds)
trace3, trace4 = _roc_traces('LogisticRegression', lrfpr, lrtpr, lrthresholds)

# Diagonal line (chance level), drawn once for the whole figure
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Kept in its own name so the `data` DataFrame is not overwritten.
roc_traces = [trace0, trace1, trace2, trace3, trace4]

# Define layout
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=900,
    height=700,
    showlegend=True,
)

# Define figure and add data
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()