In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import joblib


In [22]:
file_path=r"C:\Users\Administrator\OneDrive\Desktop\Machine_learning_cs303\Practicals\Experiment_5\dataset\Telco Customer Churn.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [23]:
df.drop(columns='customerID',inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [24]:
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [25]:
numeric_col=[col for col in df.columns if df[col].dtype!='object']
categorical_col=[col for col in df.columns if df[col].dtype=='object' and col!='Churn']

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , LabelEncoder

numeric_pipeline=Pipeline(steps=[
    ('Scaling',StandardScaler())
])

categorical_pipeline=Pipeline(steps=[
    ('Encoding',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False))
])

preprocessing=ColumnTransformer(transformers=[
    ('Numeric',numeric_pipeline,numeric_col),
    ('categorical',categorical_pipeline,categorical_col)
],remainder='passthrough')


In [27]:
# Define the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X=df.iloc[:,:-1]
y=df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

lr=LabelEncoder()
y_train=lr.fit_transform(y_train)
y_test=lr.fit_transform(y_test)

In [32]:
# Create a dictionary to store results
results = {}
best_model = None
best_auc = 0

In [None]:
print("--- Model Training and Evaluation ---")
for name, model in models.items():
    # Create the full pipeline: preprocess + model
    clf = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('classifier', model)
    ])
    
    # Train the model
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    
    # Store results
    results[name] = {'Accuracy': accuracy, 'ROC_AUC': auc}
    
    print(f"\n--- {name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Save the best model
    if auc > best_auc:
        best_auc = auc
        best_model = clf
        best_model_name = name

--- Model Training and Evaluation ---





--- Logistic Regression ---
Accuracy: 0.8003
ROC AUC: 0.8448
Confusion Matrix:
[[1387  165]
 [ 257  304]]





--- KNN ---
Accuracy: 0.7667
ROC AUC: 0.7894
Confusion Matrix:
[[1312  240]
 [ 253  308]]





--- Decision Tree ---
Accuracy: 0.7719
ROC AUC: 0.6959
Confusion Matrix:
[[1335  217]
 [ 265  296]]


In [None]:
print("\n--- Model Comparison Summary ---")
results_df = pd.DataFrame(results).T
print(results_df)

# Save the best performing model
joblib.dump(best_model, f'{best_model_name.replace(" ", "_")}_best_model.pkl')
print(f"\nSaved best model ({best_model_name}) to disk.")