<a href="https://colab.research.google.com/github/anupa35/Machine-Learning/blob/main/Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2



# 1. Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.info() #checking data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce') # changing incorrect data type of TotalCharges
df.isnull().sum() # checking for null values

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [7]:
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median()) # Handling missing values of TotalCharges

In [8]:
df.isnull().sum() # checking for null values again

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [9]:
df = df.drop(columns=['customerID']) # removing the customerID column since it is not useful

In [10]:
# converting Yes/No containing columns into 1/0 values
columnswith_YesNo = ['Partner', 'Dependents', 'PhoneService','PaperlessBilling', 'Churn']

for col in columnswith_YesNo:
    df[col] = df[col].map({'Yes':1, 'No':0})

In [11]:
# converting gender column data into 1/0
df['gender'] = df['gender'].map({'Male':1, 'Female':0})

#Feature Engineering

In [12]:
service_cols = [
    'PhoneService','OnlineSecurity','OnlineBackup',
    'DeviceProtection','TechSupport','StreamingTV','StreamingMovies'
]

df["num_services"] = df[service_cols].apply(lambda row: (row=='Yes').sum(), axis=1)

In [13]:
df["tenure_group"] = pd.cut(
    df["tenure"],
    bins=[0, 6, 12, 24, 48, 72],
    labels=["0-6","6-12","12-24","24-48","48-72"],
    include_lowest=True
)

In [14]:
df["charge_ratio"] = df["MonthlyCharges"] / (df["TotalCharges"] + 1)

In [15]:
df["avg_monthly_spend"] = df["TotalCharges"] / (df["tenure"] + 1)

In [16]:
df["is_fiber"] = (df["InternetService"] == "Fiber optic").astype(int)

In [17]:
df["is_monthly"] = (df["Contract"] == "Month-to-month").astype(int)

In [18]:
columns_multicatego = [
    'InternetService','Contract','PaymentMethod','MultipleLines',
    'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
    'StreamingTV','StreamingMovies','tenure_group'
]

columns_numeric = [
    'tenure','MonthlyCharges','TotalCharges','num_services',
    'charge_ratio','avg_monthly_spend'
]

In [19]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor_nn = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), columns_multicatego),
    ('numeric', StandardScaler(), columns_numeric)
], remainder='passthrough')

preprocessor_tree = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), columns_multicatego),
    ('numeric', 'passthrough', columns_numeric)
], remainder='passthrough')

X_train_nn = preprocessor_nn.fit_transform(X_train)
X_test_nn = preprocessor_nn.transform(X_test)

X_train_tree = preprocessor_tree.fit_transform(X_train)
X_test_tree = preprocessor_tree.transform(X_test)

# Feed Forward Neural Network

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [49]:
class_weight = {0:1,1:2}

In [50]:
input_dim = X_train_nn.shape[1]

from tensorflow.keras import Input

# Tuned neural network architecture
model = Sequential([
    Input(shape=(input_dim,)),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

In [51]:
# Tuned optimizer with custom learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0007)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [52]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True
)

In [53]:
# Model training
history = model.fit(
    X_train_nn,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    class_weight=class_weight,
    verbose=1
)

Epoch 1/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 24ms/step - accuracy: 0.6800 - loss: 0.8409 - val_accuracy: 0.7374 - val_loss: 0.5371
Epoch 2/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7662 - loss: 0.6735 - val_accuracy: 0.7391 - val_loss: 0.4975
Epoch 3/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7749 - loss: 0.6200 - val_accuracy: 0.7400 - val_loss: 0.4903
Epoch 4/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7700 - loss: 0.6356 - val_accuracy: 0.7453 - val_loss: 0.4816
Epoch 5/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.7703 - loss: 0.6260 - val_accuracy: 0.7462 - val_loss: 0.4793
Epoch 6/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7943 - loss: 0.5955 - val_accuracy: 0.7507 - val_loss: 0.4738
Epoch 7/100
[1

In [58]:
# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Confusion Matrix:
[[797 238]
 [101 273]]


In [57]:
# Predictions
y_pred_prob = model.predict(X_test_nn)
y_pred = (y_pred_prob > 0.50).astype(int)


print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_prob))

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.77      0.82      1035
           1       0.53      0.73      0.62       374

    accuracy                           0.76      1409
   macro avg       0.71      0.75      0.72      1409
weighted avg       0.79      0.76      0.77      1409


ROC AUC Score: 0.8440491875274484


# Decision Tree

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [41]:
dt = DecisionTreeClassifier(class_weight={0:1,1:2}, random_state=42)

In [42]:
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [43]:
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='roc_auc',  # optimize for ROC-AUC
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_tree, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [44]:
print("Best parameters:", grid_search.best_params_)

best_dt = grid_search.best_estimator_
best_dt.fit(X_train_tree, y_train)

Best parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [45]:
# Predictions
y_pred_tree = best_dt.predict(X_test_tree)
y_pred_prob_tree = best_dt.predict_proba(X_test_tree)[:, 1]


In [46]:
# Confusion matrix
print(confusion_matrix(y_test, y_pred_tree))


[[795 240]
 [ 89 285]]


In [47]:
# Classification report
print(classification_report(y_test, y_pred_tree))

# ROC-AUC
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob_tree))

              precision    recall  f1-score   support

           0       0.90      0.77      0.83      1035
           1       0.54      0.76      0.63       374

    accuracy                           0.77      1409
   macro avg       0.72      0.77      0.73      1409
weighted avg       0.80      0.77      0.78      1409

ROC AUC Score: 0.8343886434679274
