In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns   
from pathlib import Path






In [None]:

# Location of the raw customer-churn export, relative to the directory the
# notebook is run from.
data_path = "../data/raw/telecom_customer_churn.csv"

# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()


In [None]:
df.shape

In [None]:
# Binary target: 1 where the customer's status is exactly "Churned", 0 for every
# other value (a missing status also maps to 0).
df["Churn"] = (df["Customer Status"] == "Churned").astype("int64")
df.head()

Columns to drop:

Customer ID
Customer Status
Churn Category
Churn Reason

categorical columns
| Column                                           | Missing       | Strategy           |
| ------------------------------------------------ | ------------- | ------------------ |
| Offer                                            | ~3877 missing | Fill "No Offer"    |
| Multiple Lines                                   | ~682 missing  | Fill "No Phone"    |
| Internet Type                                    | ~1526 missing | Fill "No Internet Service" |
| Internet-related features (Online Security etc.) | ~1526 missing | Fill "No Internet Service" |


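numerical columns

| Column                                           | Missing       | Strategy           |
| ------------------------------------------------ | ------------- | ------------------ |
| Avg Monthly Long Distance Charges                | ~682 missing  | Fill 0 if no phone |
| Avg Monthly GB Download                          | ~1526 missing | Fill 0             |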

In [None]:
# Remove the columns listed under "Columns to drop": the customer identifier,
# the status column the Churn target was derived from, and the churn
# category/reason fields.
# Reassigning (rather than inplace=True) matches the other drop cells in this
# notebook and keeps the statement chainable.
df = df.drop(
    columns=["Customer ID", "Customer Status", "Churn Category", "Churn Reason"]
)
df.head()

In [None]:
df["Avg Monthly Long Distance Charges"]

In [None]:
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences sit 1.5 * IQR below the 25th percentile and 1.5 * IQR above the
    75th percentile of ``df[column]``. Rows whose value is missing compare as
    False against both fences and are therefore never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the numeric column to inspect.

    Returns
    -------
    pandas.DataFrame
        The flagged rows, with their original index and all columns.

    Side effects
    ------------
    Prints ``"<column>: <count> outliers"``.
    """
    values = df[column]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low

    below_fence = values < q_low - 1.5 * spread
    above_fence = values > q_high + 1.5 * spread
    flagged = df[below_fence | above_fence]

    print(f"{column}: {len(flagged)} outliers")
    return flagged


In [None]:
detect_outliers_iqr(df, "Avg Monthly Long Distance Charges")

In [None]:
detect_outliers_iqr(df, "Avg Monthly GB Download")


In [None]:
# Histogram of the monthly download volume with a kernel-density curve overlaid
# (kde=True); missing values are still present in the column at this point.
sns.histplot(df["Avg Monthly GB Download"], kde=True)
plt.show()

In [None]:
# Shape of the subset of customers without phone service; its row count can be
# compared with the number of missing long-distance charges before imputing 0.
df[df["Phone Service"] == "No"].shape


We fill the null values in Avg Monthly Long Distance Charges with 0.
Reason: a customer without phone service makes no long-distance calls, so the charge should be 0.

In [None]:
# Impute missing long-distance charges with 0 (no phone service -> no charge).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which raises a FutureWarning on
# recent pandas 2.x releases and does not modify df under Copy-on-Write.
df["Avg Monthly Long Distance Charges"] = df["Avg Monthly Long Distance Charges"].fillna(0)

In [None]:
# Shape of the subset of customers without internet service; its row count can
# be compared with the number of missing download values before imputing 0.
df[df["Internet Service"] == "No"].shape


In [None]:
# Impute missing download volume with 0. Assign the result back instead of using
# fillna(..., inplace=True) on the column selection, which is chained assignment
# (FutureWarning on recent pandas 2.x, no effect on df under Copy-on-Write).
df["Avg Monthly GB Download"] = df["Avg Monthly GB Download"].fillna(0)

In [None]:
df.info()

In [None]:
# Location fields are removed from the frame before modelling.
location_cols = ["City", "Zip Code", "Latitude", "Longitude"]
df = df.drop(columns=location_cols)


Offer → 3166 non-null

Multiple Lines → 6361 non-null

 13  Online Security                    5517 non-null   object 

 14  Online Backup                      5517 non-null   object  
 
 15  Device Protection Plan             5517 non-null   object  
 
 16  Premium Tech Support               5517 non-null   object  
 
 17  Streaming TV                       5517 non-null   object  
 
 18  Streaming Movies                   5517 non-null   object  
 
 19  Streaming Music                    5517 non-null   object 


Usage_Segment → 5517 non-null


In [None]:
# Replace missing categories with an explicit level, one fill value per column.
df = df.fillna({"Offer": "No Offer", "Multiple Lines": "No Phone"})


In [None]:
# Customers with no recorded internet type get an explicit category. The label
# matches the one used for the other internet-related columns in this notebook.
# (The statement used to appear twice; the second call was a no-op.)
df["Internet Type"] = df["Internet Type"].fillna("No Internet Service")


In [None]:
# Internet add-on and data-plan columns whose missing entries are replaced with
# an explicit "No Internet Service" level.
internet_cols = [
    "Online Security", "Online Backup",
    "Device Protection Plan", "Premium Tech Support",
    "Streaming TV", "Streaming Movies", "Streaming Music",
    "Unlimited Data",
]

# Fill all listed columns in one vectorized step.
df[internet_cols] = df[internet_cols].fillna("No Internet Service")


In [None]:
df["Offer"].value_counts()

In [None]:
df["Multiple Lines"].isnull().sum()

In [None]:
df["Multiple Lines"].value_counts()

In [None]:
df.info()

In [None]:
df["Internet Service"].value_counts()

In [None]:
df["Internet Type"].value_counts()

In [None]:
# Disabled experiment: bucket customers by monthly download volume.
# NOTE(review): pd.cut builds right-closed intervals by default, so the first bin
# is (0, 30] and rows with exactly 0 GB get NaN. That is consistent with the
# "Usage_Segment -> 5517 non-null" figure recorded in the notes; pass
# include_lowest=True (or start the bins below 0) if this is re-enabled.
# df["Usage_Segment"] = pd.cut(
#     df["Avg Monthly GB Download"],
#     bins=[0, 30, 60, df["Avg Monthly GB Download"].max()],
#     labels=["Low", "Medium", "High"]
# )
# df["Usage_Segment"].value_counts()

In [None]:
df["Streaming Music"].value_counts()

In [None]:
df["Streaming Movies"].value_counts()

In [None]:
df["Contract"].value_counts()

In [None]:
df["Total Revenue"].min()

In [None]:
df["Tenure in Months"].min()


In [None]:
df["Streaming TV"].value_counts()

FEATURE ENGINEERING

In [None]:
# Average revenue per month of tenure.
# NOTE(review): a tenure of 0 would produce inf here; the minimum tenure is
# inspected in an earlier cell, presumably to rule that out. Confirm.
total_revenue = df["Total Revenue"]
tenure_months = df["Tenure in Months"]
df["Revenue_Per_Month"] = total_revenue / tenure_months

# Flag customers on a month-to-month contract (1) versus any other contract (0).
is_month_to_month = df["Contract"] == "Month-to-Month"
df["Is_Monthly_Contract"] = is_month_to_month.astype(int)



In [None]:
df["Unlimited Data"].value_counts()

In [None]:
df.columns

In [None]:

df.groupby("Churn")["Is_Monthly_Contract"].mean()

In [None]:
df.groupby("Churn")["Revenue_Per_Month"].mean()

In [None]:
pd.crosstab(df["Streaming TV"], df["Churn"], normalize="index")


In [None]:
df.info()

ONE HOT ENCODING

In [None]:
categ_col= df.select_dtypes(include=["object"]).columns
categ_col

In [None]:
# Two-level columns are encoded as 0/1. Series.map yields NaN for any value that
# is not a key of its mapping, so this cell is meant to run once per fresh load.
yes_no = {"Yes": 1, "No": 0}
binary_map = {
    "Gender": {"Male": 1, "Female": 0},
    **{
        name: yes_no
        for name in ("Married", "Phone Service", "Paperless Billing", "Internet Service")
    },
}

for column, lookup in binary_map.items():
    df[column] = df[column].map(lookup)
df.head()

In [None]:
# Multi-level categorical features that are expanded into indicator columns.
dummy_cols = [
    "Offer", "Multiple Lines", "Internet Type",
    "Online Security", "Online Backup",
    "Device Protection Plan", "Premium Tech Support",
    "Streaming TV", "Streaming Movies", "Streaming Music",
    "Unlimited Data", "Payment Method",
]

# drop_first=True omits the first level of each feature, so every feature with
# k levels contributes k - 1 indicator columns.
df = pd.get_dummies(data=df, columns=dummy_cols, drop_first=True)
df.head()

In [None]:
# Remove the raw "Contract" column; the Is_Monthly_Contract flag derived from it
# in the feature-engineering cell stays in the frame.
df = df.drop(columns=["Contract"])
df.head()

In [None]:
df.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler

In [None]:
# Separate the target vector from the feature matrix.
target_col = "Churn"
y = df[target_col]
x = df.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed. stratify=y keeps the
# proportion of churned customers approximately equal in both partitions, so the
# test metrics are not skewed by an unlucky draw on the imbalanced target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model training 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# standardized training features (so it must be given standardized inputs).
model=LogisticRegression()
model.fit(x_train_scaled, y_train)

In [None]:
y_pred = model.predict(x_test_scaled)



In [None]:
y_pred

In [None]:
y_probs = model.predict_proba(x_test_scaled)[:, 1]


In [None]:
y_probs

In [None]:
# Peek at the first 20 predicted churn probabilities and at the overall range.
print(y_probs[:20])
print(y_probs.min(), y_probs.max())


In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_test, y_probs)

In [None]:
# Compare the classification report at several probability cut-offs: a customer
# is labelled as churned when the predicted probability reaches the cut-off.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

for cutoff in thresholds:
    y_pred_custom = (y_probs >= cutoff).astype(int)
    print(f"\nThreshold: {cutoff}")
    print(classification_report(y_test, y_pred_custom))

In [None]:
print(classification_report(y_test, y_pred))


In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
# Accuracy of the logistic regression on both splits. The model was fitted on
# the standardized features, so it has to be scored on the standardized matrices
# as well; scoring on the raw x_train / x_test gives meaningless numbers.
print("Train Accuracy:", model.score(x_train_scaled, y_train))
print("Test Accuracy:", model.score(x_test_scaled, y_test))

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
accuracy_score(y_test, y_pred)

DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-limited decision tree with balanced class weights, trained on the
# standardized features.
tree = DecisionTreeClassifier(
    max_depth=8,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
)
tree.fit(x_train_scaled, y_train)

# Hard predictions plus the ROC AUC of the predicted churn probability.
y_pred = tree.predict(x_test_scaled)
tree_churn_probs = tree.predict_proba(x_test_scaled)[:, 1]
print(roc_auc_score(y_test, tree_churn_probs))

In [None]:
# Sweep the maximum tree depth and print train/test accuracy plus the full
# classification report for each setting.
# The sweep uses its own names (depth_tree, depth_pred) so it no longer rebinds
# `tree` and `y_pred`; previously the cells after the sweep silently operated on
# the last sweep model (max_depth=20, no class weighting).
# NOTE(review): depths are compared on the test split here; use a validation
# split or cross-validation if a depth is going to be chosen from this output.
depths = range(2, 21)

for d in depths:
    depth_tree = DecisionTreeClassifier(
        max_depth=d,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
    )
    depth_tree.fit(x_train_scaled, y_train)
    depth_pred = depth_tree.predict(x_test_scaled)

    train_acc = depth_tree.score(x_train_scaled, y_train)
    test_acc = depth_tree.score(x_test_scaled, y_test)

    print(f"Depth: {d} | Train: {train_acc:.3f} | Test: {test_acc:.3f} ")
    print(classification_report(y_test, depth_pred))

In [None]:
# Refit `tree` on the unscaled feature matrix; from this cell on the model must
# be given unscaled inputs (x_train / x_test).
tree.fit(x_train, y_train)

In [None]:
y_pred = tree.predict(x_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Accuracy of the tree on the unscaled train and test features it was refitted on.
print("Train Accuracy:", tree.score(x_train, y_train))
print("Test Accuracy:", tree.score(x_test, y_test))