In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")

### ***-: UPLOADING '.csv' FILE FOR USE :-***

In [None]:
from google.colab import files

print("Please upload your CSV file:")
# NOTE(review): files.upload() is expected to return a mapping keyed by the
# uploaded file name(s) - confirm against the Colab documentation.
uploaded = files.upload()
if not uploaded:
    # Fail with an explicit message instead of the bare StopIteration that
    # next() would raise when nothing was uploaded.
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")
# Name of the first uploaded file.
sales_data = next(iter(uploaded))

### ***-: CONVERTING '.csv' INTO DATAFRAME :-***

In [None]:
# Read the file that was just uploaded (its name is held in `sales_data`)
# instead of a hard-coded file name, so the notebook loads whatever CSV the
# user actually selected.
df = pd.read_csv(sales_data)

### ***-: BASIC DATA VERIFICATION & VALIDATION  :-***

In [None]:
# Check the DataFrame shape as a (rows, columns) tuple :-

df.shape

In [None]:
# Preview the first 5 rows (head() defaults to 5) :-

df.head()

In [None]:
# Count missing (null) values per column :-

df.isnull().sum()

In [None]:
# Check structure: column names, non-null counts and dtypes :-

df.info()

In [None]:
# Statistical summary of the numeric columns :-

df.describe()

In [None]:
# Number of customers per 'Churn' label :-

df["Churn"].value_counts()

### ***-: DATA CLEANING & PREPROCESSING :-***

In [None]:
#1. Convert 'TotalCharges' to numeric :-

# errors="coerce" turns any entry that cannot be parsed as a number into NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Number of entries that became (or already were) NaN.
df["TotalCharges"].isna().sum()

In [None]:
#2. Handle Missing 'TotalCharges' :-

# Replace missing totals with 0.
# NOTE(review): this assumes the affected rows are new customers who have
# not been billed yet - confirm that they all have tenure == 0.
df["TotalCharges"] = df["TotalCharges"].fillna(0)
# Should now be 0.
df["TotalCharges"].isna().sum()

In [None]:
#3. Fix Data Types :-

# Cast the three numeric columns in a single call via a column -> dtype mapping.
numeric_dtypes = {"tenure": int, "MonthlyCharges": float, "TotalCharges": float}
df = df.astype(numeric_dtypes)

In [None]:
#4. Encode Target Variable :-

# Binary encoding of the churn label: "Yes" -> 1, "No" -> 0.
# Any other value is mapped to NaN by Series.map.
churn_codes = {"Yes": 1, "No": 0}
df["Churn_flag"] = df["Churn"].map(churn_codes)
df["Churn_flag"].value_counts()

In [None]:
#5. Standardize Categorical Values :-

cols_to_clean = [
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

# Collapse the "no ... service" variants into a plain "No" so that each of
# these columns only distinguishes "Yes" from "No".
no_service_labels = {"No internet service": "No", "No phone service": "No"}
df[cols_to_clean] = df[cols_to_clean].replace(no_service_labels)


In [None]:
#6. Remove Duplicates :-

# Count fully duplicated rows and report the number explicitly: as a bare
# expression in the middle of a cell the count was computed but never shown.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")

# Keep the first occurrence of each duplicated row.
df = df.drop_duplicates()

***Final Data Quality Check  :-***

In [None]:
# Re-check column dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Re-check the numeric summary after cleaning.
df.describe()

In [None]:
# Missing values remaining per column after cleaning.
df.isna().sum()

### ***-: CONCLUSION :-***

**Data Cleaning Summary**
- Converted `TotalCharges` from object to numeric and handled invalid values
- Filled missing `TotalCharges` with 0, on the assumption that these rows are new customers who have not yet been billed
- Encoded churn target variable for analysis
- Standardized categorical values for consistency
- Removed duplicate records

This ensured the dataset was fully prepared for exploratory analysis and modeling.

### ***-: EXPLORATORY DATA ANALYSIS [EDA] :-***

In [None]:
#Q1. What percentage of customers are churning?

# Churn_flag is 1 for "Yes" and 0 for "No", so its mean is the fraction of
# customers who churned (multiply by 100 for a percentage).
churn_rate = df["Churn_flag"].mean()
churn_rate

In [None]:
# Visualize :-

# Bar chart of the number of customers in each Churn class.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x="Churn", ax=ax)
ax.set_title("Customer Churn Distribution")
plt.show()

In [None]:
#Q2. Do new customers churn more than long-term customers?

# Box plot comparing the tenure distribution of churned vs. retained customers.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(data=df, x="Churn", y="tenure", ax=ax)
ax.set_title("Tenure vs Churn")
plt.show()

In [None]:
# Violin plot: shape of the MonthlyCharges distribution for each Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.violinplot(data=df, x="Churn", y="MonthlyCharges", ax=ax)
ax.set_title("Monthly Charges Distribution by Churn (Violin Plot)")
plt.show()

In [None]:
#Q3. Do higher bills increase churn risk?

# Box plot comparing MonthlyCharges of churned vs. retained customers.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(data=df, x="Churn", y="MonthlyCharges", ax=ax)
ax.set_title("Monthly Charges vs Churn")
plt.show()

In [None]:
#Q4. Which contract types are most risky?

# Customer counts per contract type, split by Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.countplot(data=df, x="Contract", hue="Churn", ax=ax)
ax.set_title("Churn by Contract Type")
# Tilt the category labels slightly so they do not overlap.
ax.tick_params(axis="x", labelrotation=15)
plt.show()

In [None]:
#Q5. Internet Service Impact :-

# Customer counts per internet service type, split by Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.countplot(data=df, x="InternetService", hue="Churn", ax=ax)
ax.set_title("Churn by Internet Service Type")
plt.show()

In [None]:
#Q6. Payment Method & Churn :-

# Customer counts per payment method, split by Churn class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x="PaymentMethod", hue="Churn", ax=ax)
ax.set_title("Churn by Payment Method")
# Payment-method labels are long; rotate them to keep them readable.
ax.tick_params(axis="x", labelrotation=30)
plt.show()

In [None]:
#Q7. Correlation :-

# Pairwise correlations between the numeric columns and the churn flag.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges", "Churn_flag"]
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

### ***-: CONCLUSION :-***

**Exploratory Data Analysis Summary**

Key findings from the exploratory analysis include:
- Approximately one-fourth of customers have churned
- Customers with shorter tenure show significantly higher churn
- Higher monthly charges are associated with increased churn risk
- Month-to-month contracts have the highest churn rates
- Customers using electronic check payment methods churn more frequently
- Long-term contracts and auto-payment methods reduce churn

These insights highlight critical risk factors that can be targeted
through customer retention strategies.

**Churn by Contract Type – Interpretation**

Customers on month-to-month contracts exhibit significantly higher churn
compared to those on one-year or two-year contracts.
This behavior can be attributed to:
- Lower switching costs for short-term contracts
- Higher pricing flexibility for competitors
- Lack of long-term commitment or loyalty incentives

In contrast, long-term contracts create customer lock-in and reduce churn,
highlighting the importance of contract-based retention strategies.

### ***-: ADVANCED EDA & FEATURE ENGINEERING :-***

In [None]:
#1. Tenure Grouping (Lifecycle Segmentation) :-

# Bin tenure (in months) into lifecycle stages. Intervals are right-inclusive,
# so the lower edge of -1 puts tenure == 0 into the first group.
# The last edge is open-ended (np.inf) to match the "4+ years" label: with a
# hard upper edge of 72, any tenure above 72 months would silently become NaN.
df["tenure_group"] = pd.cut(
    df["tenure"],
    bins=[-1, 12, 24, 48, np.inf],
    labels=["0–1 year", "1–2 years", "2–4 years", "4+ years"]
)

In [None]:
# Visualize 'Churn by Tenure Group' :-

# Customer counts per tenure group, split by Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.countplot(data=df, x="tenure_group", hue="Churn", ax=ax)
ax.set_title("Churn by Tenure Group")
plt.show()

In [None]:
#2. Service Count Feature (Engagement Proxy) :-

service_cols = [
    "PhoneService", "MultipleLines", "OnlineSecurity",
    "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies"
]

# Number of services each customer has marked "Yes". The comparison and the
# row-wise sum are vectorised instead of calling a Python lambda per row.
df["service_count"] = df[service_cols].eq("Yes").sum(axis=1)

In [None]:
# Visualize 'Service Count vs Churn' :-

# Box plot comparing the number of subscribed services by Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(data=df, x="Churn", y="service_count", ax=ax)
ax.set_title("Service Count vs Churn")
plt.show()

In [None]:
#3. Average Revenue per Month (ARPU) :-

# Divide lifetime charges by the actual tenure in months. Dividing by
# (tenure + 1) avoided a zero denominator but understated the average for
# every customer (for tenure == 1 it halves the value).
# Rows with tenure == 0 get a NaN denominator and fall back to MonthlyCharges.
billed_months = df["tenure"].where(df["tenure"] > 0)
df["avg_monthly_spend"] = (
    df["TotalCharges"].div(billed_months).fillna(df["MonthlyCharges"])
)

In [None]:
# Visualize 'Average Monthly Spend vs Churn' :-

# Violin plot of the engineered avg_monthly_spend feature by Churn class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.violinplot(data=df, x="Churn", y="avg_monthly_spend", ax=ax)
ax.set_title("Average Monthly Spend vs Churn")
plt.show()

In [None]:
#4. Contract Commitment Flag :-

# 1 when the contract is "One year" or "Two year", 0 for anything else.
long_term_plans = ["One year", "Two year"]
df["long_term_contract"] = df["Contract"].isin(long_term_plans).astype("int64")

# Churn rate for each value of the flag.
df.groupby("long_term_contract")["Churn_flag"].mean()

In [None]:
#5. Auto-Payment Flag :-

# 1 when the payment method text contains "automatic" (case-insensitive),
# else 0. The vectorised .str methods treat a missing PaymentMethod as
# "not automatic" (na=False) instead of raising AttributeError on x.lower().
df["auto_payment"] = (
    df["PaymentMethod"]
    .str.lower()
    .str.contains("automatic", regex=False, na=False)
    .astype("int64")
)

# Churn rate for each value of the flag.
df.groupby("auto_payment")["Churn_flag"].mean()

### ***-: CONCLUSION :-***

**Advanced Feature Engineering Summary**

To enhance churn prediction and business interpretability, several
behavior-driven features were engineered:
- Tenure groups to represent customer lifecycle stages
- Service count as a proxy for engagement
- Average monthly spend to normalize revenue across tenure
- Long-term contract indicator to capture commitment
- Auto-payment indicator to reflect billing behavior

These features capture customer engagement, value, and commitment,
providing a strong foundation for predictive modeling.

### ***-: MODELING [CHURN PREDICTION] :-***

*Goal is to build a baseline churn prediction model and interpret results; we’ll exclude IDs and keep meaningful features.*

In [None]:
#1. Select Features & Target:-


# Binary label created during preprocessing (1 = churned, 0 = retained).
target = "Churn_flag"

# Model inputs: raw numeric columns, the engineered features built above,
# and SeniorCitizen taken straight from the source data.
features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "service_count",
    "avg_monthly_spend",
    "long_term_contract",
    "auto_payment",
    "SeniorCitizen"
]

In [None]:
# Feature matrix and target vector used by the models below.
X, y = df[features], df[target]

In [None]:
#2. Train–Test Split :-

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. stratify=y keeps the churn ratio
# the same in both splits; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

In [None]:
#3. Scale Numerical Features :-

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same
# transformation to the test split so no test information leaks into it.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#4. Logistic Regression (Baseline Model) :-

from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 iterations for the solver.
log_model = LogisticRegression(max_iter=1000)
# Trained on the scaled training features.
log_model.fit(X_train_scaled, y_train)

In [None]:
# Predicted class labels for the scaled test split.
y_pred_log = log_model.predict(X_test_scaled)

In [None]:
#5. Evaluate Logistic Regression :-

from sklearn.metrics import classification_report, confusion_matrix

# Text report of per-class metrics for the logistic regression predictions.
print(classification_report(y_test, y_pred_log))

In [None]:
# Confusion matrix for the logistic regression predictions.
# sklearn's confusion_matrix puts the true labels on the rows and the
# predicted labels on the columns; the axes are labelled accordingly so the
# heatmap can be read without knowing that convention.
sns.heatmap(
    confusion_matrix(y_test, y_pred_log),
    annot=True, fmt="d", cmap="Blues"
)
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.title("Logistic Regression – Confusion Matrix")
plt.show()

In [None]:
#6. Feature Importance :-

# Pair each feature name with its fitted coefficient (first row of coef_).
# The model was trained on standardized inputs, so the coefficients are on a
# common scale. Sorting is by signed value, largest positive first.
feature_importance = pd.DataFrame({
    "Feature": features,
    "Coefficient": log_model.coef_[0]
}).sort_values("Coefficient", ascending=False)

feature_importance

In [None]:
# Visualize :-

# Horizontal bar chart of the logistic regression coefficients per feature.
fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=feature_importance, x="Coefficient", y="Feature", ax=ax)
ax.set_title("Logistic Regression Feature Importance")
plt.show()

In [None]:
#7. Decision Tree (Comparison Model) :-

from sklearn.tree import DecisionTreeClassifier

# Depth is capped at 5 levels; random_state makes the fit reproducible.
tree_model = DecisionTreeClassifier(
    max_depth=5,
    random_state=42
)
# The tree is fitted on the unscaled training features.
tree_model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the (unscaled) test split.
y_pred_tree = tree_model.predict(X_test)

In [None]:
# Evaluate the decision tree with the same report used for logistic regression :-

print(classification_report(y_test, y_pred_tree))

**Model Performance Summary**

Two baseline models were developed to predict customer churn:
- Logistic Regression provided interpretable results and highlighted
  key churn drivers such as contract type, tenure, and monthly charges.
- Decision Tree captured non-linear relationships but showed signs of
  overfitting at higher depths.

Logistic Regression was selected as the preferred model due to its
balance between performance and interpretability, making it suitable
for business decision-making.

### ***-: BUSINESS INSIGHTS & CHURN RISK SEGMENTATION :-***

*Goal is to turn predictions into customer-level actions*

In [None]:
#1. Generate Churn Probabilities (Risk Scores) :-

# Column 1 of predict_proba is the probability of the positive class (churn).
churn_prob = log_model.predict_proba(X_test_scaled)[:, 1]

# Test-set features plus the true label and the predicted probability.
# assign() returns a new frame, so X_test itself is left untouched.
churn_results = X_test.assign(
    Actual_Churn=y_test.values,
    Churn_Probability=churn_prob,
)

churn_results.head()

In [None]:
#2. Create Churn Risk Segments :-

def risk_segment(p, high=0.7, medium=0.4):
    """Map a churn probability to a risk-segment label.

    Parameters
    ----------
    p : float
        Predicted churn probability.
    high : float, default 0.7
        Inclusive lower bound of the "High Risk" segment.
    medium : float, default 0.4
        Inclusive lower bound of the "Medium Risk" segment.

    Returns
    -------
    str
        "High Risk" if ``p >= high``, "Medium Risk" if ``medium <= p < high``,
        otherwise "Low Risk".

    Raises
    ------
    ValueError
        If ``medium`` is greater than ``high``.
    """
    if medium > high:
        raise ValueError("medium threshold must not exceed high threshold")
    if p >= high:
        return "High Risk"
    if p >= medium:
        return "Medium Risk"
    return "Low Risk"

# Label every test customer with a risk segment based on the predicted probability.
segment_labels = churn_results["Churn_Probability"].map(risk_segment)
churn_results["Churn_Risk_Segment"] = segment_labels

# Share of test customers falling into each segment.
churn_results["Churn_Risk_Segment"].value_counts(normalize=True)

In [None]:
#3. Risk Segment vs Actual Churn :-

# normalize="index" makes every row sum to 1, so each row shows the actual
# churn / non-churn proportions within that risk segment.
pd.crosstab(
    churn_results["Churn_Risk_Segment"],
    churn_results["Actual_Churn"],
    normalize="index"
)

In [None]:
#4. Visualize Risk Distribution :-

# Number of test customers per risk segment. The bar order is fixed from
# low to high risk; without `order` the bars follow the order in which the
# labels happen to appear in the data.
plt.figure(figsize=(6, 4))
sns.countplot(
    x="Churn_Risk_Segment",
    data=churn_results,
    order=["Low Risk", "Medium Risk", "High Risk"]
)
plt.title("Customer Churn Risk Distribution")
plt.show()

**Churn Risk Segmentation Insights**

Customers were segmented into High, Medium, and Low churn risk categories
based on predicted churn probabilities.
Key observations:
- High-risk customers exhibit significantly higher actual churn rates
- Medium-risk customers represent an opportunity for proactive engagement
- Low-risk customers are relatively stable and require minimal intervention

This segmentation enables targeted retention strategies and efficient
allocation of marketing resources.

### ***-: BUSINESS DELIVERABLE & CHURN RISK EXPORT :-***

*Goal is to deliver a clean, usable churn-risk list for business teams*

In [None]:
#1. Reattach Customer IDs :-

# Select the test-set rows from the full DataFrame by index label so that
# customerID and the descriptive columns line up with the predictions.
churn_export = df.loc[X_test.index, [
    "customerID",
    "tenure",
    "MonthlyCharges",
    "Contract",
    "PaymentMethod",
    "service_count"
]].copy()

# join() aligns on the index, which churn_results inherited from X_test.
churn_export = churn_export.join(
    churn_results[["Churn_Probability", "Churn_Risk_Segment", "Actual_Churn"]]
)


churn_export.head()

In [None]:
#2. Create Final Export Table :-

# Reorder the columns (ID and risk information first) and sort so the
# customers with the highest predicted churn probability come first.
final_churn_customers = churn_export[[
    "customerID",
    "Churn_Probability",
    "Churn_Risk_Segment",
    "Actual_Churn",
    "tenure",
    "MonthlyCharges",
    "service_count",
    "Contract",
    "PaymentMethod"
]].sort_values("Churn_Probability", ascending=False)


# Ten highest-probability customers.
final_churn_customers.head(10)

In [None]:
#3. Export to CSV :-

# index=False leaves the DataFrame index out of the file; customerID is
# already included as a column.
final_churn_customers.to_csv(
    "churn_risk_customers.csv",
    index=False
)

In [None]:
#4. Download CSV (Google Colab) :-

# Colab-only helper; this cell will not run outside Google Colab.
from google.colab import files
# Passes the CSV written by the export step to Colab's download helper.
files.download("churn_risk_customers.csv")

**Business Deliverable: Churn Risk Customer List**

A customer-level churn risk dataset was generated to support retention
initiatives. The dataset includes predicted churn probabilities and
risk segmentation to enable targeted marketing actions.

This file can be directly used by marketing and customer success teams
to prioritize outreach and design personalized retention campaigns.
