# Telecom Churn Analysis

This notebook covers the end-to-end analysis of the telecom churn dataset.

**Objective**: Predict churn and prescribe retention strategies with cost analysis.

**Sections**:
1. **Data Loading & Preprocessing**
2. **Feature Engineering**
3. **Descriptive Analytics (Visualizations)**
4. **Predictive Analytics (Modeling)**
5. **Prescriptive Analytics (Strategies & Costs)**

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings

# Notebook-wide settings.
# Silence every warning category for the rest of the session so library
# notices do not clutter the cell outputs.
warnings.simplefilter('ignore')
# Use seaborn's "whitegrid" look for all figures drawn below.
sns.set(style="whitegrid")

## 1. Data Loading & Preprocessing

In [None]:
# Load Data
# The path is resolved relative to the notebook's working directory.
file_path = 'telecom_churn.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Fail fast: every later cell needs `df`, so carrying on after a plain
    # print would only resurface as a confusing NameError further down.
    raise FileNotFoundError(
        f"File not found: '{file_path}'. Please check the path."
    ) from err

print(f"Data loaded. Shape: {df.shape}")
display(df.head())

In [None]:
# Handle Missing Values
# Count nulls once and report only the columns that actually have gaps.
missing_counts = df.isnull().sum()
print("Missing values before:")
print(missing_counts[missing_counts > 0])

# Forward-fill gaps with the previous row's value. `fillna(method='ffill')`
# is deprecated in recent pandas; `DataFrame.ffill()` is the supported form.
# The trailing back-fill covers gaps in the first rows, which have no earlier
# value to carry forward and would otherwise stay NaN.
df = df.ffill().bfill()
print("Missing values handled.")
print(f"Missing values remaining: {int(df.isnull().sum().sum())}")

In [None]:
# Outlier Detection (Numerical Columns)
# Cap every numeric feature to the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
numerical_cols = df.select_dtypes(include=[np.number]).columns

# The target and the row identifier are not measurements and must never be
# capped: for a 0/1 target with fewer than 25% positives both quartiles are 0,
# so the upper fence is 0 and every positive label would be overwritten.
cols_to_cap = [c for c in numerical_cols if c not in ('churn', 'customer_id')]
print("Checking and capping outliers...")

for col in cols_to_cap:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    if iqr == 0:
        # Zero spread between the quartiles: both fences collapse onto Q1 and
        # capping would flatten the whole column to a single value.
        continue
    df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

print("Outliers processed.")

In [None]:
# Encode Categorical Variables
# Every string column whose name does not contain 'date' is replaced by
# integer codes. One fitted encoder is kept per column in `label_encoders`
# so the codes can be mapped back to the original labels later
# (label_encoders[col].inverse_transform(...)).
print("Encoding categorical variables...")
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    if 'date' in col:
        # Date-like columns are left untouched as raw strings.
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

df.head()

## 2. Feature Engineering

In [None]:
# Engineering New Features
# Each feature is only built when its source columns exist; the names of the
# features that were actually added are collected so the summary line below
# never reports a column that was skipped.
created_features = []

if 'calls_made' in df.columns and 'sms_sent' in df.columns:
    # Combined count of calls and SMS per customer.
    df['total_interactions'] = df['calls_made'] + df['sms_sent']
    created_features.append('total_interactions')

if 'data_used' in df.columns:
    # log(1 + x) of data usage; negative values are floored at 0 first so the
    # logarithm is always defined.
    df['data_usage_log'] = np.log1p(df['data_used'].clip(lower=0))
    created_features.append('data_usage_log')

created_summary = ", ".join(created_features) if created_features else "none"
print(f"Features created: {created_summary}")

## 3. Descriptive Analytics (Visualizations)

In [None]:
# Churn Distribution Plot
# Bar chart of row counts per churn value; skipped when the target is absent.
if 'churn' in df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x='churn', ax=ax)
    ax.set_title("Distribution of Churn")
    plt.show()

In [None]:
# Numerical Distributions (Histograms)
# One histogram per numeric column, leaving out the target and the row id.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
excluded = {'churn', 'customer_id'}
cols_to_plot = [name for name in numerical_cols if name not in excluded]

if len(cols_to_plot) > 0:
    df[cols_to_plot].hist(bins=20, figsize=(15, 10), edgecolor='black')
    plt.suptitle("Numerical Feature Distributions")
    plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlation of the numeric columns only. The encoding step leaves
# date-like columns as strings, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0, so the frame is filtered explicitly first.
plt.figure(figsize=(12, 10))
corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Matrix")
plt.show()

## 4. Predictive Analytics (Modeling)

In [None]:
# Prepare Data (Split & Scale)
# The modelling cells below use X_train_scaled / y_train unconditionally, so
# a missing target is a hard error rather than a message that is easy to miss.
if 'churn' not in df.columns:
    raise KeyError("Target 'churn' not found!")

# Drop the target, the row identifier and the raw registration date, then keep
# numeric columns only: StandardScaler cannot handle leftover string or
# categorical columns.
X = df.drop(['churn', 'customer_id', 'date_of_registration'], axis=1, errors='ignore')
X = X.select_dtypes(include=[np.number])
y = df['churn']

# Stratify on the target so the train and test splits keep the same churn
# rate; an unstratified split of an imbalanced target can skew the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then reuse it for the test split
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Train/Test split done. Data Scaled.")

In [None]:
# 1. Logistic Regression Model
# Default hyper-parameters, fitted on the standardized training split.
# fit() returns the estimator itself, so construction and training chain.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)
print("Logistic Regression Trained.")

In [None]:
# 2. Random Forest Model
# 100 trees with a fixed seed, fitted on the standardized training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
print("Random Forest Trained.")

In [None]:
# 3. Gradient Boosting Model
# 100 boosting stages with a fixed seed, fitted on the standardized training split.
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
print("Gradient Boosting Trained.")

In [None]:
# Best Model Selection & Comparison
# Score every fitted model on the held-out test split and pick the one with
# the highest F1. AUC is reported alongside but does not drive the choice.
models = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model
}

print(f"{'Model':<25} | {'F1-Score':<10} | {'AUC':<10}")
print("-" * 50)

# name -> (f1, auc); kept so the scores can be inspected after the cell runs.
model_scores = {}

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    # Probability of the positive class (second column), needed for ROC AUC.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_prob)
    model_scores[name] = (f1, auc)

    print(f"{name:<25} | {f1:.4f}     | {auc:.4f}")

# max() returns the first model with the top F1, so ties resolve to the
# earliest entry. Unlike a running `f1 > 0` comparison, this still names a
# winner when every model scores F1 == 0.
best_model_name = max(model_scores, key=lambda model_name: model_scores[model_name][0])
best_score = model_scores[best_model_name][0]
# Keep the winning estimator itself so later cells can reuse it.
best_model = models[best_model_name]

print("-" * 50)
print(f"\n>> WINNER: {best_model_name} with F1-Score: {best_score:.4f}")

## 5. Prescriptive Analytics

In [None]:
# Markov Chain Analysis
# Simulating state transitions to predict long-term customer distribution.
# Customers are split into salary tertiles, which serve as the value segments.
df['value_segment'] = pd.qcut(df['estimated_salary'], 3, labels=['Low', 'Medium', 'High'])

states = ['Low', 'Medium', 'High', 'Churn']

# Hypothetical Transition Matrix
# Row = current state, column = next state, in the order of `states`.
# 'Churn' is absorbing: once reached it is never left.
transition_matrix = np.array([
    [0.70, 0.10, 0.05, 0.15], # Low
    [0.05, 0.80, 0.10, 0.05], # Medium
    [0.02, 0.08, 0.88, 0.02], # High
    [0.00, 0.00, 0.00, 1.00]  # Churn
])
# Each row is a probability distribution over next states; a row that does
# not sum to 1 would silently create or destroy customers in the projection.
if not np.allclose(transition_matrix.sum(axis=1), 1.0):
    raise ValueError("Each row of transition_matrix must sum to 1.")

# Start from the segment mix observed in the data (nobody has churned at
# step 0) instead of a hand-typed vector, so the projection stays consistent
# with the `value_segment` column built above.
segment_share = df['value_segment'].dropna().astype(str).value_counts(normalize=True)
curr = np.array([float(segment_share.get(state, 0.0)) for state in states])

print("Projected Distribution (12 months):")
# Distribution after 12 transitions: curr · P^12.
future = curr.dot(np.linalg.matrix_power(transition_matrix, 12))
for s, p in zip(states, future):
    print(f"{s}: {p:.2%}")

In [None]:
# Monte Carlo Simulation (Risk Analysis)
# Draws a churn rate per run around `base_churn` and converts it into a
# revenue figure for a fixed-size cohort.
cohort_size = 1000
avg_revenue = 50 # Assumed average monthly revenue ($)
base_churn = 0.15 # Assumed base churn rate
runs = 1000
mc_seed = 42 # Fixed seed so the reported figure is reproducible across runs

rng = np.random.default_rng(mc_seed)

# One churn rate per run: normal noise (sd 0.02) around the base rate,
# clamped to the valid probability range [0, 1].
churn_draws = np.clip(rng.normal(base_churn, 0.02, size=runs), 0.0, 1.0)

# Customers left after the rate has been applied 12 times in a row.
retained_by_run = cohort_size * (1 - churn_draws) ** 12

# NOTE(review): only the customers still active after the 12 periods are
# counted, each as paying for all 12 months; revenue from customers who leave
# part-way through is not included. Confirm this is the intended model.
results_mc = (retained_by_run * avg_revenue * 12).tolist()

print(f"Expected Annual Revenue (Mean): ${np.mean(results_mc):,.2f}")

In [None]:
# ADVANCED: AI-Powered Recommendation System (K-Means Clustering)
# 1. Train Clustering Model
# Salary is always used; usage columns are added when present.
features = ['estimated_salary']
if 'data_used' in df.columns: features.append('data_used')
if 'calls_made' in df.columns: features.append('calls_made')

X_cluster = df[features].copy()
# K-Means is distance based: without standardization the column with the
# largest numeric range dominates the distance and the other features barely
# influence the clusters. A dedicated scaler is used so the modelling
# `scaler` fitted earlier is not overwritten.
X_cluster_scaled = StandardScaler().fit_transform(X_cluster)
# n_init is set explicitly because its default differs between scikit-learn
# versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_cluster_scaled)
df['Cluster'] = clusters

# 2. Name Segments
# Labels come from cluster means in the original (unscaled) units, compared
# with the overall mean of the same column. Rules are tried in order; the
# first match wins.
cluster_means = df.groupby('Cluster')[features].mean()
overall_means = df[features].mean()
has_data_used = 'data_used' in features  # data-based rules need this column

cluster_names = {}
for c_id, row in cluster_means.iterrows():
    label = "Standard User"
    if has_data_used and row['data_used'] > overall_means['data_used'] * 1.5:
        label = "Heavy Data User"
    elif row['estimated_salary'] > overall_means['estimated_salary'] * 1.5:
        label = "High Net-Worth"
    elif has_data_used and row['data_used'] < overall_means['data_used'] * 0.5:
        label = "Low Engagement"
    cluster_names[c_id] = label

df['Segment_Name'] = df['Cluster'].map(cluster_names)

# 3. Show Recommendations for Sample High-Risk Customers
# "High-risk" here means the first five rows whose recorded churn flag is 1.
segment_strategies = {
    "Heavy Data User": "Strategy: Offer 50% Off Unlimited Data Plan",
    "High Net-Worth": "Strategy: VIP Concierge Service",
    "Low Engagement": "Strategy: Free Recharge (Win-Back)",
}
default_strategy = "Strategy: Standard 1-Month Free Service"

print("AI Recommendations for High-Risk Customers:\n")
if 'churn' in df.columns:
    high_churn_sample = df[df['churn'] == 1].head(5)
else:
    # No target column: nothing to recommend on, keep an empty sample.
    high_churn_sample = df.head(0)

for idx, row in high_churn_sample.iterrows():
    seg = row['Segment_Name']
    msg = segment_strategies.get(seg, default_strategy)
    # Fall back to the row index when the dataset carries no customer_id.
    customer = row.get('customer_id', idx)

    print(f"Customer {customer} | Segment: {seg} | {msg}")