# 🔄 Customer Churn Prediction (Revised)

This notebook predicts whether a telecom customer will churn using demographic and service-related data. It includes:
- Data cleaning
- Feature exploration (correlation of each feature with churn)
- Data preparation
- Model training with three classifiers
- Performance comparison

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

## 📂 Load Dataset

In [None]:
# Read the raw churn table from the CSV sitting next to the notebook and
# preview the first rows.
csv_path = "customer_churn.csv"
df = pd.read_csv(csv_path)
df.head()

## 🧼 Data Cleaning

In [None]:

# Convert TotalCharges to numeric; values that cannot be parsed become NaN
# (errors="coerce") instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Fill missing TotalCharges with the column median.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on df["TotalCharges"] operates on a temporary selection,
# which is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
total_charges_median = df["TotalCharges"].median()
df["TotalCharges"] = df["TotalCharges"].fillna(total_charges_median)
# Check for any remaining nulls (per-column count of missing values)
df.isnull().sum()


## 📊 Dataset Overview

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Bar chart of how many rows fall into each Churn class.
churn_ax = sns.countplot(x="Churn", data=df)
churn_ax.set_title("Churn Distribution")
plt.show()

## 🧠 Feature Exploration (Correlation with Churn)

In [None]:

# Work on a copy so df keeps its original "Yes"/"No" Churn labels.
df_corr = df.copy()
df_corr["Churn"] = df_corr["Churn"].map({"Yes": 1, "No": 0})
# One-hot encode the non-numeric columns so every category becomes its own
# indicator column that can be correlated with the target.
df_encoded = pd.get_dummies(df_corr.drop(columns="Churn"))
df_encoded["Churn"] = df_corr["Churn"]
# Correlate each column with the target only. Building the full
# column-by-column matrix via .corr() just to read its "Churn" column is
# quadratic in the number of encoded columns.
correlations = (
    df_encoded.corrwith(df_encoded["Churn"])
    .rename("Churn")
    .sort_values(ascending=False)
)
correlations


## 🛠 Feature Preparation

In [None]:

# Pure identifier columns carry no predictive signal; if left in X they would
# be treated as categorical below and one-hot encoded into one column per row.
# NOTE(review): assumes the identifier column, if the CSV has one, is named
# "customerID" -- confirm against the actual file. Missing names are ignored.
id_columns = ["customerID"]

# Feature matrix: everything except the target and any identifier columns.
X = df.drop(columns=["Churn"]).drop(columns=id_columns, errors="ignore")
# Binary target: 1 for "Yes", 0 for "No".
y = df["Churn"].map({"Yes": 1, "No": 0})

# Columns that get scaled; every remaining column is one-hot encoded.
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = [col for col in X.columns if col not in numeric_features]


## 🔀 Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## 🧹 Preprocessing Pipeline

In [None]:

# Standardise the numeric columns.
numeric_step = ("num", StandardScaler(), numeric_features)
# One-hot encode the categorical columns; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer([numeric_step, categorical_step])


## 🤖 Model Training & Evaluation

In [None]:

# Candidate classifiers, keyed by the display name used in the printed report
# and in the results dictionary.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # use_label_encoder is a deprecated (and later removed) XGBClassifier
    # argument; current releases ignore it with an "unused parameter" warning,
    # so it is no longer passed.
    "XGBoost": XGBClassifier(eval_metric="logloss"),
}

# Test-set accuracy per model name.
results = {}
for name, model in models.items():
    # Preprocessing is part of the pipeline, so the scaler and encoder are
    # fitted on the training rows only.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"\n{name} Accuracy: {acc:.2f}")
    # Per-class precision / recall / F1 alongside the headline accuracy.
    print(classification_report(y_test, y_pred))


## 📈 Accuracy Comparison

In [None]:

# Bar chart of the test accuracy recorded for each model.
model_names = list(results.keys())
model_accuracies = list(results.values())

plt.figure(figsize=(8, 5))
accuracy_ax = sns.barplot(x=model_names, y=model_accuracies)
accuracy_ax.set_title("Model Accuracy Comparison")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
accuracy_ax.grid(True)
plt.show()
