<a href="https://colab.research.google.com/github/areesha-del/AI-ML-Hands-on/blob/main/Week_3%2CDay(1)_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**CHURN DATASET**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# ==============================
# Load & Prepare Data
# ==============================
# Colab upload location of the churn dataset.
CHURN_CSV = "/content/churn.csv"

churn_raw = pd.read_csv(CHURN_CSV)

# Encode the target: "True." -> 1, "False." -> 0.
churn_labels = churn_raw["Churn?"].map({"True.": 1, "False.": 0})

# Any label outside the mapping becomes NaN; fail here with a clear message
# instead of letting NaN targets reach the model further down.
unmapped = churn_labels.isna()
if unmapped.any():
    bad_values = churn_raw.loc[unmapped, "Churn?"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Churn?' column: {bad_values}")

# Build the modelling frame without mutating the raw frame: replace the text
# label with the encoded "Churn" column (appended last), then keep only
# numeric columns.
df = (
    churn_raw
    .drop(columns="Churn?")
    .assign(Churn=churn_labels.astype(int))
    .select_dtypes(include=[np.number])
)

# ==============================
# Correlation-Based Feature Drop
# ==============================
# Features whose absolute Pearson correlation with the target is below this
# value are removed for the "Clean & Lean" model.
CORR_THRESHOLD = 0.20

X_all = df.drop(columns="Churn")
y_clean = df["Churn"]

# Split BEFORE measuring correlation so the held-out rows never influence
# which features are kept; selecting features on the full frame leaks test
# information into the model. The split arguments are identical to the ones
# used for the full-feature model.
Xa_train, Xa_test, yc_train, yc_test = train_test_split(
    X_all, y_clean, test_size=0.2, random_state=42, stratify=y_clean
)

# Absolute correlation of every feature with the target, training rows only.
corr = Xa_train.corrwith(yc_train).abs()

# A NaN correlation compares False against the threshold, so such a column
# is kept rather than dropped.
low_corr_cols = corr[corr < CORR_THRESHOLD].index.tolist()

print(f"\nDropped Columns (Correlation < {CORR_THRESHOLD:.2f}):")
print(low_corr_cols)

if len(low_corr_cols) == X_all.shape[1]:
    raise ValueError(
        "Correlation filter removed every feature; lower CORR_THRESHOLD."
    )

df_clean = df.drop(columns=low_corr_cols)

# ==============================
# Train/Test Split (Clean Model)
# ==============================
X_clean = df_clean.drop(columns="Churn")

# Reuse the split made above and keep only the selected features.
Xc_train = Xa_train.drop(columns=low_corr_cols)
Xc_test = Xa_test.drop(columns=low_corr_cols)

# ==============================
# Clean & Lean Model
# ==============================
rf_clean = RandomForestClassifier(random_state=42)
rf_clean.fit(Xc_train, yc_train)

yc_pred = rf_clean.predict(Xc_test)
f1_clean = f1_score(yc_test, yc_pred)

print("F1 Score (Clean & Lean Model):", f1_clean)

# ==============================
# Full Feature Model
# ==============================
# Baseline: every numeric column in df is used as a feature.
y_full = df["Churn"]
X_full = df.drop(columns="Churn")

# Stratified 80/20 hold-out split with a fixed seed.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
    stratify=y_full,
)

# Default-parameter forest; fit() returns the fitted estimator itself.
rf_full = RandomForestClassifier(random_state=42).fit(Xf_train, yf_train)

# Score on the held-out rows.
yf_pred = rf_full.predict(Xf_test)
f1_full = f1_score(yf_test, yf_pred)

print("F1 Score (Full Model - No Drop):", f1_full)

# ==============================
# Hyperparameter Tuning
# ==============================
# Candidate values sampled by the randomized search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 20 sampled configurations, each scored by 5-fold cross-validated F1,
# run on all available cores with a fixed seed.
base_forest = RandomForestClassifier(random_state=42)
search = RandomizedSearchCV(
    base_forest,
    param_distributions=param_grid,
    n_iter=20,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Search on the full-feature training split only.
search.fit(Xf_train, yf_train)

# Evaluate the best estimator found by the search on the held-out rows.
best_full_model = search.best_estimator_
yf_tuned_pred = best_full_model.predict(Xf_test)
f1_full_tuned = f1_score(yf_test, yf_tuned_pred)

print("F1 Score (Full + Tuned Model):", f1_full_tuned)
print("Best Parameters:", search.best_params_)

# ==============================
# Final Comparison
# ==============================
# Side-by-side F1 scores of the three models, in the order they were trained.
print("\n--- FINAL COMPARISON ---")
for label, score in (
    ("Clean & Lean F1:", f1_clean),
    ("Full Model F1:", f1_full),
    ("Full + Tuned F1:", f1_full_tuned),
):
    print(label, score)



Dropped Columns (Correlation < 0.20):
['Account Length', 'Area Code', 'VMail Message', 'Day Calls', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge']
F1 Score (Clean & Lean Model): 0.4021164021164021
F1 Score (Full Model - No Drop): 0.6233766233766234
F1 Score (Full + Tuned Model): 0.6410256410256411
Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}

--- FINAL COMPARISON ---
Clean & Lean F1: 0.4021164021164021
Full Model F1: 0.6233766233766234
Full + Tuned F1: 0.6410256410256411
