In [1]:
# =====================================
# STAGE-1 v3 : UNIVERSAL AUTO INSIGHTS
# =====================================

import pandas as pd
import numpy as np

# =============================
# 1Ô∏è‚É£ LOAD DATA
# =============================
df = pd.read_csv("/content/train.csv")

print("\nüìä DATASET OVERVIEW")
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

# Speed fix for large data
if len(df) > 50000:
    df = df.sample(50000, random_state=42)
    print("‚ö° Sampled to 50,000 rows")

# =============================
# 2Ô∏è‚É£ TARGET INPUT
# =============================
target = input("Enter target column name: ")

if target not in df.columns:
    raise ValueError("‚ùå Target column not found")

# =============================
# 3Ô∏è‚É£ DATA HEALTH CHECK
# =============================
print("\nüß™ DATA HEALTH CHECK")

missing_pct = (df.isnull().sum() / len(df)) * 100
high_missing = missing_pct[missing_pct > 20]

print("Columns with >20% missing:")
print(high_missing.sort_values(ascending=False))
print("Duplicate Rows:", df.duplicated().sum())

# =============================
# 4Ô∏è‚É£ DETECT PROBLEM TYPE
# =============================
if df[target].dtype == "object" or df[target].nunique() <= 10:
    problem_type = "classification"
else:
    problem_type = "regression"

print("\nüéØ Problem Type Detected:", problem_type)

# =============================
# 5Ô∏è‚É£ TARGET INSIGHTS
# =============================
print("\nüéØ TARGET INSIGHTS")

print("Type:", df[target].dtype)
print("Unique:", df[target].nunique())

if problem_type == "classification":
    print("\nClass Distribution:")
    print(df[target].value_counts())

    print("\nClass Percentage:")
    print(round(df[target].value_counts(normalize=True) * 100, 2))

else:
    print("Mean:", round(df[target].mean(), 2))
    print("Median:", df[target].median())
    print("Skewness:", round(df[target].skew(), 2))

# =============================
# 6Ô∏è‚É£ NUMERIC CORRELATION
# =============================
print("\nüìà TOP NUMERIC DRIVERS")

numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

if target in numeric_cols:
    numeric_cols.remove(target)

if problem_type == "classification":
    df_temp = df.copy()
    df_temp[target] = df_temp[target].astype("category").cat.codes
    correlations = (
        df_temp[numeric_cols]
        .corrwith(df_temp[target])
        .abs()
        .sort_values(ascending=False)
    )
else:
    correlations = (
        df[numeric_cols]
        .corrwith(df[target])
        .abs()
        .sort_values(ascending=False)
    )

print(correlations.head(5))

# =============================
# 7Ô∏è‚É£ SEGMENT INSIGHTS (BOTH TYPES)
# =============================
print("\nüì¶ SEGMENT INSIGHTS")

cat_cols = df.select_dtypes(include=["object", "category"]).columns

for col in cat_cols:
    if col != target and df[col].nunique() <= 15:
        print(f"\nüîπ {col} impact on {target}")

        if problem_type == "classification":
            print(
                df.groupby(col)[target]
                .value_counts(normalize=True)
                .unstack()
                .fillna(0)
                .round(2)
            )
        else:
            print(
                df.groupby(col)[target]
                .mean()
                .sort_values(ascending=False)
                .round(2)
            )

# =============================
# 8Ô∏è‚É£ AUTO BUSINESS FLAGS
# =============================
print("\nüí° AUTO BUSINESS FLAGS")

if problem_type == "classification":
    imbalance = df[target].value_counts(normalize=True).max()
    if imbalance > 0.75:
        print("‚úî Class imbalance detected ‚Üí Use F1 / Recall / ROC-AUC")

else:
    if abs(df[target].skew()) > 1:
        print("‚úî Target skewed ‚Üí Consider log transformation")

if len(high_missing) > 0:
    print("‚úî Data cleaning required")

print("\nüèÅ STAGE-1 COMPLETE")


üìä DATASET OVERVIEW
Rows: 7043
Columns: 21
Enter target column name: Churn

üß™ DATA HEALTH CHECK
Columns with >20% missing:
Series([], dtype: float64)
Duplicate Rows: 0

üéØ Problem Type Detected: classification

üéØ TARGET INSIGHTS
Type: object
Unique: 2

Class Distribution:
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Class Percentage:
Churn
No     73.46
Yes    26.54
Name: proportion, dtype: float64

üìà TOP NUMERIC DRIVERS
tenure            0.352229
MonthlyCharges    0.193356
SeniorCitizen     0.150889
dtype: float64

üì¶ SEGMENT INSIGHTS

üîπ gender impact on Churn
Churn     No   Yes
gender            
Female  0.73  0.27
Male    0.74  0.26

üîπ Partner impact on Churn
Churn      No   Yes
Partner            
No       0.67  0.33
Yes      0.80  0.20

üîπ Dependents impact on Churn
Churn         No   Yes
Dependents            
No          0.69  0.31
Yes         0.85  0.15

üîπ PhoneService impact on Churn
Churn           No   Yes
PhoneService            
No    

In [2]:
# ============================================
# UNIVERSAL STAGE-2 (INDUSTRY LEVEL VERSION)
# Works for Classification & Regression
# ============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, mean_absolute_error, mean_squared_error, r2_score
)

# -----------------------------
# 1Ô∏è‚É£ LOAD DATASET
# -----------------------------
df = pd.read_csv("/content/train.csv")   # üî• CHANGE FILE NAME
target = "Churn"   # üî• CHANGE TARGET COLUMN

# -----------------------------
# 2Ô∏è‚É£ REMOVE ID COLUMN IF EXISTS
# -----------------------------
for col in df.columns:
    if "id" in col.lower():
        df = df.drop(columns=[col])

# -----------------------------
# 3Ô∏è‚É£ HANDLE MISSING VALUES (INDUSTRY SAFE)
# -----------------------------
# Convert numeric-like object columns first
for col in df.columns:
    if df[col].dtype == "object":
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            pass

# Fill numeric columns with median
for col in df.select_dtypes(include=["int64", "float64"]).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# -----------------------------
# 4Ô∏è‚É£ DETECT PROBLEM TYPE
# -----------------------------
if df[target].dtype == "object" or df[target].nunique() <= 10:
    problem_type = "classification"
else:
    problem_type = "regression"

print("Detected Problem Type:", problem_type)

# -----------------------------
# 5Ô∏è‚É£ SEPARATE TARGET
# -----------------------------
y = df[target]
X = df.drop(columns=[target])

# Encode only features
X = pd.get_dummies(X, drop_first=True)

# Encode classification target if needed
if problem_type == "classification" and y.dtype == "object":
    y = y.astype("category").cat.codes

# -----------------------------
# 6Ô∏è‚É£ TRAIN TEST SPLIT
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 7Ô∏è‚É£ SCALING (For Linear Models)
# -----------------------------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)

# -----------------------------
# 8Ô∏è‚É£ MODEL SELECTION
# -----------------------------
if problem_type == "classification":
    models = {
        "Logistic": LogisticRegression(max_iter=5000),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42)
    }
else:
    models = {
        "Linear": LinearRegression(),
        "RandomForest": RandomForestRegressor(n_estimators=300, random_state=42)
    }

results = {}

# -----------------------------
# 9Ô∏è‚É£ TRAIN & EVALUATE
# -----------------------------
for name, model in models.items():

    if name in ["Logistic", "Linear"]:
        model.fit(X_train_s, y_train)
        preds = model.predict(X_val_s)
    else:
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

    if problem_type == "classification":

        if name == "Logistic":
            probs = model.predict_proba(X_val_s)[:, 1]
        else:
            probs = model.predict_proba(X_val)[:, 1]

        results[name] = {
            "Accuracy": accuracy_score(y_val, preds),
            "Precision": precision_score(y_val, preds),
            "Recall": recall_score(y_val, preds),
            "F1": f1_score(y_val, preds),
            "ROC_AUC": roc_auc_score(y_val, probs)
        }

    else:
        results[name] = {
            "MAE": mean_absolute_error(y_val, preds),
            "RMSE": np.sqrt(mean_squared_error(y_val, preds)),
            "R2": r2_score(y_val, preds)
        }

results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n")
display(results_df)

# -----------------------------
# üîü SELECT BEST MODEL
# -----------------------------
if problem_type == "classification":
    best_model_name = results_df["ROC_AUC"].idxmax()
else:
    best_model_name = results_df["R2"].idxmax()

print("\nBest Model:", best_model_name)

# -----------------------------
# 1Ô∏è‚É£1Ô∏è‚É£ FEATURE IMPORTANCE
# -----------------------------
if "RandomForest" in best_model_name:

    if problem_type == "classification":
        final_model = RandomForestClassifier(n_estimators=300, random_state=42)
    else:
        final_model = RandomForestRegressor(n_estimators=300, random_state=42)

    final_model.fit(X_train, y_train)

    importance = pd.Series(
        final_model.feature_importances_,
        index=X.columns
    ).sort_values(ascending=False)

    print("\nTop 10 Important Features:")
    display(importance.head(10))

Detected Problem Type: classification

Model Comparison:



Unnamed: 0,Accuracy,Precision,Recall,F1,ROC_AUC
Logistic,0.784244,0.610932,0.509383,0.555556,0.817786
RandomForest,0.801278,0.683794,0.463807,0.552716,0.847012



Best Model: RandomForest

Top 10 Important Features:


Unnamed: 0,0
tenure,0.117913
MonthlyCharges,0.091141
PaymentMethod_Electronic check,0.028271
Contract_Two year,0.025045
InternetService_Fiber optic,0.024724
OnlineSecurity_Yes,0.022782
TechSupport_Yes,0.021107
PaperlessBilling_Yes,0.019791
Contract_One year,0.019459
gender_Male,0.018776
