In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

# 1. FEATURE ENGINEERING

def engineer_features(X):
    """Return a copy of ``X`` with missing-value flags, derived features and
    log-transformed versions of skewed numeric columns appended.

    Every statistic used here (the fill medians, the medians behind
    ``price_sens`` and the skew test that decides which ``*_log`` columns
    exist) is computed from ``X`` itself. Two frames with the same schema can
    therefore come back with different sets of ``*_missing`` / ``*_log``
    columns; callers must align them before feeding a fitted model.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``sessions_per_week``, ``avg_session_duration_min``,
        ``features_used_pct``, ``days_since_last_login``,
        ``num_support_tickets_90d``, ``unresolved_tickets``,
        ``avg_ticket_resolution_hrs``, ``app_crash_count_30d``,
        ``subscription_duration_days``, ``monthly_price_inr``,
        ``competitor_app_installed`` and ``notification_opt_in``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns (missing values left in place), followed by the
        ``<col>_missing`` flags, the engineered features and the ``<col>_log``
        columns.
    """
    X_new = X.copy()

    # Flag missing values before anything is filled so the information that a
    # value was absent survives as its own 0/1 feature.
    for col in X_new.columns:
        if X_new[col].isnull().any():
            X_new[f'{col}_missing'] = X_new[col].isnull().astype(int)

    # Filled working copy used only as input to the derived features below;
    # the columns kept in X_new retain their NaNs.
    X_f = X_new.fillna(X_new.median(numeric_only=True)).fillna('unknown')

    X_new['engagement_score'] = X_f['sessions_per_week'] * X_f['avg_session_duration_min'] * (X_f['features_used_pct']/100)
    X_new['recency'] = 1 / (X_f['days_since_last_login'] + 1)
    # A resolution time of 0 is replaced by 1 to avoid dividing by zero.
    X_new['support_pain'] = X_f['num_support_tickets_90d'] * (1 + X_f['unresolved_tickets']) / X_f['avg_ticket_resolution_hrs'].replace(0, 1)
    X_new['stability'] = 1 / (X_f['app_crash_count_30d'] + 1)
    X_new['value_ratio'] = X_f['subscription_duration_days'] / (X_f['monthly_price_inr'] + 1)
    # 1 only when the price is above the median AND usage is below the median.
    X_new['price_sens'] = ((X_f['monthly_price_inr'] > X_f['monthly_price_inr'].median()).astype(int) *
                           (X_f['sessions_per_week'] < X_f['sessions_per_week'].median()).astype(int))
    X_new['tech_issues'] = (X_f['app_crash_count_30d'] > 0).astype(int) + (X_f['num_support_tickets_90d'] > 0).astype(int)

    # `~` is a *bitwise* NOT: it only acts as a logical negation on boolean
    # columns and turns 0/1 integers into -1/-2. `1 - int` gives the intended
    # 0/1 "opted out" flag for both boolean and 0/1 integer input.
    opted_out = 1 - X_f['notification_opt_in'].astype(int)
    X_new['competitor_risk'] = X_f['competitor_app_installed'].astype(int) * 2 + opted_out

    # Add a log1p version of every non-negative numeric column whose skewness
    # exceeds 1 in absolute value. The column list is taken once up front, so
    # the freshly added *_log columns are not themselves re-transformed.
    for col in X_new.select_dtypes([np.number]).columns:
        if X_new[col].min() >= 0 and abs(X_new[col].skew()) > 1.0:
            X_new[col + '_log'] = np.log1p(X_new[col])

    return X_new

# 2. PIPELINE

def build_base_pipeline():
    """Assemble the unfitted preprocessing + classifier pipeline.

    Numeric columns are median-imputed and standardised; every other column
    is imputed with its most frequent value and one-hot encoded. Which
    columns go where is decided at fit time from the dtypes of the frame
    handed to the pipeline. The final step, named ``model``, is a
    ``HistGradientBoostingClassifier`` seeded with ``random_state=42``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``preprocessor`` and ``model``.
    """

    def numeric_columns(frame):
        # Columns routed to the numeric branch.
        return frame.select_dtypes(include='number').columns

    def non_numeric_columns(frame):
        # Everything else (strings, booleans, ...) goes to the categorical branch.
        return frame.select_dtypes(exclude='number').columns

    numeric_branch = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ])

    categorical_branch = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are encoded as all zeros rather than raising.
        ('encode', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)),
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_branch, numeric_columns),
        ('cat', categorical_branch, non_numeric_columns),
    ])

    classifier = HistGradientBoostingClassifier(random_state=42)

    steps = [
        ('preprocessor', preprocessor),
        ('model', classifier),
    ]
    return Pipeline(steps)

# MAIN EXECUTION

def align_test_features(train_features, test_features):
    """Return ``test_features`` reshaped to the exact columns of ``train_features``.

    ``engineer_features`` derives its ``*_missing`` and ``*_log`` columns from
    the frame it is given, so the train and test frames can end up with
    different column sets. This helper reconciles them:

    * a train-only column ending in ``_log`` is recomputed as ``log1p`` of the
      test frame's own base column (zero-filling it would wrongly claim the
      base value is 0);
    * any other train-only column (e.g. a ``*_missing`` flag for a column that
      has no gaps in the test data) is filled with 0;
    * test-only columns are dropped and the train column order is applied.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Engineered training frame; only its column names and order are used.
    test_features : pandas.DataFrame
        Engineered test frame. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, as
        ``train_features``.
    """
    log_suffix = '_log'
    aligned = test_features.copy()
    absent = [col for col in train_features.columns if col not in aligned.columns]

    # Pass 1: plain columns first, so that a base column such as `x_missing`
    # exists before `x_missing_log` is derived from it in pass 2.
    for col in absent:
        if not col.endswith(log_suffix):
            aligned[col] = 0

    # Pass 2: rebuild log features from the test data itself.
    for col in absent:
        if not col.endswith(log_suffix):
            continue
        base = col[:-len(log_suffix)]
        if base in aligned.columns and pd.api.types.is_numeric_dtype(aligned[base]):
            # Train only log-transforms non-negative columns; clip so a stray
            # negative test value cannot produce NaN/-inf.
            aligned[col] = np.log1p(aligned[base].clip(lower=0))
        else:
            aligned[col] = 0

    # Selecting by the train columns drops test-only columns and reorders.
    return aligned[list(train_features.columns)]


if __name__ == "__main__":
    
    print("TRAIN ON TRAIN_B, PREDICT ON TEST_B")

    
    train_df = pd.read_csv('train_B.csv')
    test_df = pd.read_csv('test_B.csv')
    
    # 'id' is an identifier, 'churn_reason' is the target; neither is a feature.
    X_train = train_df.drop(columns=['churn_reason', 'id'])
    y = train_df['churn_reason']
    
    # Encode the class labels as integers; decoded again before writing predictions.
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    
    print("Engineering features...")
    X_train_eng = engineer_features(X_train)
    X_test_eng = engineer_features(test_df.drop(columns=['id']))

    # engineer_features decides its *_missing / *_log columns per frame, so the
    # test frame has to be brought to the training layout before predicting.
    X_test_eng = align_test_features(X_train_eng, X_test_eng)
    
    print("Building base pipeline...")
    base_pipeline = build_base_pipeline()
    
    # 2 * 2 * 2 * 2 * 3 = 48 candidate settings for the 'model' step.
    param_grid = {
        'model__max_iter': [300, 500],
        'model__learning_rate': [0.02, 0.05],
        'model__max_depth': [5, 7],
        'model__min_samples_leaf': [5, 10],
        'model__l2_regularization': [0.0, 0.5, 1.0]
    }
    
    # Stratified, shuffled 5-fold CV scored by macro-averaged F1.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        base_pipeline,
        param_grid,
        cv=cv,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=1
    )
    
    print("Starting hyperparameter tuning...")
    grid_search.fit(X_train_eng, y_enc)
    
    print("\nBest parameters:")
    print(grid_search.best_params_)
    print(f"Best CV Macro F1: {grid_search.best_score_:.4f}")
    
    # With the default refit, best_estimator_ is the best setting retrained on all training rows.
    best_model = grid_search.best_estimator_
    
    preds_enc = best_model.predict(X_test_eng)
    preds = le.inverse_transform(preds_enc)
    
    submission = pd.DataFrame({
        'id': test_df['id'],
        'prediction': preds
    })
    
    submission.to_csv('predictions.csv', index=False)
    print("Saved predictions.csv")

TRAIN ON TRAIN_B, PREDICT ON TEST_B
Engineering features...
Building base pipeline...
Starting hyperparameter tuning...
Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best parameters:
{'model__l2_regularization': 1.0, 'model__learning_rate': 0.02, 'model__max_depth': 5, 'model__max_iter': 300, 'model__min_samples_leaf': 5}
Best CV Macro F1: 0.5596
Saved predictions.csv
