In [None]:
import pandas as pd
import numpy as np
import sqlite3
import mlflow
import dagshub
import os
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Initialise the DagsHub integration for this repository with MLflow
# support switched on (mlflow=True), before any runs are started below.
dagshub.init(
    repo_owner='abdsalam25',
    repo_name='churn-project',
    mlflow=True,
)

# Load the churn dataset. The CSV is looked for in the parent directory first
# and, if it is not there, in the current working directory.
# Only a missing file triggers the fallback: any other failure (malformed CSV,
# permission error, KeyboardInterrupt, ...) propagates instead of being hidden
# behind a second read attempt.
try:
    df = pd.read_csv("../churn_data.csv")
except FileNotFoundError:
    df = pd.read_csv("churn_data.csv")

# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce').
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges

# Encode the target as integers: 1 where Churn equals 'Yes', 0 for any other
# value (including missing ones).
churned = df['Churn'].eq('Yes')
df['Churn'] = churned.astype('int64')

# Columns used as model inputs.
features = [
    'gender',
    'SeniorCitizen',
    'tenure',
    'MonthlyCharges',
    'Partner',
    'Dependents',
    'PhoneService',
]
X = df[features]
y = df['Churn']

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is the same on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Preprocessing shared by every pipeline built below:
#   * 'num' columns are median-imputed and then standardised;
#   * 'cat' columns are one-hot encoded, with categories unseen during
#     fitting ignored at transform time (handle_unknown='ignore').
numeric_columns = ['tenure', 'MonthlyCharges']
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService']

numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

cleaner = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numeric_columns),
        ('cat', categorical_encoder, categorical_columns),
    ]
)

# Candidate classifiers keyed by display name. Each entry holds:
#   "model"  - the estimator instance placed in the pipeline's 'model' step;
#   "params" - the grid handed to GridSearchCV when tuning is enabled
#              (keys carry the 'model__' prefix to address that step).
models_config = {
    "Ridge": dict(
        model=RidgeClassifier(),
        params=dict(model__alpha=[0.1, 1.0]),
    ),
    "GradientBoosting": dict(
        model=GradientBoostingClassifier(),
        params=dict(model__n_estimators=[50], model__learning_rate=[0.1]),
    ),
    "XGBoost": dict(
        model=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        params=dict(model__n_estimators=[50], model__max_depth=[3]),
    ),
    "LightGBM": dict(
        model=LGBMClassifier(verbose=-1),
        params=dict(model__n_estimators=[50]),
    ),
}

# Run one MLflow-tracked experiment for every combination of
# (model, PCA on/off, grid-search tuning on/off).
#
# For each run the 3-fold cross-validated macro-F1 on the training split is
# logged as "cv_f1" and the macro-F1 on the held-out split as "test_f1".
# The fitted model of the very last run is written to ../model.joblib.
pca_options = [False, True]
tuning_options = [False, True]

# Sequential run counter (starts at 1); used in the run name and to detect
# the last run. It is not an MLflow experiment id.
experiment_id = 1

# Derived from the grid instead of a hard-coded count, so that adding or
# removing a model/option still saves the model of the final run.
total_runs = len(models_config) * len(pca_options) * len(tuning_options)

# The experiment is the same for every run, so select it once up front.
mlflow.set_experiment("Final_Project_Experiments")

for model_name, config in models_config.items():
    for use_pca in pca_options:
        for use_tuning in tuning_options:

            run_name = f"Exp{experiment_id}_{model_name}_PCA-{use_pca}_Tuned-{use_tuning}"
            print(f"Running {run_name}...")

            with mlflow.start_run(run_name=run_name):

                # Assemble preprocessing -> optional PCA -> classifier.
                steps = [('preprocessor', cleaner)]
                if use_pca:
                    steps.append(('pca', PCA(n_components=2)))
                steps.append(('model', config["model"]))
                pipeline = Pipeline(steps)

                if use_tuning:
                    # Grid search refits the best configuration on the full
                    # training split; best_score_ is its mean CV macro-F1.
                    search = GridSearchCV(pipeline, config["params"], cv=3, scoring='f1_macro')
                    search.fit(X_train, y_train)
                    score = search.best_score_
                    final_model = search.best_estimator_
                    # Record which hyperparameters were selected.
                    mlflow.log_params(search.best_params_)
                else:
                    # Fit on the full training split, then estimate the score
                    # with the same 3-fold macro-F1 used by the grid search.
                    pipeline.fit(X_train, y_train)
                    score = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='f1_macro').mean()
                    final_model = pipeline

                mlflow.log_param("model", model_name)
                mlflow.log_param("pca", use_pca)
                # Without this flag tuned and untuned runs can only be told
                # apart by their run name.
                mlflow.log_param("tuned", use_tuning)
                mlflow.log_metric("cv_f1", score)

                # Score the fitted model on the held-out split, which is
                # never seen during fitting or cross-validation.
                test_score = f1_score(y_test, final_model.predict(X_test), average='macro')
                mlflow.log_metric("test_f1", test_score)

                if experiment_id == total_runs:
                    joblib.dump(final_model, "../model.joblib")
                    print("Saved final model.")

            experiment_id += 1

print("DONE! Check DagsHub now.")

  return FileStore(store_uri, store_uri)
2025/12/15 17:27:51 INFO mlflow.tracking.fluent: Experiment with name 'Churn_Model_Exp' does not exist. Creating a new experiment.


DagsHub Creds set. Loading data...
Data loaded from CSV.
Training Complete! F1 Score: 0.5263157894736842
SUCCESS: model.joblib saved to project root.
