<a href="https://colab.research.google.com/github/arnavdesai6143/Datathon_TM126/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
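"""
CTG Fetal Distress Classification - Training Script

Extracted from the main notebook. Loads the cleaned CTG data from
``ctg_cleaned.csv``, tunes four oversampling-based classification pipelines
(logistic regression, random forest, gradient boosting, MLP) with a
randomized hyperparameter search, and saves the best estimator of each
under the ``models/`` directory.
"""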

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

# Scikit-learn imports
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate,
    RandomizedSearchCV, GridSearchCV
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    balanced_accuracy_score, f1_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support
)
from sklearn.inspection import permutation_importance

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Imbalanced-learn imports
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

# Global runtime setup: silence every warning category, apply the seaborn
# "whitegrid" theme, and set the default figure resolution.
warnings.filterwarnings(action='ignore')
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

# Startup banner framed by two 70-character rules.
print(
    "=" * 70,
    "CTG FETAL DISTRESS CLASSIFICATION - MODEL PIPELINE",
    "=" * 70,
    sep="\n",
)

# ============================================================================
# SECTION 2: LOAD YOUR CLEANED DATA
# ============================================================================
# Read the CSV, take the NSP column (cast to int) as the target, and keep
# only the numeric columns of what remains as the feature matrix.
cleaned_df = pd.read_csv('ctg_cleaned.csv')
y = cleaned_df['NSP'].astype(int)
X = (
    cleaned_df
    .drop(columns=['NSP'], errors='ignore')
    .select_dtypes(include=[np.number])
)

print("\n📊 Dataset Overview:")
print(f"   Total samples: {len(X)}")
print(f"   Features: {X.shape[1]}")
print("   Class distribution:")
# value_counts() tallies every class in one pass; sort_index() lists the
# classes in ascending label order.
for cls, count in y.value_counts().sort_index().items():
    pct = count / len(y) * 100
    print(f"      Class {cls}: {count} ({pct:.1f}%)")

# ============================================================================
# SECTION 3: TRAIN/TEST SPLIT (STRATIFIED)
# ============================================================================
# Seed shared by the split, the resamplers, the classifiers and the search.
RANDOM_STATE = 42
# Fraction of rows held out as the test partition.
TEST_SIZE = 0.20

# Stratify on the target so both partitions follow its class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

n_train, n_test = len(X_train), len(X_test)
print(f"\n✓ Train/Test Split: {n_train} train, {n_test} test")

# ============================================================================
# SECTION 4: DEFINE MODEL PIPELINES WITH IMBALANCE HANDLING
# ============================================================================
print("", "=" * 70, "BUILDING MODEL PIPELINES", "=" * 70, sep="\n")

# Seeded, shuffled 5-fold stratified splitter used as the cross-validation
# scheme for the hyperparameter searches.
cv_strategy = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Registry keyed by model name. Each entry stores a 'pipeline', its
# 'param_grid', and the number of 'search_iterations' to sample.
models = dict()

# Logistic regression: standardise the features, oversample with SMOTE, then
# fit an L2-penalised lbfgs classifier with balanced class weights.
#
# NOTE: `multi_class` is deliberately not passed. scikit-learn deprecated the
# parameter in 1.5 and schedules its removal; with the lbfgs solver a
# multinomial model is already what gets fitted for targets with three or
# more classes, so omitting it preserves the model while keeping the script
# runnable on newer releases. (Assumes NSP has >= 3 classes — TODO confirm.)
models['Logistic_Regression'] = {
    'pipeline': ImbPipeline([
        ('scaler', StandardScaler()),
        # k_neighbors here is only the starting value; the search below
        # samples it from 'smote__k_neighbors'.
        ('smote', SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
        ('clf', LogisticRegression(
            solver='lbfgs',
            class_weight='balanced',
            max_iter=4000,
            random_state=RANDOM_STATE
        ))
    ]),
    # 6 * 1 * 3 = 18 possible combinations, of which 15 are sampled.
    'param_grid': {
        'clf__C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        'clf__penalty': ['l2'],
        'smote__k_neighbors': [3, 5, 7]
    },
    'search_iterations': 15
}

# Random forest fitted on SMOTE-resampled data (this pipeline has no scaler).
rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(
        bootstrap=True,
        min_samples_leaf=2,
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )),
])

# Candidate values for the search; the 'clf__' prefix targets the 'clf' step.
rf_param_grid = {
    'clf__n_estimators': [500, 800, 1000, 1200],
    'clf__max_depth': [None, 20, 30, 40],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2'],
}

models['Random_Forest'] = dict(
    pipeline=rf_pipeline,
    param_grid=rf_param_grid,
    search_iterations=20,
)

# Gradient boosting fitted on data resampled with BorderlineSMOTE.
gb_pipeline = ImbPipeline(steps=[
    ('smote', BorderlineSMOTE(random_state=RANDOM_STATE)),
    ('clf', GradientBoostingClassifier(
        subsample=0.8,
        random_state=RANDOM_STATE,
    )),
])

# Candidate values for the search; the 'clf__' prefix targets the 'clf' step.
gb_param_grid = {
    'clf__n_estimators': [200, 300, 400, 500],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [3, 5, 7],
    'clf__min_samples_split': [2, 5, 10],
    'clf__subsample': [0.8, 0.9, 1.0],
}

models['Gradient_Boosting'] = dict(
    pipeline=gb_pipeline,
    param_grid=gb_param_grid,
    search_iterations=20,
)

# Multi-layer perceptron: standardise, oversample with SMOTE, then train a
# ReLU network with early stopping on a 15% validation fraction.
mlp_pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('clf', MLPClassifier(
        activation='relu',
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.15,
        random_state=RANDOM_STATE,
    )),
])

# Candidate values for the search; the 'clf__' prefix targets the 'clf' step.
mlp_param_grid = {
    'clf__hidden_layer_sizes': [(128, 64), (256, 128, 64), (128, 64, 32), (256, 128)],
    'clf__alpha': [1e-5, 1e-4, 1e-3, 1e-2],
    'clf__learning_rate_init': [1e-3, 5e-4, 1e-4],
    'clf__batch_size': [64, 128, 256],
}

models['MLP_Neural_Network'] = dict(
    pipeline=mlp_pipeline,
    param_grid=mlp_param_grid,
    search_iterations=18,
)

n_pipelines = len(models)
print(f"✓ Created {n_pipelines} model pipelines")

# ============================================================================
# SECTION 5: HYPERPARAMETER TUNING WITH CROSS-VALIDATION
# ============================================================================
print(
    "",
    "=" * 70,
    "TRAINING WITH CROSS-VALIDATION & HYPERPARAMETER SEARCH",
    "=" * 70,
    sep="\n",
)

# Settings common to every search: macro-F1 scoring, the shared CV splitter,
# all available cores, and a refit of the best candidate.
shared_search_kwargs = dict(
    scoring='f1_macro',
    cv=cv_strategy,
    n_jobs=-1,
    refit=True,
    random_state=RANDOM_STATE,
    verbose=1,
)

# Maps each model name to the refitted best pipeline from its search.
best_estimators = {}

for name, model_config in models.items():
    print(f"\nTraining: {name}")

    search = RandomizedSearchCV(
        model_config['pipeline'],
        model_config['param_grid'],
        n_iter=model_config['search_iterations'],
        **shared_search_kwargs,
    )

    # fit() returns the search object itself, so the winner can be read
    # straight off the call.
    best_estimators[name] = search.fit(X_train, y_train).best_estimator_

# ============================================================================
# SAVE TRAINED MODELS
# ============================================================================
# Persist every tuned pipeline with joblib as models/<name>_model.pkl,
# creating the directory first if it does not exist yet.
os.makedirs('models', exist_ok=True)

print("\n💾 Saving trained models...")
for name, estimator in best_estimators.items():
    model_path = 'models/' + name + '_model.pkl'
    joblib.dump(estimator, filename=model_path)
    print("   ✓ Saved " + model_path)

print("\n✓ Training complete! All models saved.")