In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer ,SimpleImputer
from sklearn.metrics import roc_auc_score, log_loss
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [16]:
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (15000, 20)
Test shape: (10000, 19)


In [17]:
# Check for the target variable distribution
print("\nTarget variable distribution:")
print(train_df['Status'].value_counts(normalize=True) * 100)


Target variable distribution:
Status
C     67.340000
D     30.246667
CL     2.413333
Name: proportion, dtype: float64


In [18]:
# Exploratory Data Analysis
def analyze_missing_values(df, name):
    missing = df.isnull().sum()
    missing_pct = missing / len(df) * 100
    print(f"\nMissing values in {name} dataset:")
    for col, pct in zip(missing.index, missing_pct):
        if pct > 0:
            print(f"{col}: {pct:.2f}%")

In [19]:
analyze_missing_values(train_df, 'train')
analyze_missing_values(test_df, 'test')


Missing values in train dataset:
Drug: 43.67%
Ascites: 43.61%
Hepatomegaly: 43.67%
Spiders: 43.69%
Cholesterol: 55.58%
Copper: 44.27%
Alk_Phos: 43.70%
SGOT: 43.71%
Tryglicerides: 55.90%
Platelets: 3.85%
Prothrombin: 0.12%

Missing values in test dataset:
Drug: 42.84%
Ascites: 42.82%
Hepatomegaly: 42.87%
Spiders: 42.89%
Cholesterol: 55.47%
Copper: 43.58%
Alk_Phos: 42.91%
SGOT: 42.92%
Tryglicerides: 55.81%
Platelets: 3.63%
Prothrombin: 0.16%


In [20]:
# Separate features and target
X_train = train_df.drop('Status', axis=1).copy()
y_train = train_df['Status'].copy()
X_test = test_df.copy()

In [23]:
# Preprocess the data
def preprocess_data(X_train, y_train, X_test):
    # Encode categorical target
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_train)
    print(f"\nEncoded target classes: {label_encoder.classes_}")
    
    # Keep track of original indices
    X_train['original_index'] = X_train.index
    X_test['original_index'] = X_test.index
    
    # Identify data types
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
    numeric_cols.remove('original_index')  # Remove the index column we added
    
    print(f"\nCategorical columns: {categorical_cols}")
    print(f"Numeric columns: {numeric_cols}")
    
    # Strategy for handling missing values:
    # 1. For categorical: impute with most frequent value
    # 2. For numerical: use KNN imputation
    
    # Create pipeline for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Use most frequent value for categorical imputation
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    # Create pipeline for numerical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])
    
    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            ('num', numeric_transformer, numeric_cols)
        ])
    
    # Apply preprocessing
    print("\nPreprocessing data...")
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    
    # Get feature names after preprocessing
    categorical_feature_names = []
    if categorical_cols:
        ohe = preprocessor.named_transformers_['cat'].named_steps['encoder']
        categorical_feature_names = ohe.get_feature_names_out(categorical_cols).tolist()
    
    numeric_feature_names = numeric_cols
    all_feature_names = categorical_feature_names + numeric_feature_names
    
    print(f"Processed feature count: {len(all_feature_names)}")
    
    return X_train_processed, y_encoded, X_test_processed, label_encoder, all_feature_names, X_train['original_index'], X_test['original_index']


In [24]:
# Apply preprocessing
X_train_processed, y_encoded, X_test_processed, label_encoder, feature_names, train_indices, test_indices = preprocess_data(X_train, y_train, X_test)



Encoded target classes: ['C' 'CL' 'D']

Categorical columns: ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
Numeric columns: ['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']

Preprocessing data...
Processed feature count: 27


In [25]:
# Create validation set
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_processed, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded
)

print(f"\nTraining data shape: {X_train_final.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Test data shape: {X_test_processed.shape}")


Training data shape: (12000, 27)
Validation data shape: (3000, 27)
Test data shape: (10000, 27)


In [33]:
# Create and train models
def train_xgboost(X_train, y_train, X_val, y_val):
    print("\nTraining XGBoost model...")
    num_classes = len(np.unique(y_train))
    
    if num_classes == 2:
        objective = 'binary:logistic'
        eval_metric = 'logloss'
    else:
        objective = 'multi:softprob'
        eval_metric = 'mlogloss'
    
    model = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective=objective,
        eval_metric=eval_metric,
        random_state=42,
        use_label_encoder=False
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[xgb.callback.EarlyStopping(rounds=20, save_best=True)],
        verbose=False
    )
    
    
    # Get validation score
    if num_classes == 2:
        y_pred_val = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred_val)
        y_pred_val_tensor = np.array(y_pred_val).reshape(-1, 1)
        y_pred_val_complement = 1 - y_pred_val_tensor
        y_pred_val_probs = np.hstack((y_pred_val_complement, y_pred_val_tensor))
        loss = log_loss(y_val, y_pred_val_probs)
    else:
        y_pred_val = model.predict_proba(X_val)
        auc = roc_auc_score(y_val, y_pred_val, multi_class='ovr')
        loss = log_loss(y_val, y_pred_val)
    
    print(f"XGBoost - Validation AUC: {auc:.4f}, Log Loss: {loss:.4f}")
    return model, auc, loss


In [28]:
# LBGM 
def train_lightgbm(X_train, y_train, X_val, y_val):
    print("\nTraining LightGBM model...")
    num_classes = len(np.unique(y_train))
    
    if num_classes == 2:
        objective = 'binary'
        metric = 'binary_logloss'
    else:
        objective = 'multiclass'
        metric = 'multi_logloss'
    
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.05,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        objective=objective,
        random_state=42,
        metric=metric,
        num_class=num_classes if num_classes > 2 else 1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(20, verbose=False)]
    )
    
    # Get validation score
    if num_classes == 2:
        y_pred_val = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred_val)
        loss = log_loss(y_val, model.predict_proba(X_val))
    else:
        y_pred_val = model.predict_proba(X_val)
        auc = roc_auc_score(y_val, y_pred_val, multi_class='ovr')
        loss = log_loss(y_val, y_pred_val)
    
    print(f"LightGBM - Validation AUC: {auc:.4f}, Log Loss: {loss:.4f}")
    return model, auc, loss

In [29]:
def train_catboost(X_train, y_train, X_val, y_val):
    print("\nTraining CatBoost model...")
    num_classes = len(np.unique(y_train))
    
    if num_classes == 2:
        loss_function = 'Logloss'
    else:
        loss_function = 'MultiClass'
    
    model = cb.CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=3,
        loss_function=loss_function,
        eval_metric='AUC',
        random_seed=42,
        verbose=False
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=20,
        verbose=False
    )
    
    # Get validation score
    if num_classes == 2:
        y_pred_val = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred_val)
        loss = log_loss(y_val, model.predict_proba(X_val))
    else:
        y_pred_val = model.predict_proba(X_val)
        auc = roc_auc_score(y_val, y_pred_val, multi_class='ovr')
        loss = log_loss(y_val, y_pred_val)
    
    print(f"CatBoost - Validation AUC: {auc:.4f}, Log Loss: {loss:.4f}")
    return model, auc, loss

In [34]:
# Train all models
xgb_model, xgb_auc, xgb_loss = train_xgboost(X_train_final, y_train_final, X_val, y_val)
lgb_model, lgb_auc, lgb_loss = train_lightgbm(X_train_final, y_train_final, X_val, y_val)
cb_model, cb_auc, cb_loss = train_catboost(X_train_final, y_train_final, X_val, y_val)


Training XGBoost model...


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'callbacks'

In [None]:
# Create ensemble weights based on validation performance
# We use inverse of log loss as weight (lower loss = higher weight)
weights = np.array([1/xgb_loss, 1/lgb_loss, 1/cb_loss])
weights = weights / weights.sum()  # Normalize to sum to 1
print(f"\nEnsemble weights: XGBoost={weights[0]:.3f}, LightGBM={weights[1]:.3f}, CatBoost={weights[2]:.3f}")

In [None]:
num_classes = len(label_encoder.classes_)
if num_classes == 2:
    val_pred_xgb = xgb_model.predict_proba(X_val)[:, 1]
    val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
    val_pred_cb = cb_model.predict_proba(X_val)[:, 1]
    
    val_pred_ensemble = (
        weights[0] * val_pred_xgb +
        weights[1] * val_pred_lgb +
        weights[2] * val_pred_cb
    )
    
    # Evaluate ensemble
    ensemble_auc = roc_auc_score(y_val, val_pred_ensemble)
    # For log loss we need probabilities for both classes
    ensemble_probs = np.column_stack((1 - val_pred_ensemble, val_pred_ensemble))
    ensemble_loss = log_loss(y_val, ensemble_probs)
else:
    val_pred_xgb = xgb_model.predict_proba(X_val)
    val_pred_lgb = lgb_model.predict_proba(X_val)
    val_pred_cb = cb_model.predict_proba(X_val)
    
    val_pred_ensemble = (
        weights[0] * val_pred_xgb +
        weights[1] * val_pred_lgb +
        weights[2] * val_pred_cb
    )
    
    # Evaluate ensemble
    ensemble_auc = roc_auc_score(y_val, val_pred_ensemble, multi_class='ovr')
    ensemble_loss = log_loss(y_val, val_pred_ensemble)

print(f"\nEnsemble - Validation AUC: {ensemble_auc:.4f}, Log Loss: {ensemble_loss:.4f}")


In [None]:
# Make predictions on test set
if num_classes == 2:
    test_pred_xgb = xgb_model.predict_proba(X_test_processed)[:, 1]
    test_pred_lgb = lgb_model.predict_proba(X_test_processed)[:, 1]
    test_pred_cb = cb_model.predict_proba(X_test_processed)[:, 1]
    
    test_pred_ensemble = (
        weights[0] * test_pred_xgb +
        weights[1] * test_pred_lgb +
        weights[2] * test_pred_cb
    )
    
    # Convert to class probabilities
    test_pred_probs = np.column_stack((1 - test_pred_ensemble, test_pred_ensemble))
else:
    test_pred_xgb = xgb_model.predict_proba(X_test_processed)
    test_pred_lgb = lgb_model.predict_proba(X_test_processed)
    test_pred_cb = cb_model.predict_proba(X_test_processed)
    
    test_pred_ensemble = (
        weights[0] * test_pred_xgb +
        weights[1] * test_pred_lgb +
        weights[2] * test_pred_cb
    )
    
    test_pred_probs = test_pred_ensemble

# Create the submission DataFrame matching the required format
results_df = pd.DataFrame({
    'id': test_indices
})

# Add probability columns for each class with the exact column names from the submission format
for i, class_name in enumerate(label_encoder.classes_):
    results_df[f'Status_{class_name}'] = test_pred_probs[:, i]

# Sort by original index to maintain original order
results_df = results_df.sort_values('id').reset_index(drop=True)


In [None]:
print("\nFirst few predictions:")
print(results_df.head())

In [None]:
# Save predictions to CSV
results_df.to_csv('ensemble_predictions.csv', index=False)
print("\nPredictions saved to 'ensemble_predictions.csv' with columns:", results_df.columns.tolist())

In [None]:
# Create feature importance visualization for the base models
def print_feature_importance(model, model_name, feature_names):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        
        print(f"\nTop 10 features for {model_name}:")
        for i in range(min(10, len(feature_names))):
            idx = indices[i]
            if idx < len(feature_names):
                print(f"{feature_names[idx]}: {importances[idx]:.4f}")

# Print feature importances
print_feature_importance(xgb_model, "XGBoost", feature_names)
print_feature_importance(lgb_model, "LightGBM", feature_names)
print_feature_importance(cb_model, "CatBoost", feature_names)

In [None]:
# Cross-validation for more robust evaluation
def cross_validate_ensemble(X, y, n_splits=5):
    print("\nPerforming cross-validation...")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    cv_aucs = []
    cv_losses = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold+1}/{n_splits}")
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Train models
        xgb_model, xgb_auc, xgb_loss = train_xgboost(X_train_fold, y_train_fold, X_val_fold, y_val_fold)
        lgb_model, lgb_auc, lgb_loss = train_lightgbm(X_train_fold, y_train_fold, X_val_fold, y_val_fold)
        cb_model, cb_auc, cb_loss = train_catboost(X_train_fold, y_train_fold, X_val_fold, y_val_fold)
        
        # Weight models
        weights = np.array([1/xgb_loss, 1/lgb_loss, 1/cb_loss])
        weights = weights / weights.sum()
        
        # Make ensemble prediction
        num_classes = len(np.unique(y))
        if num_classes == 2:
            val_pred_xgb = xgb_model.predict_proba(X_val_fold)[:, 1]
            val_pred_lgb = lgb_model.predict_proba(X_val_fold)[:, 1]
            val_pred_cb = cb_model.predict_proba(X_val_fold)[:, 1]
            
            val_pred_ensemble = (
                weights[0] * val_pred_xgb +
                weights[1] * val_pred_lgb +
                weights[2] * val_pred_cb
            )
            
            # Evaluate ensemble
            fold_auc = roc_auc_score(y_val_fold, val_pred_ensemble)
            # For log loss we need probabilities for both classes
            ensemble_probs = np.column_stack((1 - val_pred_ensemble, val_pred_ensemble))
            fold_loss = log_loss(y_val_fold, ensemble_probs)
        else:
            val_pred_xgb = xgb_model.predict_proba(X_val_fold)
            val_pred_lgb = lgb_model.predict_proba(X_val_fold)
            val_pred_cb = cb_model.predict_proba(X_val_fold)
            
            val_pred_ensemble = (
                weights[0] * val_pred_xgb +
                weights[1] * val_pred_lgb +
                weights[2] * val_pred_cb
            )
            
            # Evaluate ensemble
            fold_auc = roc_auc_score(y_val_fold, val_pred_ensemble, multi_class='ovr')
            fold_loss = log_loss(y_val_fold, val_pred_ensemble)
        
        cv_aucs.append(fold_auc)
        cv_losses.append(fold_loss)
        print(f"Fold {fold+1} - AUC: {fold_auc:.4f}, Log Loss: {fold_loss:.4f}")
    
    print(f"\nCross-validation results:")
    print(f"Mean AUC: {np.mean(cv_aucs):.4f} ± {np.std(cv_aucs):.4f}")
    print(f"Mean Log Loss: {np.mean(cv_losses):.4f} ± {np.std(cv_losses):.4f}")


In [None]:
# Perform cross-validation
cross_validate_ensemble(X_train_processed, y_encoded)

print("\nModel training and evaluation complete!")