<a href="https://colab.research.google.com/github/armanfeili/novartis_datathon_2025/blob/Arman/notebooks/colab/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧬 Novartis Datathon 2025 - Main Training Notebook

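This notebook provides full access to the project pipeline:
1. **Environment Setup** - Mount Drive, clone repo, install dependencies
2. **Import Project Modules** - Import the `src/` package and model classes
3. **Load Configurations** - Load the data, feature, run, and model configs
4. **Data Loading & Exploration** - Load and explore the raw data
5. **Feature Engineering** - Build features from raw data
6. **Model Training** - Train models with cross-validation
7. **Evaluation & Analysis** - Analyze model performance
8. **Inference & Submission** - Generate predictions for submission
9. **Experiment Tracking & History** - List and compare past runs
10. **Utilities & Helpers** - Drive sync, upload/download, and GPU helpers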

---

## 1. Environment Setup

In [5]:
# Treat the session as Colab when the `google.colab` package is already loaded.
import sys

IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("‚ö†Ô∏è Not running in Colab - using local paths")
else:
    # Imported lazily: this module is only importable inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted successfully")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Google Drive mounted successfully


In [6]:
import os

# --- Configuration ---
REPO_URL = "https://github.com/armanfeili/novartis_datathon_2025.git"
BRANCH = "Arman"  # Change this if you are working on a different branch

# Paths depend on environment
if IN_COLAB:
    DRIVE_BASE = "/content/drive/MyDrive"
    PROJECT_PATH = f"{DRIVE_BASE}/novartis_datathon_2025"
    DATA_PATH = f"{DRIVE_BASE}/novartis-datathon-2025/data"
    ARTIFACTS_PATH = f"{DRIVE_BASE}/novartis-datathon-2025/artifacts"
    SUBMISSIONS_PATH = f"{DRIVE_BASE}/novartis-datathon-2025/submissions"
else:
    # Local development paths (relative to repo root)
    PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
    DATA_PATH = os.path.join(PROJECT_PATH, "data")
    ARTIFACTS_PATH = os.path.join(PROJECT_PATH, "artifacts")
    SUBMISSIONS_PATH = os.path.join(PROJECT_PATH, "submissions")

# ---------------------

if IN_COLAB:
    if not os.path.exists(PROJECT_PATH):
        print(f"üì• Cloning repository to {PROJECT_PATH}...")
        !git clone {REPO_URL} {PROJECT_PATH}
    else:
        print(f"üìÇ Repository exists at {PROJECT_PATH}. Pulling latest changes...")
        %cd {PROJECT_PATH}
        !git fetch origin {BRANCH}
        !git reset --hard origin/{BRANCH}

    %cd {PROJECT_PATH}

# Create required directories
for path in [DATA_PATH, ARTIFACTS_PATH, SUBMISSIONS_PATH]:
    os.makedirs(path, exist_ok=True)
    os.makedirs(os.path.join(path, "raw") if "data" in path else path, exist_ok=True)

print(f"\nüìÅ Project Path: {PROJECT_PATH}")
print(f"üìÅ Data Path: {DATA_PATH}")
print(f"üìÅ Artifacts Path: {ARTIFACTS_PATH}")
print(f"üìÅ Submissions Path: {SUBMISSIONS_PATH}")

üìÇ Repository exists at /content/drive/MyDrive/novartis_datathon_2025. Pulling latest changes...
/content/drive/MyDrive/novartis_datathon_2025
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 10 (delta 6), reused 10 (delta 6), pack-reused 0 (from 0)[K
Unpacking objects: 100% (10/10), 8.06 KiB | 26.00 KiB/s, done.
From https://github.com/armanfeili/novartis_datathon_2025
 * branch            Arman      -> FETCH_HEAD
   67c14aa..5c33709  Arman      -> origin/Arman
HEAD is now at 5c33709 project setup - 3
/content/drive/MyDrive/novartis_datathon_2025

üìÅ Project Path: /content/drive/MyDrive/novartis_datathon_2025
üìÅ Data Path: /content/drive/MyDrive/novartis-datathon-2025/data
üìÅ Artifacts Path: /content/drive/MyDrive/novartis-datathon-2025/artifacts
üìÅ Submissions Path: /content/drive/MyDrive/novartis-datathon-2025/submissions


In [7]:
# Install Dependencies
print("üì¶ Installing dependencies...")
!pip install -q -r requirements.txt

# Verify key packages
import importlib
packages = ['torch', 'numpy', 'pandas', 'lightgbm', 'xgboost', 'catboost', 'sklearn', 'yaml']
for pkg in packages:
    try:
        importlib.import_module(pkg if pkg != 'sklearn' else 'sklearn')
        print(f"  ‚úÖ {pkg}")
    except ImportError:
        print(f"  ‚ùå {pkg} - Installing...")
        !pip install -q {pkg if pkg != 'yaml' else 'pyyaml'}
        !pip install -q {pkg if pkg != 'sklearn' else 'scikit-learn'}

print("\n‚úÖ All dependencies installed!")

üì¶ Installing dependencies...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h  ‚úÖ torch
  ‚úÖ numpy
  ‚úÖ pandas
  ‚úÖ lightgbm
  ‚úÖ xgboost
  ‚úÖ catboost
  ‚úÖ sklearn
  ‚úÖ yaml

‚úÖ All dependencies installed!


## 2. Import Project Modules

Import all necessary modules from the `src/` package.

In [8]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Put the current working directory at the front of the import path.
# Assumes the working directory is the project root, so that `src` resolves.
working_dir = os.getcwd()
if working_dir not in sys.path:
    sys.path.insert(0, working_dir)

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import yaml
import json
import logging

# Project imports
from src.utils import load_config, set_seed, setup_logging, timer, get_device
from src.data import DataManager
from src.features import FeatureEngineer
from src.validation import Validator
from src.evaluate import Evaluator
from src.train import run_experiment, get_model_class

# Model imports
from src.models.lgbm_model import LGBMModel
from src.models.xgb_model import XGBModel
from src.models.cat_model import CatBoostModel
from src.models.linear import LinearModel
from src.models.nn import NNModel

# Plot appearance shared by every figure in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Report the compute device returned by the project helper.
device = get_device()
print(f"üñ•Ô∏è  Device: {device}")
has_cuda = 'cuda' in str(device)
if has_cuda:
    # torch is only imported on this branch; CPU sessions never bind the name here.
    import torch
    print(f"üöÄ GPU: {torch.cuda.get_device_name(0)}")
    total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"üíæ GPU Memory: {total_memory_gb:.1f} GB")

print("\n‚úÖ All modules imported successfully!")

üñ•Ô∏è  Device: cpu

‚úÖ All modules imported successfully!


## 3. Load Configurations

Load all configuration files for data, features, and models.

In [9]:
# Pipeline-level configuration files.
data_config = load_config('configs/data.yaml')
features_config = load_config('configs/features.yaml')
run_config = load_config('configs/run_defaults.yaml')

# One config file per available model, keyed by model name.
model_config_files = {
    'lightgbm': 'configs/model_lgbm.yaml',
    'xgboost': 'configs/model_xgb.yaml',
    'catboost': 'configs/model_cat.yaml',
    'linear': 'configs/model_linear.yaml',
    'neural_network': 'configs/model_nn.yaml',
}
model_configs = {
    model_key: load_config(config_file)
    for model_key, config_file in model_config_files.items()
}

# Seed everything from the run config for reproducibility.
SEED = run_config['reproducibility']['seed']
set_seed(SEED)

# Show the top-level keys of each loaded config.
print("üìã Configurations loaded:")
for label, cfg in (
    ("Data config", data_config),
    ("Features config", features_config),
    ("Run config", run_config),
    ("Model configs", model_configs),
):
    print(f"  - {label}: {list(cfg.keys())}")
print(f"\nüé≤ Random seed: {SEED}")

üìã Configurations loaded:
  - Data config: ['drive', 'local', 'files', 'keys', 'dates', 'columns', 'validation']
  - Features config: ['feature_groups', 'lags', 'rolling', 'diff', 'time_features', 'interactions', 'selection', 'encoding']
  - Run config: ['experiment', 'run', 'reproducibility', 'cv', 'paths', 'output', 'metrics', 'logging', 'drive', 'hardware']
  - Model configs: ['lightgbm', 'xgboost', 'catboost', 'linear', 'neural_network']

üé≤ Random seed: 42


## 4. Data Loading & Exploration

Load raw data and perform exploratory data analysis.

In [10]:
# Initialize DataManager
data_mgr = DataManager(data_config)

# Check data directories
print("üìÇ Data Directories:")
for label, directory in (
    ("Raw", data_mgr.raw_dir),
    ("Interim", data_mgr.interim_dir),
    ("Processed", data_mgr.processed_dir),
):
    print(f"  {label}: {directory} (exists: {directory.exists()})")

# List available data files (CSV first, then Parquet)
if data_mgr.raw_dir.exists():
    raw_files = list(data_mgr.raw_dir.glob("*.csv")) + list(data_mgr.raw_dir.glob("*.parquet"))
    print(f"\nüìÑ Available raw files ({len(raw_files)}):")
    for f in raw_files:
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"  - {f.name} ({size_mb:.2f} MB)")
    if not raw_files:
        # An existing but empty raw directory would otherwise print "(0)" with
        # no hint about what to do next.
        print("  ‚ö†Ô∏è No .csv or .parquet files found. Please upload your data files.")
else:
    print("\n‚ö†Ô∏è Raw data directory not found. Please upload your data files.")

üìÇ Data Directories:
  Raw: /content/drive/MyDrive/novartis-datathon-2025/data/raw (exists: True)
  Interim: /content/drive/MyDrive/novartis-datathon-2025/data/interim (exists: True)
  Processed: /content/drive/MyDrive/novartis-datathon-2025/data/processed (exists: True)

üìÑ Available raw files (0):


In [11]:
# Load raw data
# Note: Update configs/data.yaml with your actual file names
# NOTE(review): the recorded run of this cell ended in an AttributeError before
# the "Loaded ..." line was printed, so the failure is presumably raised inside
# load_raw_data() itself (the raw directory listed 0 files) -- confirm.
raw_data = data_mgr.load_raw_data()

# Per-dataset summary: shape, up to ten column names, and deep memory usage.
print(f"üìä Loaded {len(raw_data)} datasets:")
for name, df in raw_data.items():
    print(f"\n  {name}:")
    print(f"    Shape: {df.shape}")
    column_names = list(df.columns)
    overflow_marker = '...' if len(column_names) > 10 else ''
    print(f"    Columns: {column_names[:10]}{overflow_marker}")
    memory_mb = df.memory_usage(deep=True).sum() / 1e6
    print(f"    Memory: {memory_mb:.2f} MB")

AttributeError: 'NoneType' object has no attribute 'items'

In [None]:
# Quick EDA helper functions
def quick_eda(df, name="Dataset"):
    """Print a compact exploratory summary of ``df`` and return it unchanged.

    The report lists the frame's shape, the number of columns per dtype, every
    column that contains nulls (count and percentage of rows), and a
    ``describe()`` table for at most the first five numeric columns.

    Args:
        df: DataFrame to summarise.
        name: Label shown in the report header.

    Returns:
        The same ``df`` object that was passed in.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"üìä EDA: {name}")
    print(f"{banner}")

    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f"\nüìê Shape: {n_rows:,} rows √ó {n_cols} columns")

    # Column count per dtype
    print("\nüìã Data Types:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  - {dtype}: {count}")

    # Null count per column; only columns with at least one null are listed
    missing = df.isnull().sum()
    if not missing.sum() > 0:
        print("\n‚úÖ No missing values")
    else:
        print("\n‚ö†Ô∏è Missing Values:")
        incomplete_columns = missing[missing > 0].index
        for col in incomplete_columns:
            pct = missing[col] / len(df) * 100
            print(f"  - {col}: {missing[col]:,} ({pct:.1f}%)")

    # Descriptive statistics for the leading numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print("\nüìà Numeric Summary (first 5 columns):")
        leading_numeric = numeric_cols[:5]
        display(df[leading_numeric].describe())

    return df

# Summarise every loaded dataset, labelled with its key in raw_data.
for name, df in raw_data.items():
    quick_eda(df, name=name)

## 5. Feature Engineering

Build features from the raw/interim data using the FeatureEngineer module.

In [None]:
# Derive the interim frame from the raw datasets.
interim_df = data_mgr.make_interim(raw_data)

# Feature builder driven by features_config.
fe = FeatureEngineer(features_config)

# Features are only built when there is interim data to work on;
# processed_df stays undefined otherwise.
if interim_df.empty:
    print("‚ö†Ô∏è No interim data available. Please check your data loading.")
else:
    processed_df = fe.build_features(interim_df)
    print(f"‚úÖ Features built: {processed_df.shape}")
    feature_columns = list(processed_df.columns)
    print(f"üìã Feature columns: {feature_columns}")

## 6. Model Training

Train models using cross-validation. Choose from available models:
- `lightgbm` - LightGBM gradient boosting
- `xgboost` - XGBoost gradient boosting
- `catboost` - CatBoost gradient boosting
- `linear` - Linear/Ridge regression
- `neural_network` - PyTorch neural network

In [None]:
# --- Experiment Configuration ---
MODEL_NAME = "lightgbm"  # Options: lightgbm, xgboost, catboost, linear, neural_network
RUN_NAME = "colab_experiment_01"  # Custom name for this run

# Map model names to config paths
MODEL_CONFIG_MAP = {
    'lightgbm': 'configs/model_lgbm.yaml',
    'xgboost': 'configs/model_xgb.yaml',
    'catboost': 'configs/model_cat.yaml',
    'linear': 'configs/model_linear.yaml',
    'neural_network': 'configs/model_nn.yaml',
}

# Fail fast on a mistyped model name instead of silently pairing it with the
# LightGBM config.
if MODEL_NAME not in MODEL_CONFIG_MAP:
    raise ValueError(
        f"Unknown MODEL_NAME {MODEL_NAME!r}; expected one of {sorted(MODEL_CONFIG_MAP)}"
    )

MODEL_CONFIG_PATH = MODEL_CONFIG_MAP[MODEL_NAME]  # Path to model config
# --------------------------------

print(f"üèÉ Experiment Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Config: {MODEL_CONFIG_PATH}")
print(f"  Run Name: {RUN_NAME}")

In [None]:
# Launch the training run configured in the previous cell.
separator = '=' * 60
print(f"\nüöÄ Starting training run: {RUN_NAME}")
print(f"{separator}\n")

try:
    experiment_kwargs = dict(
        model_name=MODEL_NAME,
        model_config_path=MODEL_CONFIG_PATH,
        run_name=RUN_NAME,
        config_path='configs/run_defaults.yaml',
    )
    run_id, metrics = run_experiment(**experiment_kwargs)

    print(f"\n{separator}")
    print("‚úÖ Training complete!")
    print(f"üìÅ Run ID: {run_id}")
    print("\nüìä Final Metrics:")
    for metric, value in metrics.items():
        print(f"  - {metric}: {value:.6f}")

except Exception as exc:
    # Print a troubleshooting checklist, then re-raise so the cell still fails.
    print(f"‚ùå Training failed: {exc}")
    print("\nüí° Make sure you have:")
    for hint in (
        "  1. Uploaded data to the raw directory",
        "  2. Updated configs/data.yaml with correct file names",
        "  3. Set the target column in configs/data.yaml",
    ):
        print(hint)
    raise

## 7. Evaluation & Analysis

Analyze model performance and visualize results.

In [None]:
# Load results from the latest run
artifacts_base = Path(run_config['paths']['artifacts_dir'])

# Prefer the run produced by the training cell in this session; otherwise fall
# back to the most recently modified run directory.
if 'run_id' in globals():
    latest_run = artifacts_base / run_id
else:
    # Only directories count as runs; stray files in the artifacts folder must
    # not be picked up as the "latest run".
    runs = sorted(
        (p for p in artifacts_base.glob("*") if p.is_dir()),
        key=lambda x: x.stat().st_mtime,
        reverse=True,
    )
    latest_run = runs[0] if runs else None

if latest_run is not None and latest_run.is_dir():
    print(f"üìÇ Analyzing run: {latest_run.name}\n")

    # Load metrics
    metrics_path = latest_run / "metrics.json"
    if metrics_path.exists():
        with open(metrics_path) as f:
            saved_metrics = json.load(f)
        print("üìä Saved Metrics:")
        for k, v in saved_metrics.items():
            # Non-numeric entries are printed verbatim instead of crashing the
            # fixed-point format.
            if isinstance(v, (int, float)):
                print(f"  - {k}: {v:.6f}")
            else:
                print(f"  - {k}: {v}")

    # Load OOF predictions (expects 'actual' and 'pred' columns)
    oof_path = latest_run / "oof_preds.csv"
    if oof_path.exists():
        oof_df = pd.read_csv(oof_path)
        print(f"\nüìà OOF Predictions: {len(oof_df)} samples")

        # Plot actual vs predicted
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Scatter plot with the y = x reference line
        axes[0].scatter(oof_df['actual'], oof_df['pred'], alpha=0.5, s=10)
        axes[0].plot([oof_df['actual'].min(), oof_df['actual'].max()],
                     [oof_df['actual'].min(), oof_df['actual'].max()], 'r--', lw=2)
        axes[0].set_xlabel('Actual')
        axes[0].set_ylabel('Predicted')
        axes[0].set_title('Actual vs Predicted')

        # Residuals distribution
        residuals = oof_df['actual'] - oof_df['pred']
        axes[1].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
        axes[1].axvline(x=0, color='r', linestyle='--', lw=2)
        axes[1].set_xlabel('Residual (Actual - Predicted)')
        axes[1].set_ylabel('Frequency')
        axes[1].set_title('Residuals Distribution')

        plt.tight_layout()
        plt.show()

    # List saved model files
    models = list(latest_run.glob("model_fold_*.bin"))
    print(f"\nüíæ Saved Models: {len(models)} fold(s)")
    for m in models:
        print(f"  - {m.name}")
else:
    print("‚ö†Ô∏è No runs found. Please run training first.")

## 8. Inference & Submission

Generate predictions on test data and create submission file.

In [None]:
import joblib

def generate_submission(run_id: str, test_df: pd.DataFrame, id_col: str = 'id'):
    """Average the fold models of a run over ``test_df`` and write a submission CSV.

    Every ``model_fold_*.bin`` file in the run's artifacts directory is loaded
    with joblib, asked for predictions on the feature columns of ``test_df``,
    and the per-fold predictions are averaged element-wise. The result is saved
    as ``submission_{run_id}.csv`` in the configured submissions directory.

    Args:
        run_id: Name of the run folder under ``run_config['paths']['artifacts_dir']``.
        test_df: Feature-engineered test data. Every column except ``id_col``
            and the configured target column is passed to the models.
        id_col: Identifier column copied into the submission. When it is not
            present in ``test_df``, a 0-based row number is used instead.

    Returns:
        DataFrame with the columns ``id_col`` and ``'prediction'``.

    Raises:
        FileNotFoundError: If the run directory contains no ``model_fold_*.bin`` files.
    """

    artifacts_dir = Path(run_config['paths']['artifacts_dir']) / run_id
    submissions_dir = Path(run_config['paths']['submissions_dir'])
    submissions_dir.mkdir(parents=True, exist_ok=True)

    # Load all fold models
    model_paths = sorted(artifacts_dir.glob("model_fold_*.bin"))
    if not model_paths:
        # Averaging an empty list would yield NaN and fail later with an
        # unrelated-looking TypeError; report the real problem instead.
        raise FileNotFoundError(f"No model_fold_*.bin files found in {artifacts_dir}")
    print(f"üìÇ Loading {len(model_paths)} model(s) from {run_id}")

    # Get feature columns (exclude id and target); identical for every fold,
    # so computed once.
    target_col = data_config['columns']['target']
    feature_cols = [c for c in test_df.columns if c not in (id_col, target_col)]
    features = test_df[feature_cols]

    all_preds = []
    for model_path in model_paths:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load artifacts produced by your own training runs.
        model = joblib.load(model_path)
        all_preds.append(model.predict(features))
        print(f"  ‚úÖ Loaded {model_path.name}")

    # Average predictions across folds
    final_preds = np.mean(all_preds, axis=0)

    # Create submission
    submission = pd.DataFrame({
        id_col: test_df[id_col] if id_col in test_df.columns else range(len(final_preds)),
        'prediction': final_preds
    })

    # Save submission
    submission_path = submissions_dir / f"submission_{run_id}.csv"
    submission.to_csv(submission_path, index=False)
    print(f"\n‚úÖ Submission saved to: {submission_path}")
    print(f"üìä Shape: {submission.shape}")
    print(f"\nüìã Preview:")
    display(submission.head(10))

    return submission

# Example usage (uncomment and modify when you have test data):
# test_df = pd.read_csv(data_mgr.raw_dir / "test.csv")
# test_df = fe.build_features(test_df)  # Apply same feature engineering
# submission = generate_submission(run_id, test_df)

# Reminder of the prerequisites for the example above.
print(
    "üí° To generate submission, uncomment the code above and ensure:",
    "  1. Test data is available in the raw directory",
    "  2. Run ID is defined from a successful training run",
    sep="\n",
)

## 9. Experiment Tracking & History

View all experiment runs and compare results.

In [None]:
# List all experiment runs
artifacts_base = Path(run_config['paths']['artifacts_dir'])

if artifacts_base.exists():
    # Each run is a directory; stray files in the artifacts folder are skipped.
    # Newest first, by modification time.
    runs = sorted(
        (p for p in artifacts_base.glob("*") if p.is_dir()),
        key=lambda x: x.stat().st_mtime,
        reverse=True,
    )

    if runs:
        print(f"üìä Experiment History ({len(runs)} runs):\n")

        run_data = []
        for run_dir in runs[:20]:  # Show last 20 runs
            run_info = {'run_id': run_dir.name}

            # Load metrics if available. Stored under its own name so the
            # `metrics` returned by the training cell is not overwritten.
            metrics_path = run_dir / "metrics.json"
            if metrics_path.exists():
                with open(metrics_path) as f:
                    run_metrics = json.load(f)
                run_info.update(run_metrics)

            # Get timestamp from folder
            run_info['created'] = datetime.fromtimestamp(run_dir.stat().st_mtime).strftime('%Y-%m-%d %H:%M')

            run_data.append(run_info)

        runs_df = pd.DataFrame(run_data)
        display(runs_df)

        # Plot metrics comparison (only when an 'rmse' metric was recorded)
        if 'rmse' in runs_df.columns and len(runs_df) > 1:
            plt.figure(figsize=(12, 4))
            plt.bar(range(len(runs_df)), runs_df['rmse'])
            plt.xticks(range(len(runs_df)), runs_df['run_id'], rotation=45, ha='right')
            plt.xlabel('Run ID')
            plt.ylabel('RMSE')
            plt.title('RMSE Comparison Across Runs')
            plt.tight_layout()
            plt.show()
    else:
        print("‚ö†Ô∏è No experiment runs found.")
else:
    print("‚ö†Ô∏è Artifacts directory not found. Run training first.")

## 10. Utilities & Helpers

Useful utility functions for common operations.

In [None]:
# Utility functions for common operations

def sync_to_drive():
    """Flush and unmount Google Drive, then mount it again (Colab only).

    Outside Colab this only prints a notice.
    """
    if not IN_COLAB:
        print("‚ö†Ô∏è Not in Colab, skipping Drive sync")
        return

    from google.colab import drive

    drive.flush_and_unmount()
    drive.mount('/content/drive')
    print("‚úÖ Synced to Google Drive")

def download_submission(run_id: str):
    """Download the submission CSV of a run via ``google.colab.files`` (Colab only).

    Looks for ``submission_{run_id}.csv`` under
    ``run_config['paths']['submissions_dir']`` and prints a message when the
    file is missing. Outside Colab this only prints a notice.

    Args:
        run_id: Run identifier used in the submission file name.
    """
    if not IN_COLAB:
        print("‚ö†Ô∏è Not in Colab, file is available locally")
        return

    from google.colab import files

    submissions_dir = Path(run_config['paths']['submissions_dir'])
    submission_path = submissions_dir / f"submission_{run_id}.csv"
    if not submission_path.exists():
        print(f"‚ùå Submission not found: {submission_path}")
        return

    files.download(str(submission_path))
    print(f"‚úÖ Downloaded: {submission_path.name}")

def upload_data():
    """Upload data files into the raw data directory (Colab only).

    Opens the Colab upload dialog and writes every uploaded file to
    ``data_mgr.raw_dir``, creating that directory first if it does not exist.
    Outside Colab this only prints a notice.
    """
    if IN_COLAB:
        from google.colab import files
        print("üì§ Select files to upload...")
        uploaded = files.upload()
        # The notebook tells users to upload precisely when the raw directory
        # is missing, so make sure it exists before writing into it.
        data_mgr.raw_dir.mkdir(parents=True, exist_ok=True)
        for filename, content in uploaded.items():
            dest_path = data_mgr.raw_dir / filename
            with open(dest_path, 'wb') as f:
                f.write(content)
            print(f"  ‚úÖ Saved: {dest_path}")
    else:
        print("‚ö†Ô∏è Not in Colab. Place files directly in data/raw/")

def show_gpu_info():
    """Display GPU information.

    Prints the name, total memory, allocated memory and reserved ("cached")
    memory of CUDA device 0, or a notice when CUDA is not available.
    """
    # Imported here because the notebook only imports torch at module level
    # when a CUDA device was detected; on CPU sessions the global name is unbound.
    import torch

    if torch.cuda.is_available():
        print(f"üöÄ GPU: {torch.cuda.get_device_name(0)}")
        print(f"üíæ Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        print(f"üíæ Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        print(f"üíæ Cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
    else:
        print("‚ùå No GPU available")

def clear_gpu_cache():
    """Clear GPU memory cache.

    Calls ``torch.cuda.empty_cache()`` when CUDA is available; otherwise only
    prints a notice.
    """
    # Imported here because the notebook only imports torch at module level
    # when a CUDA device was detected; on CPU sessions the global name is unbound.
    import torch

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("‚úÖ GPU cache cleared")
    else:
        print("‚ö†Ô∏è No GPU to clear")

print("üõ†Ô∏è Utility functions available:")
print("  - sync_to_drive(): Sync changes to Google Drive")
print("  - download_submission(run_id): Download submission CSV")
print("  - upload_data(): Upload data files to raw directory")
print("  - show_gpu_info(): Display GPU information")
print("  - clear_gpu_cache(): Clear GPU memory cache")