## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import sys
sys.path.insert(0, '..')

from src.utils.data_loader import load_processed_data
from src.models.train import train_model, save_model

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 12x6 default figure size for any figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load Processed Data

In [None]:
# Load the feature matrix X and target y for 'sample_dataset'.
# Only the loader call sits inside the try so that nothing else can be
# mistaken for a missing-data error.
try:
    X, y = load_processed_data('sample_dataset')
except FileNotFoundError:
    # Processed data has not been generated yet: tell the user which
    # notebook produces it and stop with exit status 1.
    print("Run 02_feature_engineering.ipynb first to generate processed data")
    raise SystemExit(1)
else:
    print(f"Loaded data with shape: X={X.shape}, y={y.shape}")

## 3. Prepare Data

In [None]:
# Split first, then scale.
# The scaler has to learn its mean/variance from the training rows only:
# fitting it on the full dataset lets test-set statistics leak into the
# training features, which makes the held-out metrics optimistic and also
# bakes test information into the scaler that is saved with the best model.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then apply that same fitted transform to test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset under the train-fitted transform. Kept only so the X_scaled
# name stays available to later cells; it is not used for training here.
X_scaled = scaler.transform(X)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training targets: {y_train.shape}")
print(f"Test targets: {y_test.shape}")

## 4. Train Baseline Models

In [None]:
# Baseline regressors to compare, keyed by the name used in printouts,
# tables and plots. The tree ensembles use a fixed random_state.
models = {
    'Ridge': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# name -> fitted estimator, its train/test metrics and its test predictions
results = {}

for label, estimator in models.items():
    print(f"\nTraining {label}...")

    # Fit on the training split, then predict both splits so that train and
    # test scores can be compared side by side.
    estimator.fit(X_train, y_train)
    fitted_train = estimator.predict(X_train)
    fitted_test = estimator.predict(X_test)

    scores = {
        'model': estimator,
        'train_r2': r2_score(y_train, fitted_train),
        'test_r2': r2_score(y_test, fitted_test),
        'train_mse': mean_squared_error(y_train, fitted_train),
        'test_mse': mean_squared_error(y_test, fitted_test),
        'test_mae': mean_absolute_error(y_test, fitted_test),
        'y_test_pred': fitted_test,
    }
    results[label] = scores

    print(f"  Train R²: {scores['train_r2']:.4f}")
    print(f"  Test R²: {scores['test_r2']:.4f}")
    print(f"  Test MSE: {scores['test_mse']:.4f}")
    print(f"  Test MAE: {scores['test_mae']:.4f}")

## 5. Cross-Validation

In [None]:
# 5-fold cross-validated R² on the training split for every candidate.
# cv_results maps model name -> array of per-fold scores.
cv_results = {}

for label, estimator in models.items():
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2')
    cv_results[label] = fold_scores

    # Summarise the folds by their mean and standard deviation
    mean_r2, spread = fold_scores.mean(), fold_scores.std()
    print(f"{label}:")
    print(f"  CV R² scores: {fold_scores}")
    print(f"  Mean: {mean_r2:.4f} (+/- {spread:.4f})")

## 6. Model Comparison

In [None]:
# One row per model with its headline metrics, in the order the models were
# trained. The column list is passed explicitly so the column order is fixed.
comparison_df = pd.DataFrame(
    [
        {
            'Model': label,
            'Train R²': scores['train_r2'],
            'Test R²': scores['test_r2'],
            'Test MSE': scores['test_mse'],
            'Test MAE': scores['test_mae'],
        }
        for label, scores in results.items()
    ],
    columns=['Model', 'Train R²', 'Test R²', 'Test MSE', 'Test MAE'],
)

print("\nModel Comparison:")
print(comparison_df.to_string(index=False))

In [None]:
# Side-by-side comparison: grouped train/test R² bars on the left,
# test MSE bars on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
r2_ax, mse_ax = axes

# Left panel: one group per model, train bar shifted half a bar-width left
# and test bar half a bar-width right of the tick position.
x = np.arange(len(results))
width = 0.35
for shift, column in ((-width / 2, 'Train R²'), (width / 2, 'Test R²')):
    r2_ax.bar(x + shift, comparison_df[column], width, label=column, alpha=0.8)
r2_ax.set_ylabel('R² Score')
r2_ax.set_title('Model R² Comparison')
r2_ax.set_xticks(x)
r2_ax.set_xticklabels(comparison_df['Model'])
r2_ax.legend()

# Right panel: test MSE per model
mse_ax.bar(comparison_df['Model'], comparison_df['Test MSE'], alpha=0.8, color='orange')
mse_ax.set_ylabel('Mean Squared Error')
mse_ax.set_title('Test MSE Comparison')

# Light horizontal grid lines on both panels
for panel in (r2_ax, mse_ax):
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Best Model Predictions

In [None]:
# Pick the model with the highest test R² from the comparison table, then
# pull its fitted estimator and stored test predictions out of `results`.
top_row = comparison_df['Test R²'].idxmax()
best_model_name = comparison_df.at[top_row, 'Model']

best_entry = results[best_model_name]
best_model = best_entry['model']
best_pred = best_entry['y_test_pred']

print(f"Best Model: {best_model_name}")
print(f"Test R²: {best_entry['test_r2']:.4f}")

In [None]:
# Scatter of predicted vs actual test targets for the best model.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, best_pred, alpha=0.6, s=50)

# Dashed red y = x reference line spanning the observed target range;
# a perfect prediction would fall exactly on it.
lowest, highest = y_test.min(), y_test.max()
plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

plt.xlabel('Actual Expression Level')
plt.ylabel('Predicted Expression Level')
plt.title(f'{best_model_name}: Predictions vs Actual')
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Residual diagnostics for the best model (residual = actual - predicted).
residuals = y_test - best_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Left panel: residuals against predictions, with a dashed zero line
scatter_ax.scatter(best_pred, residuals, alpha=0.6, s=50)
scatter_ax.axhline(y=0, color='r', linestyle='--', lw=2)
scatter_ax.set_xlabel('Predicted Values')
scatter_ax.set_ylabel('Residuals')
scatter_ax.set_title('Residuals vs Predictions')
scatter_ax.grid(alpha=0.3)

# Right panel: histogram of the residuals, with a dashed line at zero
hist_ax.hist(residuals, bins=20, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Residuals')
hist_ax.axvline(x=0, color='r', linestyle='--', lw=2)

plt.tight_layout()
plt.show()

## 8. Save Best Model

In [None]:
# Persist the best model together with the scaler.
# File names come from the model's display name, lower-cased with spaces
# replaced by underscores (e.g. 'Random Forest' -> 'random_forest').
model_slug = best_model_name.lower().replace(" ", "_")
model_path = f'../models/{model_slug}_model.joblib'
scaler_path = f'../models/{model_slug}_scaler.joblib'

# NOTE(review): save_model comes from src.models.train; presumably it writes
# both objects to the given paths - confirm it creates ../models if missing.
save_model(best_model, scaler, model_path, scaler_path)
print(f"Model saved to {model_path}")
print(f"Scaler saved to {scaler_path}")