# Model Training and Feature Importance Analysis

This notebook trains a Random Forest model on the processed molecular data and analyzes which features are most important for predicting flashpoint values.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

# Load processed data
X_train = pd.read_csv("../data/processed/X_train.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
X_test = pd.read_csv("../data/processed/X_test.csv")
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze()

# The feature files can carry text columns (the recorded failure was
# "could not convert string to float: 'bicyclohexyl'"). RandomForestRegressor
# only accepts numeric input, so keep numeric/bool columns and report the rest.
non_numeric_cols = X_train.select_dtypes(exclude=["number", "bool"]).columns.tolist()
if non_numeric_cols:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
X_train = X_train.drop(columns=non_numeric_cols)
# Give the test frame exactly the same columns, in the same order, as training.
X_test = X_test[X_train.columns]

# Fail early if features and targets are misaligned.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Features: {list(X_train.columns)}")

ValueError: could not convert string to float: 'bicyclohexyl'

In [None]:
# Check for missing values and data quality
missing_train = X_train.isna().sum().sum()
missing_test = X_test.isna().sum().sum()
print("Missing values in training data:")
print(missing_train)
print("\nMissing values in test data:")
print(missing_test)

# Basic statistics of target variable (same summary line for each split)
print("\nTarget variable statistics:")
for split_label, target in (("Training set", y_train), ("Test set", y_test)):
    print(
        f"{split_label} - Min: {target.min():.2f}, "
        f"Max: {target.max():.2f}, Mean: {target.mean():.2f}"
    )

# Check feature types
print(f"\nFeature columns: {list(X_train.columns)}")

In [None]:
# Train Random Forest model
print("Training Random Forest Regressor...")

# Hyperparameters collected in one place so they are easy to read and tune.
rf_hyperparams = {
    "n_estimators": 200,      # number of trees in the forest
    "max_depth": 15,          # cap on tree depth
    "min_samples_split": 5,   # minimum samples required to split a node
    "min_samples_leaf": 2,    # minimum samples required in a leaf
    "random_state": 42,       # fixed seed
    "n_jobs": -1,             # use all available cores
}
rf_model = RandomForestRegressor(**rf_hyperparams)

# Fit on the training split
rf_model.fit(X_train, y_train)

# Predict on both splits; later cells use these for metrics and plots
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

print("Model training completed!")

In [None]:
# Evaluate model performance
def evaluate_model(y_true, y_pred, dataset_name):
    """Print and return RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        dataset_name: Label used in the printed report header.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    scores = {
        # RMSE is the square root of sklearn's mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }

    report_lines = (
        f"\n{dataset_name} Performance:",
        f"  RMSE: {scores['RMSE']:.4f}",
        f"  MAE:  {scores['MAE']:.4f}",
        f"  R²:   {scores['R2']:.4f}",
    )
    for line in report_lines:
        print(line)

    return scores

# Evaluate on both training and test sets
train_metrics = evaluate_model(y_train, y_train_pred, "Training Set")
test_metrics = evaluate_model(y_test, y_test_pred, "Test Set")

# Calculate overfitting indicator: the gap between training and test R²
r2_train = train_metrics['R2']
r2_test = test_metrics['R2']
print("\nOverfitting Analysis:")
print(f"  Training R²: {r2_train:.4f}")
print(f"  Test R²:     {r2_test:.4f}")
print(f"  Difference:  {r2_train - r2_test:.4f}")

# Feature Importance Analysis

Let's analyze which features are most important for predicting flashpoint values using both built-in Random Forest feature importance and permutation importance.

In [None]:
# Feature Importance Analysis - Method 1: Built-in Random Forest Feature Importance
# One row per feature, sorted from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
top_features = feature_importance.head(15)

print("Top 15 Most Important Features (Random Forest Built-in):")
print(top_features)

# Plot feature importance as a horizontal bar chart, largest bar on top
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = np.arange(len(top_features))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances (Random Forest)')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Feature Importance Analysis - Method 2: Permutation Importance
print("Calculating permutation importance (this may take a few minutes)...")
# Computed on the test split with 10 shuffles per feature.
perm_importance = permutation_importance(
    rf_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# Create dataframe for permutation importance (mean and spread across repeats)
perm_summary = {
    'feature': X_train.columns,
    'importance_mean': perm_importance.importances_mean,
    'importance_std': perm_importance.importances_std,
}
perm_importance_df = pd.DataFrame(perm_summary).sort_values(
    'importance_mean', ascending=False
)
top_perm_features = perm_importance_df.head(15)

print("\nTop 15 Most Important Features (Permutation Importance):")
print(top_perm_features)

# Plot permutation importance with error bars (std across repeats)
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = np.arange(len(top_perm_features))
ax.barh(
    bar_positions,
    top_perm_features['importance_mean'],
    xerr=top_perm_features['importance_std'],
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_perm_features['feature'])
ax.set_xlabel('Permutation Importance')
ax.set_title('Top 15 Feature Importances (Permutation)')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Feature Selection Based on Importance Thresholds
def select_features_by_importance(importance_df, threshold=0.01, column='importance'):
    """Select features whose importance score is at or above a threshold.

    Args:
        importance_df: DataFrame with a ``'feature'`` column and a score column.
        threshold: Minimum score (inclusive) a feature needs to be kept.
        column: Name of the score column to compare against ``threshold``.
            Defaults to ``'importance'``; pass e.g. ``'importance_mean'`` to
            reuse this helper for permutation-importance tables.

    Returns:
        List of feature names, in the row order of ``importance_df``.

    Raises:
        KeyError: If ``column`` or ``'feature'`` is missing from ``importance_df``.
    """
    keep_mask = importance_df[column] >= threshold
    selected_features = importance_df.loc[keep_mask, 'feature'].tolist()
    return selected_features

# Define importance thresholds
rf_threshold = 0.01  # Features must have at least 1% importance
perm_threshold = 0.001  # Permutation importance threshold

# Select features using Random Forest importance
rf_selected = select_features_by_importance(feature_importance, rf_threshold)
print(f"Features selected by Random Forest importance (>= {rf_threshold}): {len(rf_selected)}")
print(f"Selected features: {rf_selected}")

# Select features using Permutation importance
perm_selected = perm_importance_df[perm_importance_df['importance_mean'] >= perm_threshold]['feature'].tolist()
print(f"\nFeatures selected by Permutation importance (>= {perm_threshold}): {len(perm_selected)}")
print(f"Selected features: {perm_selected}")

# Find common important features between both methods.
# A plain set intersection has no stable order (string hashing is randomised
# per interpreter session), so the printed list would change between runs.
# Filtering rf_selected keeps its order and makes the output reproducible.
perm_selected_lookup = set(perm_selected)
common_features = [name for name in rf_selected if name in perm_selected_lookup]
print(f"\nCommon important features between both methods: {len(common_features)}")
print(f"Common features: {common_features}")

In [None]:
# Visualize predictions vs actual values
fig, (ax_train, ax_test, ax_resid) = plt.subplots(1, 3, figsize=(15, 5))

# Actual-vs-predicted panels for the training and test sets
scatter_panels = (
    (ax_train, y_train, y_train_pred,
     f'Training Set Predictions\nR² = {train_metrics["R2"]:.4f}'),
    (ax_test, y_test, y_test_pred,
     f'Test Set Predictions\nR² = {test_metrics["R2"]:.4f}'),
)
for ax, actual, predicted, panel_title in scatter_panels:
    ax.scatter(actual, predicted, alpha=0.5)
    # Dashed red diagonal marks perfect prediction over the observed range.
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel('Actual Flashpoint')
    ax.set_ylabel('Predicted Flashpoint')
    ax.set_title(panel_title)

# Residuals plot (test set): actual minus predicted, against predicted
residuals = y_test - y_test_pred
ax_resid.scatter(y_test_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Flashpoint')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot (Test Set)')

fig.tight_layout()
plt.show()

In [None]:
# Train a reduced model using only the most important features
print("Training a reduced model using only the most important features...")

# Use the top 10 features from Random Forest importance
top_features_rf = feature_importance.head(10)['feature'].tolist()
X_train_reduced = X_train[top_features_rf]
X_test_reduced = X_test[top_features_rf]

# Train reduced model with exactly the same hyperparameters as the full model.
# Taking them from rf_model (instead of repeating the literals) keeps the
# comparison fair if the full model's settings are changed later.
rf_reduced = RandomForestRegressor(**rf_model.get_params())
rf_reduced.fit(X_train_reduced, y_train)

# Make predictions with reduced model
y_train_pred_reduced = rf_reduced.predict(X_train_reduced)
y_test_pred_reduced = rf_reduced.predict(X_test_reduced)

# Evaluate reduced model (the label reports how many features were actually used)
print(f"\n=== REDUCED MODEL PERFORMANCE (Top {len(top_features_rf)} Features) ===")
train_metrics_reduced = evaluate_model(y_train, y_train_pred_reduced, "Training Set (Reduced)")
test_metrics_reduced = evaluate_model(y_test, y_test_pred_reduced, "Test Set (Reduced)")

# Compare with full model
n_features_full = X_train.shape[1]
n_features_reduced = len(top_features_rf)
print("\n=== MODEL COMPARISON ===")
print(f"Full Model Test R²:    {test_metrics['R2']:.4f}")
print(f"Reduced Model Test R²: {test_metrics_reduced['R2']:.4f}")
print(f"Performance difference: {test_metrics_reduced['R2'] - test_metrics['R2']:.4f}")
print(f"Features reduced from {n_features_full} to {n_features_reduced} ({(1-n_features_reduced/n_features_full)*100:.1f}% reduction)")

In [None]:
# Categorize features by importance levels
def categorize_features_by_importance(feature_importance_df, high_thresh=0.05, medium_thresh=0.01):
    """Split features into high, medium and low importance buckets.

    Args:
        feature_importance_df: DataFrame with ``'feature'`` and ``'importance'`` columns.
        high_thresh: Scores at or above this value count as high importance.
        medium_thresh: Scores at or above this value (and below ``high_thresh``)
            count as medium importance; anything lower is low importance.

    Returns:
        Tuple ``(high, medium, low)`` of feature-name lists, each in the row
        order of ``feature_importance_df``.
    """
    scores = feature_importance_df['importance']
    names = feature_importance_df['feature']

    is_high = scores >= high_thresh
    is_medium = (scores >= medium_thresh) & (scores < high_thresh)
    is_low = scores < medium_thresh

    return names[is_high].tolist(), names[is_medium].tolist(), names[is_low].tolist()

# Categorize features
high_imp, medium_imp, low_imp = categorize_features_by_importance(feature_importance)

# Score lookup by feature name; keeps the first row per feature.
importance_by_feature = (
    feature_importance.drop_duplicates('feature').set_index('feature')['importance']
)

print("=== FEATURE IMPORTANCE CATEGORIZATION ===")
detailed_groups = (
    ("HIGH IMPORTANCE FEATURES (>= 5%)", high_imp),
    ("MEDIUM IMPORTANCE FEATURES (1% - 5%)", medium_imp),
)
for heading, group in detailed_groups:
    print(f"\n{heading}: {len(group)}")
    for i, feature in enumerate(group, 1):
        importance_value = importance_by_feature.loc[feature]
        print(f"  {i}. {feature}: {importance_value:.4f}")

print(f"\nLOW IMPORTANCE FEATURES (< 1%): {len(low_imp)}")
print(f"  Features: {low_imp[:10]}...")  # Show first 10 low importance features
print(f"  (and {max(0, len(low_imp)-10)} more)")

# Save feature importance results
feature_importance.to_csv("../data/processed/feature_importance.csv", index=False)
print("\n✅ Feature importance results saved to ../data/processed/feature_importance.csv")