In [14]:
import sys

# Put the parent directory on the import path (presumably where the project
# modules imported by later cells live — confirm against the repo layout).
# The membership check keeps repeated runs of this cell from stacking
# duplicate '..' entries onto sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [15]:
# load data
# Pulls the train/validation split from the project helper and checks that the
# pieces line up before any training cell consumes them.
from utils import load_train_val_data

X_train, X_val, y_train, y_val = load_train_val_data()

# Fail fast on a malformed split: every feature row needs a label, and both
# splits must share the same feature width so a selector fitted on X_train can
# transform X_val.
assert X_train.shape[0] == len(y_train), "X_train / y_train row count mismatch"
assert X_val.shape[0] == len(y_val), "X_val / y_val row count mismatch"
assert X_train.shape[1] == X_val.shape[1], "train / val feature count mismatch"


📥 Loading train/val data...
✅ Data loaded:
   X_train: (2100, 5000)
   X_val: (600, 5000)
   y_train: 2100 samples
   y_val: 600 samples


In [16]:
from ml_pipeline import ModelTrainer

# 1. Create trainer
# The settings are collected in a single mapping so they can be inspected or
# adjusted in one place before the trainer is built. Exact semantics are
# defined by ModelTrainer; the training logs in this notebook show 8-fold
# cross-validation and a 20-trial search, which matches cv_folds / n_trials.
trainer_settings = {
    'cv_folds': 8,
    'scoring': 'f1_weighted',
    'n_trials': 20,
    'random_state': 42,
    'verbose': True,
}
trainer = ModelTrainer(**trainer_settings)


In [17]:
# QUICK FEATURE SELECTION TEST - Add this cell instead for faster results
#
# For each candidate selector: fit it on the training split, train a fixed
# (non-optimized) random forest on the reduced matrices through `trainer`,
# and record the validation accuracy. The candidate with the highest
# validation accuracy ends up in `best`.
#
# NOTE(review): every selector is fitted on all of X_train before
# trainer.train_model runs. If the trainer cross-validates on that same
# matrix (its log output suggests it does), the selector has already seen the
# labels of each held-out fold, so the CV scores are likely optimistic.
# NOTE(review): `best` is picked on the validation split, so any validation
# score reported afterwards for that choice is not an unbiased estimate.

from models import RandomForestModel
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier

print("⚡ QUICK FEATURE SELECTION TEST")
print("=" * 40)

# Test just the most promising approaches
quick_tests = [
    # Statistical selection - likely to work well with TF-IDF
    ("F-Score_500", SelectKBest(f_classif, k=500)),
    ("F-Score_1000", SelectKBest(f_classif, k=1000)),
    ("F-Score_1500", SelectKBest(f_classif, k=1500)),

    # RF importance - uses same algorithm as your model
    ("RF_Import_1000", SelectFromModel(
        RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        # Without this, the default importance threshold is applied first and
        # max_features only acts as a cap, so fewer than 1000 features could
        # be kept. -inf disables the threshold: exactly the top 1000 are kept.
        threshold=-np.inf,
        max_features=1000,
    )),
]

# Hyperparameters of the fixed forest used to compare selectors; hoisted out
# of the loop because they do not change between candidates.
rf_test_params = dict(
    n_estimators=167,
    max_depth=10,
    min_samples_split=24,
    min_samples_leaf=8,
    max_features=0.6,
    random_state=42,
)

# Width of the unreduced feature matrix, used for the percentage report.
n_total_features = X_train.shape[1]

results = []

for name, selector in quick_tests:
    print(f"\n🧪 Testing {name}...")

    # Select features: fit on train only, then apply the same mask to val
    X_train_sel = selector.fit_transform(X_train, y_train)
    X_val_sel = selector.transform(X_val)
    n_features = X_train_sel.shape[1]

    print(f"   Selected: {n_features} features ({n_features/n_total_features*100:.1f}%)")

    # Quick test with simple RF (fresh model per candidate)
    rf_test = RandomForestModel(**rf_test_params)
    rf_test.name = f"Test_{name}"

    test_results = trainer.train_model(
        model=rf_test,
        X_train=X_train_sel,
        y_train=y_train,
        X_val=X_val_sel,
        y_val=y_val,
        optimize=False
    )

    val_acc = test_results['val_accuracy']
    print(f"   🎯 Val accuracy: {val_acc:.4f}")

    # Keep the fitted selector and reduced matrices so the winner can be
    # reused by later cells without refitting.
    results.append({
        'name': name,
        'n_features': n_features,
        'val_acc': val_acc,
        'selector': selector,
        'X_train_sel': X_train_sel,
        'X_val_sel': X_val_sel
    })

# Find best result
best = max(results, key=lambda x: x['val_acc'])
print(f"\n🏆 BEST QUICK TEST: {best['name']}")
print(f"   Features: {best['n_features']}")
print(f"   Accuracy: {best['val_acc']:.4f}")

⚡ QUICK FEATURE SELECTION TEST

🧪 Testing F-Score_500...
   Selected: 500 features (10.0%)
🚀 Training Test_F-Score_500...
🔧 Fitting Test_F-Score_500...
✅ Test_F-Score_500 fitted in 5.20 seconds
🔄 Running 8-fold cross-validation...
✅ Test_F-Score_500 completed in 10.9s
   CV: 0.5550 ± 0.0259
   Train: 0.8048
   Val: 0.5883 (gap: 0.2164)
   🎯 Val accuracy: 0.5883

🧪 Testing F-Score_1000...
   Selected: 1000 features (20.0%)
🚀 Training Test_F-Score_1000...
🔧 Fitting Test_F-Score_1000...
✅ Test_F-Score_1000 fitted in 9.96 seconds
🔄 Running 8-fold cross-validation...
✅ Test_F-Score_1000 completed in 20.9s
   CV: 0.5573 ± 0.0356
   Train: 0.8362
   Val: 0.6000 (gap: 0.2362)
   🎯 Val accuracy: 0.6000

🧪 Testing F-Score_1500...
   Selected: 1500 features (30.0%)
🚀 Training Test_F-Score_1500...
🔧 Fitting Test_F-Score_1500...
✅ Test_F-Score_1500 fitted in 14.87 seconds
🔄 Running 8-fold cross-validation...
✅ Test_F-Score_1500 completed in 31.2s
   CV: 0.5763 ± 0.0285
   Train: 0.8386
   Val: 0.61

In [None]:
# Now optimize the best one
#
# Runs the trainer's hyperparameter search for a random forest on the feature
# subset stored in `best` (set by the quick feature-selection cell), prints a
# short comparison, and persists the tuned model together with the fitted
# selector that produced its input features.
import pickle

print("\n🚀 Optimizing best feature selection...")

# Candidate search spaces. Tuples look like (low, high) ranges and lists like
# categorical choices — the interpretation is up to trainer.train_model.
anti_overfitting_space = {
    'n_estimators': (200, 500),           # More trees for stability
    'max_depth': (3, 6),                  # Shallower trees
    'min_samples_split': (20, 100),       # Larger splits
    'min_samples_leaf': (15, 50),         # Larger leaves
    'max_features': ['sqrt', 'log2', 0.3], # Fewer features per tree
    'max_samples': (0.6, 0.9),           # Subsample training data
}


high_performance_space = {
    'n_estimators': (300, 800),           # Many trees
    'max_depth': (8, 15),                 # Deeper trees
    'min_samples_split': (2, 15),         # Allow smaller splits
    'min_samples_leaf': (1, 8),           # Allow smaller leaves
    'max_features': [0.4, 0.6, 0.8, 'sqrt'], # More features
}

diversity_space = {
    'n_estimators': (200, 400),
    'max_depth': (4, 10),
    'min_samples_split': (5, 50),         # Wide range
    'min_samples_leaf': (2, 30),          # Wide range
    'max_features': [0.2, 0.4, 'sqrt', 'log2'], # Include low feature counts
    'max_samples': (0.5, 0.9),           # Vary subsampling
    'criterion': ['gini', 'entropy'],
}

balanced_space = {
    'n_estimators': (100, 300),
    'max_depth': (5, 12),
    'min_samples_split': (10, 40),
    'min_samples_leaf': (5, 25),
    'max_features': ['sqrt', 'log2', 0.4, 0.6],
    'criterion': ['gini', 'entropy'],      # Try both splitting criteria
}

# Select the search space by name so the saved file name and the space that is
# actually searched cannot drift apart when switching experiments.
param_spaces = {
    'anti_overfitting': anti_overfitting_space,
    'high_performance': high_performance_space,
    'diversity': diversity_space,
    'balanced': balanced_space,
}
space_name = 'high_performance'

rf_name = f"{space_name}_with_feature_selection_{best['name']}"

# Starting configuration handed to the trainer together with the search space.
optimized_rf = RandomForestModel(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=12,
    random_state=42
)
optimized_rf.name = f"Optimized_{best['name']}"

final_results = trainer.train_model(
    model=optimized_rf,
    X_train=best['X_train_sel'],
    y_train=y_train,
    X_val=best['X_val_sel'],
    y_val=y_val,
    param_space=param_spaces[space_name],
    optimize=True
)

# Read the original feature count from the data instead of hard-coding it, so
# the summary stays correct if the vectorizer size changes.
n_total_features = X_train.shape[1]

print("\n📊 FINAL COMPARISON:")
print(f"   Original ({n_total_features} features): Your baseline")
print(f"   Selected ({best['n_features']} features): {final_results['val_accuracy']:.4f}")
print(f"   Feature reduction: {(1-best['n_features']/n_total_features)*100:.1f}%")
print(f"   Training speedup: ~{n_total_features/best['n_features']:.1f}x expected")

# Persist the tuned model. The save is unconditional: no baseline score is
# available in this cell to compare against.
model_dir = "../models/trained/RandomForest"
trainer.save_model(optimized_rf, f"{model_dir}/{rf_name}.pkl")

# The model was trained on the selector's reduced feature matrix, so it can
# only score raw feature matrices if the same fitted selector is applied
# first. Store the selector next to the model; this is written after
# save_model so the target directory is known to exist.
with open(f"{model_dir}/{rf_name}_selector.pkl", "wb") as selector_file:
    pickle.dump(best['selector'], selector_file)
print("💾 Feature-selected model saved!")

[I 2025-06-18 23:44:22,802] A new study created in memory with name: no-name-1ae3f20b-4540-4670-aaf9-58b65d3b6de8



🚀 Optimizing best feature selection...
🚀 Training Optimized_F-Score_1500...
🔍 Optimizing Optimized_F-Score_1500 hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-18 23:44:27,612] Trial 0 finished with value: 0.5599141950052853 and parameters: {'n_estimators': 487, 'max_depth': 15, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5599141950052853.
[I 2025-06-18 23:45:17,215] Trial 1 finished with value: 0.5773091028916159 and parameters: {'n_estimators': 601, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 0.4}. Best is trial 1 with value: 0.5773091028916159.
[I 2025-06-18 23:45:58,562] Trial 2 finished with value: 0.5780748304230525 and parameters: {'n_estimators': 452, 'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.4}. Best is trial 2 with value: 0.5780748304230525.
[I 2025-06-18 23:47:36,656] Trial 3 finished with value: 0.5800157096838845 and parameters: {'n_estimators': 528, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 0.8}. Best is trial 3 with value: 0.5800157096838845.
[I 2