In [2]:
import sys
sys.path.append('..')

In [3]:
# Load the pre-split training and validation data.
# load_train_val_data() comes from the project's `utils` package, which is
# importable here because an earlier cell appends '..' to sys.path.
# In the recorded run the feature matrices came back as scipy CSR sparse
# matrices of shape (2100, 5000) and (600, 5000); a later cell densifies them
# before scaling.
from utils import load_train_val_data

# Unpacks, in order: train features, validation features, train labels,
# validation labels.
X_train, X_val, y_train, y_val = load_train_val_data()

📥 Loading train/val data...
✅ Data loaded:
   X_train: (2100, 5000)
   X_val: (600, 5000)
   y_train: 2100 samples
   y_val: 600 samples


In [4]:
from ml_pipeline import ModelTrainer

# Single shared trainer used by every experiment in this notebook.
# The settings are meant to mirror the Random Forest notebook (not visible
# here) so that results stay comparable across model families.
trainer = ModelTrainer(
    random_state=42,         # fixed seed for reproducible runs
    n_trials=20,             # number of hyperparameter-search trials
    cv_folds=8,              # k for k-fold cross-validation
    scoring='f1_weighted',   # metric used for CV / optimisation
    verbose=True,            # print progress while training
)

In [5]:
from scipy.sparse import issparse

from utils.SimpleScaler import SimpleScaler
from models.LogisticRegressionModel import LogisticRegressionModel


def _train_baseline_lr(name, X_tr, X_va):
    """Fit an un-tuned logistic regression and return ``(model, results)``.

    Both arms of the scaling comparison use exactly the same model settings,
    so they are built here once instead of being copy-pasted.

    Relies on the notebook-level ``trainer``, ``y_train`` and ``y_val``.

    Args:
        name: Display name assigned to ``model.name`` (shown in trainer logs).
        X_tr: Training features.
        X_va: Validation features.

    Returns:
        Tuple of the fitted model wrapper and whatever
        ``trainer.train_model`` returns (indexed below by ``'val_accuracy'``).
    """
    model = LogisticRegressionModel(
        solver='lbfgs',  # lbfgs instead of liblinear to avoid warnings
        max_iter=3000    # generous iteration budget
    )
    model.name = name

    results = trainer.train_model(
        model=model,
        X_train=X_tr,
        y_train=y_train,
        X_val=X_va,
        y_val=y_val,
        optimize=False   # baseline only: no hyperparameter search
    )
    return model, results


def _to_dense(matrix):
    """Return ``matrix`` densified if it is a scipy sparse container, else as-is."""
    return matrix.toarray() if issparse(matrix) else matrix


# QUICK SCALING COMPARISON
print("⚡ SCALING IMPACT TEST")
print("=" * 30)

# Test 1: No scaling
print("\n🧪 Testing WITHOUT scaling...")
lr_unscaled, unscaled_results = _train_baseline_lr("LR_Unscaled", X_train, X_val)

# Test 2: With scaling

# Use one sparse check (scipy's own predicate) for both the diagnostic
# print and the conversion, so the two can never disagree.
print(f"Data type: {type(X_train)}")
print(f"Is sparse: {issparse(X_train)}")

# Convert to dense if sparse (this fixes the StandardScaler issue).
# Each matrix is checked on its own, so a dense X_val next to a sparse
# X_train (or the reverse) is handled correctly.
if issparse(X_train) or issparse(X_val):
    print("Converting sparse matrix to dense...")
X_train_dense = _to_dense(X_train)
X_val_dense = _to_dense(X_val)

# Now scale the dense data; the scaler is fitted on the training split only
# (per its own log output) and then applied to both splits.
scaler = SimpleScaler()
X_train_scaled, X_val_scaled = scaler.fit_transform_split(X_train_dense, X_val_dense)

lr_scaled, scaled_results = _train_baseline_lr("LR_Scaled", X_train_scaled, X_val_scaled)

# Compare
# NOTE(review): this reads results['val_accuracy'] while the trainer is
# configured with scoring='f1_weighted' -- confirm this is the metric you
# intend to compare.
print("\n📊 SCALING COMPARISON:")
print(f"   Unscaled: {unscaled_results['val_accuracy']:.4f}")
print(f"   Scaled:   {scaled_results['val_accuracy']:.4f}")
print(f"   Improvement: {scaled_results['val_accuracy'] - unscaled_results['val_accuracy']:+.4f}")

⚡ SCALING IMPACT TEST

🧪 Testing WITHOUT scaling...
🚀 Training LR_Unscaled...
🔧 Fitting LR_Unscaled...
✅ LR_Unscaled fitted in 1.47 seconds
🔄 Running 8-fold cross-validation...
✅ LR_Unscaled completed in 2.7s
   CV: 0.5071 ± 0.0288
   Train: 0.6424
   Val: 0.5250 (gap: 0.1174)
Data type: <class 'scipy.sparse._csr.csr_matrix'>
Is sparse: True
Converting sparse matrix to dense...
📏 Scaler fitted on (2100, 5000) training data
✅ Scaling completed:
   Train: (2100, 5000) (mean: 0.000, std: 1.000)
   Val: (600, 5000) (mean: -0.002, std: 1.001)
🚀 Training LR_Scaled...
🔧 Fitting LR_Scaled...
✅ LR_Scaled fitted in 17.62 seconds
🔄 Running 8-fold cross-validation...
✅ LR_Scaled completed in 26.5s
   CV: 0.6320 ± 0.0397
   Train: 0.9686
   Val: 0.6417 (gap: 0.3269)

📊 SCALING COMPARISON:
   Unscaled: 0.5250
   Scaled:   0.6417
   Improvement: +0.1167


In [9]:
from pathlib import Path

# Final optimization: hyperparameter search for logistic regression on the
# scaled features (X_train_scaled / X_val_scaled come from the scaling cell).
print("🎯 FINAL OPTIMIZATION - Best Feature Selection")
print("=" * 50)

# Define parameter spaces for different optimization strategies.
# Tuples are sampled as numeric ranges and lists as categorical choices
# (see the recorded trial parameters in this cell's output).
anti_overfitting_space = {
    'C': (0.001, 1.0),           # Strong regularization
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],          # saga supports l1/l2 + multiclass
    'class_weight': ['balanced'],
    'max_iter': (2000, 5000)
}

high_performance_space = {
    'C': (0.1, 100.0),           # Less regularization for performance
    'penalty': ['l2'],
    'solver': ['lbfgs', 'newton-cg', 'sag'],  # Fast solvers for l2
    'class_weight': [None, 'balanced'],
    'max_iter': (1000, 3000)
}

balanced_space = {
    'C': (0.01, 10.0),           # Balanced regularization
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],          # saga for l1/l2
    'class_weight': [None, 'balanced'],
    'max_iter': (1500, 4000)
}

elasticnet_space = {
    'C': (0.001, 10.0),          # Wide regularization range
    'penalty': ['elasticnet'],    # Only elasticnet
    'solver': ['saga'],          # Only solver that supports elasticnet
    'class_weight': [None, 'balanced'],
    'max_iter': (2000, 5000),
    'l1_ratio': (0.1, 0.9)       # Mix of l1 and l2 (only for elasticnet)
}

# Registry of available strategies, keyed by name.
spaces = {
    'anti_overfitting': anti_overfitting_space,
    'high_performance': high_performance_space,
    'balanced': balanced_space,
    'elasticnet': elasticnet_space
}

# Only ONE strategy is run per execution of this cell: a single search took
# roughly two hours in the recorded run, so looping over all four is not
# practical. Select it explicitly by key rather than by dict position, so that
# reordering `spaces` cannot silently change which strategy is trained.
# Change this value to try another strategy.
strategy_name = 'anti_overfitting'
param_space = spaces[strategy_name]  # KeyError on a mistyped strategy name

print(f"\n🧪 Testing {strategy_name} strategy...")

lr_strategy = LogisticRegressionModel()
lr_strategy.name = f"LR_{strategy_name}"

# NOTE: despite its name, `lr_model` holds whatever trainer.train_model
# returns (a results mapping in the scaling cell), not the estimator; the
# model object is `lr_strategy`. The name is kept so later cells that
# reference it keep working.
lr_model = trainer.train_model(
    model=lr_strategy,
    X_train=X_train_scaled,
    y_train=y_train,
    X_val=X_val_scaled,
    y_val=y_val,
    param_space=param_space,
    optimize=True
)

lr_name = f"lr_{strategy_name}_optimized"
model_path = f"../models/trained/LogisticRegression/{lr_name}.pkl"

# Make sure the target directory exists before saving, so a multi-hour run
# cannot fail at the very last step because of a missing folder.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the model was trained on scaled features; the fitted scaler
# is not persisted in this cell -- confirm it is saved elsewhere, otherwise
# the pickle cannot be used for inference on raw features.
trainer.save_model(lr_strategy, model_path)

print(f"\n💾 Model saved: {lr_name}.pkl")
print("✅ Logistic Regression optimization complete!")

[I 2025-06-19 09:05:41,987] A new study created in memory with name: no-name-f5f78ae9-9fb5-4564-b373-dba1b2f167b4


🎯 FINAL OPTIMIZATION - Best Feature Selection

🧪 Testing anti_overfitting strategy...
🚀 Training LR_anti_overfitting...
🔍 Optimizing LR_anti_overfitting hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-19 09:13:01,436] Trial 0 finished with value: 0.6374009724105645 and parameters: {'C': 0.37516557872851514, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3796}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:16:49,688] Trial 1 finished with value: 0.6232093489795265 and parameters: {'C': 0.1568626218019941, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 4599}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:26:11,027] Trial 2 finished with value: 0.6326313964556711 and parameters: {'C': 0.6015138967314656, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 4910}. Best is trial 0 with value: 0.6374009724105645.




[I 2025-06-19 09:32:18,364] Trial 3 finished with value: 0.6327922335626527 and parameters: {'C': 0.8326101981596213, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 2550}. Best is trial 0 with value: 0.6374009724105645.




[I 2025-06-19 09:38:36,123] Trial 4 finished with value: 0.6318425666853835 and parameters: {'C': 0.3049380007165782, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 2873}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:41:36,093] Trial 5 finished with value: 0.6217102531335204 and parameters: {'C': 0.612241041827657, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3099}. Best is trial 0 with value: 0.6374009724105645.




[I 2025-06-19 09:49:27,069] Trial 6 finished with value: 0.6372767794514551 and parameters: {'C': 0.4566139142328189, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3543}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:52:22,814] Trial 7 finished with value: 0.6221968166585807 and parameters: {'C': 0.5928221542931804, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 2511}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:53:29,061] Trial 8 finished with value: 0.6283087924755314 and parameters: {'C': 0.06598654139229423, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 4426}. Best is trial 0 with value: 0.6374009724105645.
[I 2025-06-19 09:55:45,609] Trial 9 finished with value: 0.6235172864292968 and parameters: {'C': 0.30530915540419734, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3320}. Best is trial 0 with value: 0.6374009724105645.
[I 202



[I 2025-06-19 10:42:05,789] Trial 15 finished with value: 0.6373236290336303 and parameters: {'C': 0.4541219326285934, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3576}. Best is trial 12 with value: 0.6382436075429607.
[I 2025-06-19 10:42:43,092] Trial 16 finished with value: 0.5703279547545035 and parameters: {'C': 0.022723184128547236, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3043}. Best is trial 12 with value: 0.6382436075429607.
[I 2025-06-19 10:49:36,686] Trial 17 finished with value: 0.6360174133211094 and parameters: {'C': 0.3463480515030184, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 4073}. Best is trial 12 with value: 0.6382436075429607.
[I 2025-06-19 10:52:26,856] Trial 18 finished with value: 0.6212306053812144 and parameters: {'C': 0.540080486695995, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 2178}. Best is trial 12 with value: 0.6382436075429607.



[I 2025-06-19 11:00:18,617] Trial 19 finished with value: 0.6311727877658437 and parameters: {'C': 0.7122658244111982, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3341}. Best is trial 12 with value: 0.6382436075429607.
✅ Optimization completed in 6876.6s
   Best score: 0.6382
   Best params: {'C': 0.42332857369445503, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced', 'max_iter': 3712}
🔧 Fitting LR_anti_overfitting...
✅ LR_anti_overfitting fitted in 342.64 seconds
🔄 Running 8-fold cross-validation...
✅ LR_anti_overfitting completed in 7643.9s
   CV: 0.6382 ± 0.0432
   Train: 0.9386
   Val: 0.6667 (gap: 0.2719)
💾 Model saved to: ../models/trained/LogisticRegression/lr_anti_overfitting_optimized.pkl

💾 Model saved: lr_anti_overfitting_optimized.pkl
✅ Logistic Regression optimization complete!
