In [5]:
import sys
sys.path.append('..')

In [6]:
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from ml_pipeline import ModelTrainer


In [22]:
# Initialize trainer for loading
trainer = ModelTrainer()

# =============================================================================
# 1. LOAD YOUR EXISTING MODELS
# =============================================================================
print("\n📦 Loading existing trained models...")

# Load RF model and extract the raw sklearn model
try:
    rf_wrapper = trainer.load_model('../models/trained/RandomForest/high_performance_with_feature_selection_F-Score_1500.pkl')
    rf_sklearn_model = rf_wrapper.model  # Extract the actual RandomForestClassifier
    print(f"✅ RF model loaded: {rf_wrapper.name}")
    print(f"   Raw model type: {type(rf_sklearn_model)}")
    print(f"   Is fitted: {hasattr(rf_sklearn_model, 'classes_')}")
except Exception as e:
    print(f"❌ Failed to load RF model: {e}")
    rf_sklearn_model = None

# Load LR model and extract the raw sklearn model
try:
    lr_wrapper = trainer.load_model('../models/trained/LogisticRegression/lr_anti_overfitting_optimized.pkl')
    lr_sklearn_model = lr_wrapper.model  # Extract the actual LogisticRegression
    print(f"✅ LR model loaded: {lr_wrapper.name}")
    print(f"   Raw model type: {type(lr_sklearn_model)}")
    print(f"   Is fitted: {hasattr(lr_sklearn_model, 'classes_')}")
except Exception as e:
    print(f"❌ Failed to load LR model: {e}")
    lr_sklearn_model = None


📦 Loading existing trained models...
📂 Model loaded from: ../models/trained/RandomForest/high_performance_with_feature_selection_F-Score_1500.pkl
   Model: Optimized_F-Score_1500
   Fitted: True
✅ RF model loaded: Optimized_F-Score_1500
   Raw model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
   Is fitted: True
📂 Model loaded from: ../models/trained/LogisticRegression/lr_anti_overfitting_optimized.pkl
   Model: LR_anti_overfitting
   Fitted: True
✅ LR model loaded: LR_anti_overfitting
   Raw model type: <class 'sklearn.linear_model._logistic.LogisticRegression'>
   Is fitted: True


In [23]:
# Load the original data to fit preprocessing
from utils import load_train_val_data
X_train, X_val, y_train, y_val = load_train_val_data()

📥 Loading train/val data...
✅ Data loaded:
   X_train: (2100, 5000)
   X_val: (600, 5000)
   y_train: 2100 samples
   y_val: 600 samples


In [None]:
# RF Pipeline: Feature Selection (F-Score, k=1500)
if rf_model is not None:
    print("\n🌳 Creating RF pipeline with feature selection...")
    
    # Create pipeline with unfitted components first
    rf_feature_selector = SelectKBest(f_classif, k=1500)
    rf_pipeline = Pipeline([
        ('feature_selection', rf_feature_selector),
        ('model', rf_model.model)  # Use the actual sklearn model, not wrapper
    ])
    
    # Fit the entire pipeline
    print("   Fitting RF pipeline...")
    rf_pipeline.fit(X_train, y_train)
    
    print(f"✅ RF pipeline fitted and ready")
    print(f"   Features: 5000 → 1500 (70% reduction)")
    
    # Test the pipeline
    rf_test_pred = rf_pipeline.predict(X_val[:5])  # Test on small sample
    print(f"✅ RF pipeline created and tested")
    print(f"   Features: 5000 → 1500 (70% reduction)")
    
    # Save complete RF pipeline using pickle directly (avoid ModelTrainer.save_model)
    import os
    import pickle
    os.makedirs('../models/pipelines', exist_ok=True)
    
    with open('../models/pipelines/rf_complete_pipeline.pkl', 'wb') as f:
        pickle.dump(rf_pipeline, f)
    print(f"💾 RF pipeline saved: rf_complete_pipeline.pkl")



🌳 Creating RF pipeline with feature selection...
   Fitting RF pipeline...
✅ RF pipeline fitted and ready
   Features: 5000 → 1500 (70% reduction)
✅ RF pipeline created and tested
   Features: 5000 → 1500 (70% reduction)
💾 RF pipeline saved: rf_complete_pipeline.pkl


In [None]:
# LR Pipeline: Standard Scaling
if lr_model is not None:
    print("\n📊 Creating LR pipeline with scaling...")
    
    # Convert sparse to dense (as done in your training)
    if hasattr(X_train, 'toarray'):
        X_train_dense = X_train.toarray()
        X_val_dense = X_val.toarray()
    else:
        X_train_dense = X_train
        X_val_dense = X_val
    
    # Create pipeline with unfitted components
    lr_scaler = StandardScaler()
    lr_pipeline = Pipeline([
        ('scaler', lr_scaler),
        ('model', lr_model.model)  # Use the actual sklearn model, not wrapper
    ])
    
    # Fit the entire pipeline
    print("   Fitting LR pipeline...")
    lr_pipeline.fit(X_train_dense, y_train)
    
    print(f"✅ LR pipeline fitted and ready")
    print(f"   Preprocessing: Raw → Scaled (StandardScaler)")
    
    # Test the pipeline
    lr_test_pred = lr_pipeline.predict(X_val_dense[:5])  # Test on small sample
    
    # Save complete LR pipeline using pickle directly
    with open('../models/pipelines/lr_complete_pipeline.pkl', 'wb') as f:
        pickle.dump(lr_pipeline, f)
    print(f"💾 LR pipeline saved: lr_complete_pipeline.pkl")


📊 Creating LR pipeline with scaling...
✅ LR pipeline created and tested
   Preprocessing: Raw → Scaled (StandardScaler)
💾 LR pipeline saved: lr_complete_pipeline.pkl




In [None]:
class EnsemblePipelineWrapper:
    """Simple wrapper to handle sparse/dense conversion"""
    
    def __init__(self, pipeline, name):
        self.pipeline = pipeline
        self.name = name
        
    def fit(self, X, y):
        return self  # Already fitted
        
    def predict(self, X):
        if hasattr(X, 'toarray') and 'scaler' in [step[0] for step in self.pipeline.steps]:
            X = X.toarray()
        return self.pipeline.predict(X)
        
    def predict_proba(self, X):
        if hasattr(X, 'toarray') and 'scaler' in [step[0] for step in self.pipeline.steps]:
            X = X.toarray()
        return self.pipeline.predict_proba(X)

In [None]:
# Create wrapped pipelines
if rf_model is not None and lr_model is not None:
    rf_ensemble = EnsemblePipelineWrapper(rf_pipeline, "RF_FeatureSelected")
    lr_ensemble = EnsemblePipelineWrapper(lr_pipeline, "LR_Scaled")
    
    print(f"✅ Ensemble wrappers created:")
    print(f"   - {rf_ensemble.name}: Feature selection pipeline")
    print(f"   - {lr_ensemble.name}: Scaling pipeline")
    
    # Save ensemble-ready models using pickle directly
    with open('../models/pipelines/rf_ensemble_ready.pkl', 'wb') as f:
        pickle.dump(rf_ensemble, f)
    with open('../models/pipelines/lr_ensemble_ready.pkl', 'wb') as f:
        pickle.dump(lr_ensemble, f)
    print(f"💾 Ensemble-ready models saved")

✅ Ensemble wrappers created:
   - RF_FeatureSelected: Feature selection pipeline
   - LR_Scaled: Scaling pipeline
💾 Ensemble-ready models saved


In [21]:
print("\n🎯 Creating ensemble with pipelines...")

if rf_model is not None and lr_model is not None:
    from sklearn.ensemble import VotingClassifier
    
    # Use pipelines directly - they're already sklearn-compatible
    ensemble = VotingClassifier(
        estimators=[
            ('rf', rf_pipeline),  # Use pipeline directly
            ('lr', lr_pipeline)   # Use pipeline directly  
        ],
        voting='soft'
    )
    
    # For LR pipeline, we need to handle sparse input conversion
    # Create a custom predict method that handles this
    class SmartVotingClassifier(VotingClassifier):
        def predict(self, X):
            # Handle sparse input for LR pipeline
            predictions = []
            for name, estimator in self.estimators:
                if name == 'lr' and hasattr(X, 'toarray'):
                    pred = estimator.predict(X.toarray())
                else:
                    pred = estimator.predict(X)
                predictions.append(pred)
            
            # Use majority voting for final prediction
            import numpy as np
            from scipy import stats
            stacked_preds = np.column_stack(predictions)
            final_preds = stats.mode(stacked_preds, axis=1)[0].flatten()
            return final_preds
            
        def predict_proba(self, X):
            # Handle sparse input and average probabilities
            probas = []
            for name, estimator in self.estimators:
                if name == 'lr' and hasattr(X, 'toarray'):
                    proba = estimator.predict_proba(X.toarray())
                else:
                    proba = estimator.predict_proba(X)
                probas.append(proba)
            
            # Average probabilities
            import numpy as np
            return np.mean(probas, axis=0)
    
    # Create smart ensemble
    ensemble = SmartVotingClassifier(
        estimators=[
            ('rf', rf_pipeline),
            ('lr', lr_pipeline)
        ],
        voting='soft'
    )
    
    # Fit the ensemble with a small sample
    print("   Fitting ensemble...")
    sample_size = min(100, X_train.shape[0]) 
    ensemble.fit(X_train[:sample_size], y_train[:sample_size])
    
    print(f"📊 Testing ensemble on validation data...")
    
    # Test with raw sparse data
    ensemble_pred = ensemble.predict(X_val[:100])
    ensemble_proba = ensemble.predict_proba(X_val[:100])
    
    print(f"✅ Ensemble test successful!")
    print(f"   Predictions shape: {ensemble_pred.shape}")
    print(f"   Probabilities shape: {ensemble_proba.shape}")
    print(f"   Sample prediction: {ensemble_pred[:5]}")
    
    # Save the ensemble
    with open('../models/pipelines/rf_lr_ensemble.pkl', 'wb') as f:
        pickle.dump(ensemble, f)
    print(f"💾 Complete ensemble saved: rf_lr_ensemble.pkl")


🎯 Creating ensemble with pipelines...
   Fitting ensemble...


ValueError: The estimator Pipeline should be a classifier.