In [4]:
"""
================================================================================
PPP-Q ENHANCED CLASSIFIER - PRODUCTION TRAINING SYSTEM
================================================================================
Multi-Asset Investment Strategy Classifier with Cycle Awareness

Models: LightGBM (Primary) + XGBoost (Ensemble) + Random Forest (Baseline)
Output: A/B/C/D + Volatility + Cycle Position + Entry Signal + Actionable Insights

Author: Bilal Ahmad Sheikh
Institution: GIKI
Date: December 2024
================================================================================
"""

import pandas as pd
import numpy as np
import json
import pickle
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score,
    accuracy_score, balanced_accuracy_score
)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# ============================================================================
# CONFIGURATION
# ============================================================================

class EnhancedTrainingConfig:
    """Central configuration for the PPP-Q training run.

    Groups the filesystem locations, the hyper-parameters of the three
    models (LightGBM, XGBoost, Random Forest), the shared boosting schedule
    and the asset groupings used to label generated insights. Everything is
    a class attribute, so values can be read from the class or an instance.
    """

    # Paths
    DATA_DIR = '/content/'
    MODEL_DIR = 'models/pppq/'
    REPORTS_DIR = 'reports/pppq/'

    # Boosting schedule shared by both gradient-boosted models. Declared before
    # the parameter dicts so XGB_PARAMS can reference these values instead of
    # repeating the literals (which could silently drift apart).
    NUM_BOOST_ROUND = 500
    EARLY_STOPPING_ROUNDS = 50
    VERBOSE_EVAL = 50

    # Primary Model: LightGBM
    # NOTE(review): the LightGBM parameter docs describe `is_unbalance` as used
    # only by the binary and multiclassova objectives, so with
    # objective='multiclass' it most likely has no effect - confirm, and
    # consider per-row weights if class balancing is actually wanted.
    LGBM_PARAMS = {
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_depth': 7,
        'min_child_samples': 20,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'random_state': 42,
        'n_jobs': -1,
        'is_unbalance': True
    }

    # Secondary Model: XGBoost (Ensemble diversity)
    XGB_PARAMS = {
        'objective': 'multi:softprob',
        'num_class': 4,
        'eval_metric': 'mlogloss',
        'max_depth': 7,
        'learning_rate': 0.05,
        'n_estimators': NUM_BOOST_ROUND,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': 42,
        'n_jobs': -1,
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS
    }

    # Baseline: Random Forest (For comparison)
    RF_PARAMS = {
        'n_estimators': 300,
        'max_depth': 15,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1
    }

    # Asset categories (matched against the 'Asset' column when labelling insights)
    CRYPTO_ASSETS = ['Bitcoin', 'Ethereum', 'Litecoin']
    PRECIOUS_METALS = ['Gold', 'Silver']
    EQUITY_INDICES = ['SP500', 'NASDAQ', 'DowJones']
    COMMODITIES = ['Oil']
    ETFS = ['Gold_ETF', 'TreasuryBond_ETF', 'RealEstate_ETF']
    # NOTE(review): this group also holds JPMorgan, and the insight generator
    # labels every asset outside the other groups as 'Tech Stock'.
    TECH_STOCKS = ['Apple', 'Microsoft', 'JPMorgan']

# Single configuration object read by every later step of the script.
config = EnhancedTrainingConfig()

# Ensure the model, report and visualization output folders exist
# (exist_ok=True makes re-running the cell harmless).
output_dirs = (
    config.MODEL_DIR,
    config.REPORTS_DIR,
    f"{config.REPORTS_DIR}visualizations/",
)
for directory in output_dirs:
    os.makedirs(directory, exist_ok=True)

# Opening banner.
banner_rule = "=" * 80
print(banner_rule)
print("  PPP-Q ENHANCED CLASSIFIER - PRODUCTION TRAINING")
print(banner_rule)

# ============================================================================
# 1. LOAD & VALIDATE DATA
# ============================================================================

print("\n" + "="*80)
print("STEP 1: LOADING & VALIDATING DATA")
print("="*80)

# Paths are assembled with os.path.join so DATA_DIR works with or without a
# trailing separator (plain string concatenation only works with one).
train_df = pd.read_csv(os.path.join(config.DATA_DIR, 'pppq_train.csv'))
val_df = pd.read_csv(os.path.join(config.DATA_DIR, 'pppq_val.csv'))
test_df = pd.read_csv(os.path.join(config.DATA_DIR, 'pppq_test.csv'))

print(f"\n‚úÖ Data loaded:")
print(f"   Train: {train_df.shape}")
print(f"   Val:   {val_df.shape}")
print(f"   Test:  {test_df.shape}")

# Load feature metadata; the encoding is pinned so the read does not depend on
# the platform's default text encoding.
with open(os.path.join(config.DATA_DIR, 'pppq_features.json'), 'r', encoding='utf-8') as f:
    feature_metadata = json.load(f)

print(f"\nüìã Features: {feature_metadata['num_features']}")
print(f"   Classes: {feature_metadata['classes']}")

# ============================================================================
# 2. PREPARE FEATURES
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 2: PREPARING FEATURES")
print(section_rule)

# Columns that are not used as model inputs; every other training column is a feature.
exclude_cols = ['Date', 'Asset', 'PPP_Q_Class', 'Inflation_Regime', 'Asset_Category']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

print(f"\nüìä Feature columns: {len(feature_cols)}")


def _split_frame(frame):
    """Return (features with NaN replaced by 0, target labels, asset names) for one split."""
    return frame[feature_cols].fillna(0), frame['PPP_Q_Class'], frame['Asset']


X_train, y_train, asset_train = _split_frame(train_df)
X_val, y_val, asset_val = _split_frame(val_df)
X_test, y_test, asset_test = _split_frame(test_df)

# Fit the label encoder on the training labels only, then reuse it for val/test.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_val, y_test)
)

# Encoded integer -> original class name.
class_mapping = dict(enumerate(label_encoder.classes_))

print(f"\nüè∑Ô∏è  Class Mapping:")
for class_idx, class_name in class_mapping.items():
    print(f"   {class_idx}: {class_name}")

# Per-split class counts and percentages.
print(f"\nüìä Class Distribution:")
encoded_splits = (
    ('TRAIN', y_train_encoded),
    ('VAL', y_val_encoded),
    ('TEST', y_test_encoded),
)
for split_name, y_encoded in encoded_splits:
    print(f"\n   {split_name}:")
    n_rows = len(y_encoded)
    class_counts = pd.Series(y_encoded).value_counts().sort_index()
    for class_idx, count in class_counts.items():
        print(f"   {class_mapping[class_idx]}: {count:,} ({count/n_rows*100:.1f}%)")

# ============================================================================
# 3. TRAIN PRIMARY MODEL - LIGHTGBM
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 3: TRAINING PRIMARY MODEL - LIGHTGBM")
print(section_rule)

# The validation Dataset references the training Dataset.
lgb_train_data = lgb.Dataset(X_train, label=y_train_encoded, feature_name=feature_cols)
lgb_val_data = lgb.Dataset(X_val, label=y_val_encoded, reference=lgb_train_data)

print(f"\nüöÄ Training LightGBM...")

# Callbacks: early stopping, periodic logging, and recording of the metric
# history into `evals_result`.
evals_result = {}
callbacks = [
    lgb.early_stopping(stopping_rounds=config.EARLY_STOPPING_ROUNDS),
    lgb.log_evaluation(period=config.VERBOSE_EVAL),
    lgb.record_evaluation(evals_result),
]

start_time = datetime.now()
lgbm_model = lgb.train(
    params=config.LGBM_PARAMS,
    train_set=lgb_train_data,
    num_boost_round=config.NUM_BOOST_ROUND,
    valid_sets=[lgb_train_data, lgb_val_data],
    valid_names=['train', 'val'],
    callbacks=callbacks,
)
elapsed = datetime.now() - start_time
lgbm_time = elapsed.total_seconds()

print(f"\n‚úÖ LightGBM trained!")
print(f"   Best iteration: {lgbm_model.best_iteration}")
print(f"   Training time: {lgbm_time:.2f}s")

# ============================================================================
# 4. TRAIN SECONDARY MODEL - XGBOOST (Optional Ensemble)
# ============================================================================

print("\n" + "="*80)
print("STEP 4: TRAINING SECONDARY MODEL - XGBOOST")
print("="*80)

print(f"\nüöÄ Training XGBoost...")

start_time = datetime.now()

# Build the classifier from config.XGB_PARAMS instead of repeating every
# hyper-parameter here, so the configuration class stays the single source of
# truth. The dict carries `early_stopping_rounds`, which this script passes to
# the constructor rather than to fit().
xgb_model = xgb.XGBClassifier(**config.XGB_PARAMS)

# The validation split is the eval_set that early stopping monitors.
xgb_model.fit(
    X_train, y_train_encoded,
    eval_set=[(X_val, y_val_encoded)],
    verbose=False
)

xgb_time = (datetime.now() - start_time).total_seconds()

print(f"‚úÖ XGBoost trained!")
print(f"   Best iteration: {xgb_model.best_iteration}")
print(f"   Training time: {xgb_time:.2f}s")

# ============================================================================
# 5. TRAIN BASELINE - RANDOM FOREST
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 5: TRAINING BASELINE - RANDOM FOREST")
print(section_rule)

print(f"\nüöÄ Training Random Forest...")

# Time construction and fitting together; fit() returns the fitted estimator.
start_time = datetime.now()
rf_model = RandomForestClassifier(**config.RF_PARAMS).fit(X_train, y_train_encoded)
elapsed = datetime.now() - start_time
rf_time = elapsed.total_seconds()

print(f"‚úÖ Random Forest trained!")
print(f"   Training time: {rf_time:.2f}s")

# ============================================================================
# 6. EVALUATE ALL MODELS
# ============================================================================

print(f"\n{section_rule}")
print("STEP 6: EVALUATING ALL MODELS")
print(section_rule)

def evaluate_model(model, X, y_true_encoded, model_name, model_type='lgb', split_name='TEST'):
    """Score a fitted classifier on one data split and print a report.

    Relies on the module-level ``label_encoder`` to translate encoded class
    indices back to class names for the printed report.

    Args:
        model: Fitted model. With ``model_type='lgb'`` it must provide
            ``predict(X, num_iteration=...)`` and a ``best_iteration``
            attribute; for any other type it must provide ``predict_proba(X)``.
        X: Feature matrix, passed to the model unchanged.
        y_true_encoded: Encoded (integer) true labels for the rows of ``X``.
        model_name: Display name used in the header and in the result dict.
        model_type: ``'lgb'`` selects the LightGBM booster path; every other
            value (e.g. ``'xgb'``, ``'rf'``) uses ``predict_proba``.
        split_name: Name of the evaluated split shown in the header. The
            default ``'TEST'`` reproduces the previously hard-coded header.

    Returns:
        dict with the model name, predicted class indices, class
        probabilities, accuracy, balanced accuracy, macro and weighted F1,
        the confusion matrix, and the class indices / class names that occur
        in either the true or the predicted labels.
    """

    # Predict class probabilities (one column per class; argmax runs over axis 1).
    if model_type == 'lgb':
        y_pred_proba = model.predict(X, num_iteration=model.best_iteration)
    else:  # 'xgb' and 'rf' share the scikit-learn style API
        y_pred_proba = model.predict_proba(X)

    y_pred_encoded = np.argmax(y_pred_proba, axis=1)

    # Metrics
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    balanced_acc = balanced_accuracy_score(y_true_encoded, y_pred_encoded)

    # Restrict averaged scores and the report to classes that actually occur,
    # so a class absent from both y_true and y_pred is not scored.
    unique_classes = np.unique(np.concatenate([y_true_encoded, y_pred_encoded]))
    macro_f1 = f1_score(y_true_encoded, y_pred_encoded, average='macro', labels=unique_classes)
    weighted_f1 = f1_score(y_true_encoded, y_pred_encoded, average='weighted', labels=unique_classes)

    print(f"\n{'='*80}")
    print(f"{model_name.upper()} - {split_name} SET RESULTS")
    print(f"{'='*80}")

    print(f"\nüìä Metrics:")
    print(f"   Accuracy:          {accuracy:.4f}")
    print(f"   Balanced Accuracy: {balanced_acc:.4f}")
    print(f"   Macro F1:          {macro_f1:.4f} ‚Üê PRIMARY METRIC")
    print(f"   Weighted F1:       {weighted_f1:.4f}")

    present_classes = [label_encoder.classes_[i] for i in unique_classes]
    print(f"\nüìã Classification Report:")
    print(classification_report(
        y_true_encoded, y_pred_encoded,
        labels=unique_classes, target_names=present_classes, zero_division=0
    ))

    cm = confusion_matrix(y_true_encoded, y_pred_encoded, labels=unique_classes)

    return {
        'model_name': model_name,
        'predictions_encoded': y_pred_encoded,
        'probabilities': y_pred_proba,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'confusion_matrix': cm,
        'unique_classes': unique_classes,
        'present_classes': present_classes
    }

# Score each fitted model on the held-out test split.
lgbm_results = evaluate_model(
    model=lgbm_model, X=X_test, y_true_encoded=y_test_encoded,
    model_name='LightGBM', model_type='lgb',
)
xgb_results = evaluate_model(
    model=xgb_model, X=X_test, y_true_encoded=y_test_encoded,
    model_name='XGBoost', model_type='xgb',
)
rf_results = evaluate_model(
    model=rf_model, X=X_test, y_true_encoded=y_test_encoded,
    model_name='Random Forest', model_type='rf',
)

# ============================================================================
# 7. ENSEMBLE PREDICTIONS (Average of LightGBM + XGBoost)
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 7: CREATING ENSEMBLE MODEL")
print(section_rule)

# Soft-voting ensemble: unweighted mean of the two boosted models'
# class probabilities, then the most probable class per row.
summed_proba = lgbm_results['probabilities'] + xgb_results['probabilities']
ensemble_proba = summed_proba / 2
ensemble_pred = ensemble_proba.argmax(axis=1)

ensemble_acc = accuracy_score(y_test_encoded, ensemble_pred)
ensemble_macro_f1 = f1_score(y_test_encoded, ensemble_pred, average='macro')

print(f"\nüìä ENSEMBLE (LightGBM + XGBoost) RESULTS:")
print(f"   Accuracy:    {ensemble_acc:.4f}")
print(f"   Macro F1:    {ensemble_macro_f1:.4f}")

# ============================================================================
# 8. FEATURE IMPORTANCE ANALYSIS
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 8: FEATURE IMPORTANCE ANALYSIS")
print(section_rule)

# Gain-based importance from the LightGBM booster, paired with the feature
# names and ranked from most to least important.
importance_gain = lgbm_model.feature_importance(importance_type='gain')
importance_table = {'feature': feature_cols, 'importance_gain': importance_gain}
importance_df = (
    pd.DataFrame(importance_table)
    .sort_values('importance_gain', ascending=False)
)

print(f"\nüîù Top 20 Features:")
top_features_text = importance_df.head(20).to_string(index=False)
print(top_features_text)

# ============================================================================
# 9. GENERATE ACTIONABLE INSIGHTS
# ============================================================================

print(f"\n{section_rule}")
print("STEP 9: GENERATING ACTIONABLE INSIGHTS")
print(section_rule)

def _numeric_feature(row, key, default):
    """Read ``row[key]``, using ``default`` when the key is absent or the value is None/NaN."""
    value = row.get(key, default)
    return default if pd.isna(value) else value


def generate_investment_signal(row, pred_class, confidence, lgbm_proba):
    """Build the actionable-insight record for one asset observation.

    Missing feature values (absent key, ``None`` or NaN) fall back to the
    per-feature default. Without that, a NaN fails every threshold comparison
    and silently lands in the last branch of each rule, e.g. a NaN
    ATH distance would produce a 'DEEP_VALUE' / 'BUY' signal.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that supports
            ``row['Asset']`` and ``row.get(name, default)``.
        pred_class: Predicted class label, copied into the result.
        confidence: Probability of the predicted class as a fraction;
            reported as a percentage rounded to one decimal.
        lgbm_proba: Full class-probability vector. Currently unused; kept so
            existing callers keep working.

    Returns:
        dict describing the asset: category, predicted class, confidence,
        volatility level, cycle position, entry signal, growth potential,
        up to three strengths and three weaknesses, and the rounded raw
        metrics the rules were based on.
    """

    asset = row['Asset']

    # Determine asset category: the first configured group containing the
    # asset wins; anything not listed falls back to 'Tech Stock'.
    category_by_group = (
        (config.CRYPTO_ASSETS, 'Crypto'),
        (config.PRECIOUS_METALS, 'Precious Metal'),
        (config.EQUITY_INDICES, 'Equity Index'),
        (config.COMMODITIES, 'Commodity'),
        (config.ETFS, 'ETF'),
    )
    category = next(
        (label for members, label in category_by_group if asset in members),
        'Tech Stock',
    )

    # Volatility assessment
    volatility = _numeric_feature(row, 'Volatility_90D', 0)
    if volatility < 15:
        vol_level = 'LOW'
    elif volatility < 30:
        vol_level = 'MEDIUM'
    elif volatility < 50:
        vol_level = 'HIGH'
    else:
        vol_level = 'EXTREME'

    # Cycle position: the further below the all-time high, the stronger the
    # entry signal.
    distance_ath = _numeric_feature(row, 'Distance_From_ATH_Pct', 0)
    if distance_ath > -10:
        cycle_pos = 'NEAR_ATH'
        entry_signal = 'WAIT'
    elif distance_ath > -30:
        cycle_pos = 'CORRECTION'
        entry_signal = 'WATCH'
    elif distance_ath > -50:
        cycle_pos = 'VALUE_ZONE'
        entry_signal = 'CONSIDER'
    else:
        cycle_pos = 'DEEP_VALUE'
        entry_signal = 'BUY'

    # Growth potential: lower market-cap saturation means more room to grow.
    saturation = _numeric_feature(row, 'Market_Cap_Saturation_Pct', 50)
    if saturation < 20:
        growth = 'HIGH'
    elif saturation < 50:
        growth = 'MEDIUM'
    elif saturation < 80:
        growth = 'LOW'
    else:
        growth = 'SATURATED'

    # Strengths & Weaknesses (collected in a fixed order, truncated on return)
    strengths = []
    weaknesses = []

    pp_mult = _numeric_feature(row, 'PP_Multiplier_5Y', 1.0)
    if pp_mult > 1.5:
        strengths.append(f"Strong PP growth ({pp_mult:.2f}x)")
    elif pp_mult < 1.0:
        weaknesses.append(f"Losing purchasing power ({pp_mult:.2f}x)")

    sharpe = _numeric_feature(row, 'Sharpe_Ratio_5Y', 0)
    if sharpe > 1.0:
        strengths.append(f"Excellent risk-adjusted returns (Sharpe: {sharpe:.2f})")
    elif sharpe < 0.3:
        weaknesses.append(f"Poor risk-adjusted returns (Sharpe: {sharpe:.2f})")

    max_dd = _numeric_feature(row, 'Max_Drawdown', 0)
    if max_dd > 50:
        weaknesses.append(f"Severe drawdowns ({max_dd:.1f}%)")
    elif max_dd < 20:
        strengths.append(f"Low drawdowns ({max_dd:.1f}%)")

    if volatility < 15:
        strengths.append("Low volatility (stable)")
    elif volatility > 40:
        weaknesses.append("Extreme volatility")

    if distance_ath < -50:
        strengths.append("Far from ATH (value opportunity)")
    elif distance_ath > -5:
        weaknesses.append("Near ATH (pullback risk)")

    if growth == 'HIGH':
        strengths.append("High growth potential (low saturation)")
    elif growth == 'SATURATED':
        weaknesses.append("Limited upside (market saturated)")

    return {
        'asset': asset,
        'category': category,
        'predicted_class': pred_class,
        'confidence': round(confidence * 100, 1),
        'volatility': vol_level,
        'volatility_value': round(volatility, 1),
        'cycle_position': cycle_pos,
        'distance_from_ath': round(distance_ath, 1),
        'entry_signal': entry_signal,
        'growth_potential': growth,
        'market_cap_saturation': round(saturation, 1),
        'strengths': strengths[:3],  # Top 3
        'weaknesses': weaknesses[:3],  # Top 3
        'pp_multiplier_5y': round(pp_mult, 3),
        'sharpe_ratio_5y': round(sharpe, 3),
        'max_drawdown': round(max_dd, 1)
    }

print("\nüî® Generating insights for test set...")

# Pair each test row with its prediction by POSITION. iterrows() yields index
# labels, which only coincide with array positions for a default RangeIndex,
# so the label is ignored here.
predicted_codes = lgbm_results['predictions_encoded']
predicted_probas = lgbm_results['probabilities']

insights_list = []
for (_, row), class_code, row_proba in zip(test_df.iterrows(), predicted_codes, predicted_probas):
    pred_class = label_encoder.classes_[class_code]
    confidence = row_proba.max()

    insights = generate_investment_signal(row, pred_class, confidence, row_proba)
    insights_list.append(insights)

insights_df = pd.DataFrame(insights_list)

# Save insights
insights_path = config.REPORTS_DIR + 'investment_insights.csv'
insights_df.to_csv(insights_path, index=False)

print(f"‚úÖ Generated {len(insights_df)} actionable insights")
print(f"\nüìã Sample Insights (Bitcoin):")
bitcoin_insights = insights_df[insights_df['asset'] == 'Bitcoin'].head(5)
if bitcoin_insights.empty:
    # Without this guard, iloc[0] raises IndexError when the test split holds
    # no Bitcoin rows.
    print("   (no Bitcoin rows in the test set)")
else:
    first_bitcoin = bitcoin_insights.iloc[0]
    for col in ['asset', 'predicted_class', 'confidence', 'volatility', 'entry_signal', 'cycle_position']:
        if col in bitcoin_insights.columns:
            print(f"   {col}: {first_bitcoin[col]}")

# ============================================================================
# 10. SAVE MODELS & ARTIFACTS
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 10: SAVING MODELS & ARTIFACTS")
print(section_rule)

# Boosted models use their own native save formats.
lgbm_model.save_model(config.MODEL_DIR + 'lgbm_model.txt')
xgb_model.save_model(config.MODEL_DIR + 'xgb_model.json')

# Random Forest and the label encoder are pickled.
pickled_artifacts = (
    ('rf_model.pkl', rf_model),
    ('label_encoder.pkl', label_encoder),
)
for pickle_name, pickled_object in pickled_artifacts:
    with open(config.MODEL_DIR + pickle_name, 'wb') as f:
        pickle.dump(pickled_object, f)

# The feature column order is stored next to the models.
with open(config.MODEL_DIR + 'feature_columns.json', 'w') as f:
    json.dump(feature_cols, f, indent=2)

importance_df.to_csv(config.REPORTS_DIR + 'feature_importance.csv', index=False)


def _headline_scores(results):
    """Return macro F1 and accuracy from an evaluation result as plain floats."""
    return {
        'macro_f1': float(results['macro_f1']),
        'accuracy': float(results['accuracy']),
    }


# NOTE(review): only the two boosted models are compared here; Random Forest
# and the ensemble are never candidates for 'best_model'.
if lgbm_results['macro_f1'] >= xgb_results['macro_f1']:
    best_model_name = 'LightGBM'
else:
    best_model_name = 'XGBoost'

# Comprehensive, JSON-serialisable summary of the run.
results_summary = {
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'models': {
        'lightgbm': {
            **_headline_scores(lgbm_results),
            'best_iteration': int(lgbm_model.best_iteration),
            'training_time_seconds': lgbm_time,
        },
        'xgboost': {
            **_headline_scores(xgb_results),
            'best_iteration': int(xgb_model.best_iteration),
            'training_time_seconds': xgb_time,
        },
        'random_forest': {
            **_headline_scores(rf_results),
            'training_time_seconds': rf_time,
        },
        'ensemble': {
            'macro_f1': float(ensemble_macro_f1),
            'accuracy': float(ensemble_acc),
        },
    },
    'best_model': best_model_name,
    'top_10_features': importance_df['feature'].head(10).tolist(),
    'num_features': len(feature_cols),
    'classes': label_encoder.classes_.tolist(),
}

with open(config.REPORTS_DIR + 'training_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"\nüíæ All models and artifacts saved!")

# ============================================================================
# 11. FINAL SUMMARY
# ============================================================================

section_rule = "=" * 80
print(f"\n{section_rule}")
print("‚úÖ TRAINING COMPLETE!")
print(section_rule)

# Side-by-side test-set scores, one table row per model.
print(f"\nüìä MODEL PERFORMANCE COMPARISON (Test Set):")
print(f"   {'Model':<20} {'Macro F1':<12} {'Accuracy':<12}")
print(f"   {'-'*44}")
comparison_rows = (
    ('LightGBM', lgbm_results['macro_f1'], lgbm_results['accuracy']),
    ('XGBoost', xgb_results['macro_f1'], xgb_results['accuracy']),
    ('Random Forest', rf_results['macro_f1'], rf_results['accuracy']),
    ('Ensemble', ensemble_macro_f1, ensemble_acc),
)
for row_label, row_macro_f1, row_accuracy in comparison_rows:
    print(f"   {row_label:<20} {row_macro_f1:<12.4f} {row_accuracy:<12.4f}")

print(f"\nüèÜ BEST MODEL: {results_summary['best_model']}")

# Where the run's artifacts were written.
print(f"\nüìÅ OUTPUT FILES:")
output_locations = (
    ('Models', config.MODEL_DIR),
    ('Reports', config.REPORTS_DIR),
    ('Insights', insights_path),
)
for output_label, output_location in output_locations:
    print(f"   {output_label}: {output_location}")

print(f"\n{section_rule}")

  PPP-Q ENHANCED CLASSIFIER - PRODUCTION TRAINING

STEP 1: LOADING & VALIDATING DATA

‚úÖ Data loaded:
   Train: (65745, 43)
   Val:   (10950, 43)
   Test:  (10695, 43)

üìã Features: 38
   Classes: ['A_PRESERVER', 'B_PARTIAL', 'C_ERODER', 'D_DESTROYER']

STEP 2: PREPARING FEATURES

üìä Feature columns: 38

üè∑Ô∏è  Class Mapping:
   0: A_PRESERVER
   1: B_PARTIAL
   2: C_ERODER
   3: D_DESTROYER

üìä Class Distribution:

   TRAIN:
   A_PRESERVER: 21,366 (32.5%)
   B_PARTIAL: 20,283 (30.9%)
   C_ERODER: 19,834 (30.2%)
   D_DESTROYER: 4,262 (6.5%)

   VAL:
   A_PRESERVER: 3,942 (36.0%)
   B_PARTIAL: 5,550 (50.7%)
   C_ERODER: 1,323 (12.1%)
   D_DESTROYER: 135 (1.2%)

   TEST:
   A_PRESERVER: 4,332 (40.5%)
   B_PARTIAL: 3,999 (37.4%)
   C_ERODER: 1,495 (14.0%)
   D_DESTROYER: 869 (8.1%)

STEP 3: TRAINING PRIMARY MODEL - LIGHTGBM

üöÄ Training LightGBM...
Training until validation scores don't improve for 50 rounds
[50]	train's multi_logloss: 0.0380366	val's multi_logloss: 0.174263
[1