In [None]:
# Ask the user for the dataset via Colab's browser upload widget.
from google.colab import files

# `files.upload()` blocks until the upload finishes; the result is a mapping
# keyed by the uploaded file names.
uploaded = files.upload()
print(f"Uploaded: {list(uploaded)}")

In [None]:
# Install required packages
!pip install xgboost lightgbm shap -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import xgboost as xgb
import lightgbm as lgb
import shap

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries loaded!")

## Step 2: Load and Explore Data

In [None]:
# Read the dataset from the working directory and summarise the target column.
df = pd.read_csv('runner_dataset.csv')

print(f"Dataset shape: {df.shape}")

# Class balance: raw counts first, then the share of rows with label == 1
# (the mean of the label column, which the code below treats as 0/1).
label_column = df['label']
print(f"\nLabel distribution:")
print(label_column.value_counts())
print(f"\nPositive rate: {label_column.mean():.1%}")

# Last expression of the cell -> rendered as a rich table preview.
df.head()

In [None]:
# Every column is a model feature except the ones listed here.
exclude_cols = ['ticker', 'move_date', 'actual_gain', 'label', 'price']

# Keep the DataFrame's own column order for the remaining columns.
feature_cols = list(filter(lambda column: column not in exclude_cols, df.columns))

print(f"Features: {len(feature_cols)}")
print(feature_cols)

In [None]:
# Compare per-feature means between big-move rows (label == 1) and
# normal rows (label == 0).
pos = df[df['label'] == 1]
neg = df[df['label'] == 0]

comparison = pd.DataFrame({
    'Big Move Mean': pos[feature_cols].mean(),
    'Normal Mean': neg[feature_cols].mean(),
})
comparison['Difference'] = comparison['Big Move Mean'] - comparison['Normal Mean']

# Relative difference in percent. A zero baseline would produce +/-inf, which
# would then dominate the ranking below, so treat it as undefined (NaN).
baseline = comparison['Normal Mean'].abs().replace(0, np.nan)
comparison['Diff %'] = (comparison['Difference'] / baseline) * 100

# Rank by absolute percentage difference; NaN rows are placed last by
# `sort_values` (na_position defaults to 'last').
comparison = comparison.sort_values('Diff %', key=abs, ascending=False)

print("üéØ Top Features by Difference:")
comparison.head(20)

In [None]:
# Overlay the class-conditional histograms of the 10 top-ranked features.
top_features = comparison.head(10).index.tolist()

fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

for ax, feature in zip(axes, top_features):
    # Missing values are only filled later (when X is built), so drop them
    # here rather than handing NaNs to the histogram binning.
    ax.hist(neg[feature].dropna(), bins=30, alpha=0.5, label='Normal', color='blue')
    ax.hist(pos[feature].dropna(), bins=30, alpha=0.5, label='Big Move', color='red')
    ax.set_title(feature)
    ax.legend()

# With fewer than 10 features the remaining panels would be empty frames.
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
# y > 1 places the figure title above the panels laid out by tight_layout.
fig.suptitle('Feature Distributions: Big Move vs Normal', y=1.02)
plt.show()

## Step 3: Prepare Data for Training

In [None]:
# Build the design matrix (missing values replaced by 0) and the target vector.
X = df[feature_cols].fillna(0)
y = df['label']

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is random across rows; the data has a `move_date`
# column, so a chronological split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features: statistics are learned on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f"Training set: {n_train_rows} samples")
print(f"Test set: {n_test_rows} samples")
print(f"Positive rate in train: {y_train.mean():.1%}")
print(f"Positive rate in test: {y_test.mean():.1%}")

## Step 4: Train Models

We'll try multiple models and compare performance.

In [None]:
# Model 1: XGBoost
print("Training XGBoost...")

# Handle class imbalance: weight positives by the negative/positive ratio.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train contains no positive samples; cannot compute scale_pos_weight")
scale_pos_weight = n_negative / n_positive

# `tree_method='gpu_hist'` and `use_label_encoder` are deprecated in
# XGBoost >= 2.0; the supported way to request GPU training is
# `tree_method='hist'` together with `device='cuda'`.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',  # Use GPU!
)

try:
    xgb_model.fit(X_train_scaled, y_train)
except xgb.core.XGBoostError:
    # No usable GPU (e.g. a CPU-only runtime): train the same model on CPU
    # instead of aborting the notebook.
    print("GPU training failed; retrying XGBoost on CPU...")
    xgb_model.set_params(device='cpu')
    xgb_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set (hard labels and positive-class probability).
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]

print("\nüìä XGBoost Results:")
print(classification_report(y_test, y_pred_xgb, target_names=['Normal', 'Big Move']))
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob_xgb):.3f}")

In [None]:
# Model 2: LightGBM
print("Training LightGBM...")

lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero; with the default of 0, `subsample` is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    is_unbalance=True,  # Handle imbalance
    random_state=42,
    device='gpu'  # Use GPU!
)

try:
    lgb_model.fit(X_train_scaled, y_train)
except lgb.basic.LightGBMError:
    # No usable GPU / GPU-enabled build: train the same model on CPU
    # instead of aborting the notebook.
    print("GPU training failed; retrying LightGBM on CPU...")
    lgb_model.set_params(device='cpu')
    lgb_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set (hard labels and positive-class probability).
y_pred_lgb = lgb_model.predict(X_test_scaled)
y_prob_lgb = lgb_model.predict_proba(X_test_scaled)[:, 1]

print("\nüìä LightGBM Results:")
print(classification_report(y_test, y_pred_lgb, target_names=['Normal', 'Big Move']))
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob_lgb):.3f}")

In [None]:
# Model 3: soft-voting ensemble — the unweighted mean of the two models'
# positive-class probabilities.
print("Creating Ensemble...")

y_prob_ensemble = 0.5 * (y_prob_xgb + y_prob_lgb)

# A row is flagged as a big move only when the averaged probability is
# strictly above 0.5.
y_pred_ensemble = np.where(y_prob_ensemble > 0.5, 1, 0)

ensemble_auc = roc_auc_score(y_test, y_prob_ensemble)
ensemble_report = classification_report(
    y_test, y_pred_ensemble, target_names=['Normal', 'Big Move']
)

print("\nüìä Ensemble Results:")
print(ensemble_report)
print(f"ROC-AUC: {ensemble_auc:.3f}")

## Step 5: Analyze Feature Importance

In [None]:
# Rank features by the importance scores exposed by the fitted XGBoost model.
importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': xgb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features, most important on top.
top_20 = importance.head(20)
fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(top_20['feature'], top_20['importance'])
ax.set_xlabel('Importance')
ax.set_title('üéØ Top 20 Features for Predicting Big Moves')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Plain-text listing of the 15 highest-ranked features.
print("\nüéØ TOP PREDICTIVE FEATURES:")
for entry in importance.head(15).itertuples(index=False):
    print(f"   {entry.feature}: {entry.importance:.4f}")

In [None]:
# Explain the XGBoost model with SHAP on (at most) the first 500 test rows.
print("Calculating SHAP values (this may take a minute)...")

# The model was trained on scaled inputs, so SHAP values are computed on the
# scaled rows; the matching unscaled rows are passed to the plot as the
# displayed feature values.
shap_rows_scaled = X_test_scaled[:500]  # Sample for speed
shap_rows_raw = X_test.iloc[:500]

explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(shap_rows_scaled)

plt.figure(figsize=(12, 10))
# show=False keeps the figure open so tight_layout can run before rendering.
shap.summary_plot(shap_values, shap_rows_raw, feature_names=feature_cols, show=False)
plt.tight_layout()
plt.show()

## Step 6: ROC Curve and Threshold Analysis

In [None]:
# Plot one ROC curve per model on a shared axis, with AUC in the legend.
model_scores = {
    'XGBoost': y_prob_xgb,
    'LightGBM': y_prob_lgb,
    'Ensemble': y_prob_ensemble,
}

plt.figure(figsize=(10, 8))

for model_name, scores in model_scores.items():
    # Only the FPR/TPR arrays are needed; the thresholds are discarded.
    false_pos_rate, true_pos_rate = roc_curve(y_test, scores)[:2]
    auc_value = roc_auc_score(y_test, scores)
    plt.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC={auc_value:.3f})')

# Dashed diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Big Move Prediction')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sweep decision thresholds over the XGBoost test-set probabilities and report
# precision / recall / F1 for each.
# NOTE(review): the stated goal is HIGH PRECISION for trading (few false
# positives), but the recommended threshold below is the one with the best F1.
from sklearn.metrics import precision_score, recall_score, f1_score

print("\nüéØ THRESHOLD ANALYSIS (XGBoost):")
print(f"{'Threshold':<12} {'Precision':<12} {'Recall':<12} {'F1':<12}")
print("-" * 50)

# Defaults used when no candidate threshold produces a positive prediction.
best_threshold, best_f1 = 0.5, 0

for threshold in (0.3, 0.4, 0.5, 0.6, 0.7, 0.8):
    y_pred_thresh = (y_prob_xgb >= threshold).astype(int)

    # Only thresholds that flag at least one row are scored and reported.
    if y_pred_thresh.any():
        precision = precision_score(y_test, y_pred_thresh)
        recall = recall_score(y_test, y_pred_thresh)
        f1 = f1_score(y_test, y_pred_thresh)

        print(f"{threshold:<12.1f} {precision:<12.3f} {recall:<12.3f} {f1:<12.3f}")

        # Strict comparison: on ties the lowest threshold is kept.
        if f1 > best_f1:
            best_threshold, best_f1 = threshold, f1

print(f"\n‚úÖ Recommended threshold: {best_threshold}")

## Step 7: Save Model

In [None]:
# Persist the trained artifacts to disk, then hand them to the browser.
import json
import pickle

# Pickled objects, in write order: both models and the fitted scaler.
pickled_artifacts = {
    'runner_model_xgb.pkl': xgb_model,
    'runner_model_lgb.pkl': lgb_model,
    'runner_scaler.pkl': scaler,
}
for artifact_path, artifact in pickled_artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# The feature list is stored as JSON.
with open('runner_features.json', 'w') as f:
    json.dump(feature_cols, f)

# Feature-importance table as CSV (without the index column).
importance.to_csv('feature_importance.csv', index=False)

print("‚úÖ Models saved!")

# Trigger a browser download for every file written above, in the same order.
for artifact_path in (*pickled_artifacts, 'runner_features.json', 'feature_importance.csv'):
    files.download(artifact_path)

## Step 8: Trading Strategy Backtest

In [None]:
# Simulate trading with the model: a "trade" is taken on every test row whose
# XGBoost probability reaches the recommended threshold.
print("üéÆ BACKTEST SIMULATION")
print("="*50)

# NOTE: `best_threshold` was selected on this same test set, so the numbers
# below are optimistic rather than a true out-of-sample estimate.
threshold = best_threshold

# Get predictions and the realised outcomes for the same rows.
predictions = (y_prob_xgb >= threshold).astype(int)
actual_moves = y_test.values
# assumes `actual_gain` is expressed in percent (it is printed with a % sign
# and combined with a percent stop loss) — TODO confirm
actual_gains = df.loc[y_test.index, 'actual_gain'].values

# Simulate trades
took_trade = predictions == 1
is_win = took_trade & (actual_moves == 1)
is_loss = took_trade & (actual_moves == 0)

trades_taken = int(took_trade.sum())
winning_trades = int(is_win.sum())
losing_trades = int(is_loss.sum())

win_rate = winning_trades / trades_taken if trades_taken > 0 else 0

# Calculate returns (simplified)
avg_win = actual_gains[is_win].mean() if winning_trades > 0 else 0
avg_loss = -3  # Assume 3% stop loss on losing trades

if trades_taken > 0:
    expected_return = (win_rate * avg_win) + ((1 - win_rate) * avg_loss)
else:
    # Without any trades there is nothing to lose: do not charge the assumed
    # stop loss against a strategy that never entered a position.
    expected_return = 0.0

print(f"Threshold: {threshold}")
print(f"Trades taken: {trades_taken}")
print(f"Winning trades: {winning_trades}")
print(f"Losing trades: {losing_trades}")
print(f"Win rate: {win_rate:.1%}")
print(f"\nAverage win: +{avg_win:.1f}%")
print(f"Average loss: {avg_loss}% (assumed stop)")
print(f"\nExpected return per trade: {expected_return:.1f}%")

if trades_taken == 0:
    print(f"\nNo trades taken at this threshold - edge cannot be evaluated")
elif expected_return > 0:
    print(f"\nüü¢ POSITIVE EDGE - This strategy has potential!")
else:
    print(f"\nüî¥ NEGATIVE EDGE - Need to adjust strategy")

## 🐺 Findings Summary

After training, look at:

1. **Top Features** - What predicts big moves?
2. **Precision** - How often are predictions correct?
3. **Expected Return** - Is there a trading edge?

---

**Next Steps:**
1. Download the saved models
2. Copy them to `models/` folder in your repo
3. Run `python src/ml/runner_predictor.py scan` to get live predictions

**AWOOOO** 🐺