# Pure ML Model v1.0 - Full Dataset Training

Train a pure-learning XGBoost classifier on roughly 20 years of hourly Gold (XAUUSD) data.

**Key differences from v0.1:**
- Full dataset: 2004-2025 (~125k bars)
- Time-based train/test split (no data leakage)
- Pure learning: model discovers patterns, no human bias
- Target: the high of the next 24 hourly bars reaches at least +0.5% above the current close

In [None]:
# Setup - Clone repo and install dependencies
# The `!` (shell) and `%` (magic) lines are IPython syntax, so this cell only
# runs inside a Jupyter/IPython kernel, not as a plain Python script.
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn -q

import os
# Reuse an existing checkout in the current directory (refreshing it with
# `git pull`); otherwise clone it. Either way the working directory ends up
# inside the repository so later relative paths resolve against it.
# NOTE(review): the existence check is relative to the current directory, so
# re-running this cell after it has already changed into the checkout takes
# the clone branch again - confirm whether that is intended.
if os.path.exists('gold-ml-trading'):
    %cd gold-ml-trading
    !git pull
else:
    !git clone https://github.com/altommo/gold-ml-trading.git
    %cd gold-ml-trading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

In [None]:
# ============================================================
# INDICATOR FUNCTIONS (inlined to avoid import issues)
# ============================================================

def calculate_wavetrend(df, n1=10, n2=21):
    """Return a copy of ``df`` with the WaveTrend columns ``wt1`` and ``wt2``.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        n1: EMA span used for the typical-price average and its mean
            absolute deviation.
        n2: EMA span used to smooth the channel index into ``wt1``.

    Returns:
        A new DataFrame; the input is not modified. ``wt2`` is the 4-bar
        simple moving average of ``wt1``.
    """
    out = df.copy()

    typical_price = (out['high'] + out['low'] + out['close']) / 3
    smoothed_price = typical_price.ewm(span=n1, adjust=False).mean()
    deviation = typical_price - smoothed_price
    mean_abs_deviation = deviation.abs().ewm(span=n1, adjust=False).mean()

    # Deviation from the smoothed price, normalised by its typical size.
    channel_index = deviation / (0.015 * mean_abs_deviation)

    wt1 = channel_index.ewm(span=n2, adjust=False).mean()
    out['wt1'] = wt1
    out['wt2'] = wt1.rolling(4).mean()
    return out

def calculate_wolfpack(df):
    """Return a copy of ``df`` with a ``wolfpack`` column (EMA3 - EMA8 of close).

    Args:
        df: Frame with a ``close`` column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    close = out['close']
    fast_ema = close.ewm(span=3, adjust=False).mean()
    slow_ema = close.ewm(span=8, adjust=False).mean()
    out['wolfpack'] = fast_ema - slow_ema
    return out

def calculate_rsi(df, period=14):
    """Return a copy of ``df`` with an ``rsi`` column.

    Average gain and average loss are simple rolling means of the positive
    and negative close-to-close changes over ``period`` bars.

    Args:
        df: Frame with a ``close`` column.
        period: Rolling window length in bars.

    Returns:
        A new DataFrame; the input is not modified. The first ``period``
        rows of ``rsi`` are NaN, as is any row whose window has neither
        gains nor losses (0/0).
    """
    out = df.copy()
    change = out['close'].diff()

    avg_gain = change.clip(lower=0).rolling(period).mean()
    avg_loss = (-change.clip(upper=0)).rolling(period).mean()

    relative_strength = avg_gain / avg_loss
    out['rsi'] = 100 - (100 / (1 + relative_strength))
    return out

def calculate_atr(df, period=14):
    """Return a copy of ``df`` with ``atr`` and ``atr_pct`` columns.

    The average is taken over the *true range* of each bar, i.e. the largest
    of ``high - low``, ``|high - previous close|`` and
    ``|low - previous close|``. Using only ``high - low`` understates
    volatility whenever price gaps between one bar's close and the next
    bar's range.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        period: Rolling window length in bars (simple moving average).

    Returns:
        A new DataFrame; the input is not modified. ``atr_pct`` is ``atr``
        expressed as a percentage of the bar's close.
    """
    out = df.copy()
    prev_close = out['close'].shift(1)

    # Row-wise max skips NaN, so the first bar (no previous close) falls
    # back to its plain high-low range.
    true_range = pd.concat(
        [
            out['high'] - out['low'],
            (out['high'] - prev_close).abs(),
            (out['low'] - prev_close).abs(),
        ],
        axis=1,
    ).max(axis=1)

    out['atr'] = true_range.rolling(period).mean()
    out['atr_pct'] = out['atr'] / out['close'] * 100
    return out

def calculate_moving_averages(df):
    """Return a copy of ``df`` with 20/50/200-bar SMAs and price-vs-SMA gaps.

    Adds ``ma20``, ``ma50``, ``ma200`` (simple moving averages of ``close``)
    followed by ``price_vs_ma20``, ``price_vs_ma50``, ``price_vs_ma200``
    (percentage distance of ``close`` from each average).

    Args:
        df: Frame with a ``close`` column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    close = out['close']
    windows = (20, 50, 200)

    # Two passes keep the column order: all averages first, then all gaps.
    for window in windows:
        out[f'ma{window}'] = close.rolling(window).mean()
    for window in windows:
        average = out[f'ma{window}']
        out[f'price_vs_ma{window}'] = (close - average) / average * 100
    return out

def calculate_returns(df):
    """Return a copy of ``df`` with 1-, 4- and 24-bar percentage returns.

    Adds ``ret_1h``, ``ret_4h`` and ``ret_24h``, each the percentage change
    of ``close`` over that many rows.

    Args:
        df: Frame with a ``close`` column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    close = out['close']
    out['ret_1h'] = close.pct_change() * 100
    for column, bars in (('ret_4h', 4), ('ret_24h', 24)):
        out[column] = close.pct_change(bars) * 100
    return out

def calculate_bollinger_bands(df, period=20, std_dev=2):
    """Return a copy of ``df`` with Bollinger Band columns.

    Adds ``bb_mid`` (rolling mean of close), ``bb_std`` (rolling standard
    deviation), ``bb_upper`` / ``bb_lower`` (mid +/- ``std_dev`` deviations),
    ``bb_pct`` (position of close between the bands, 0 = lower, 1 = upper)
    and ``bb_width`` (band width as a percentage of the mid line).

    Args:
        df: Frame with a ``close`` column.
        period: Rolling window length in bars.
        std_dev: Number of standard deviations between mid line and a band.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    window = out['close'].rolling(period)

    out['bb_mid'] = window.mean()
    out['bb_std'] = window.std()

    offset = out['bb_std'] * std_dev
    out['bb_upper'] = out['bb_mid'] + offset
    out['bb_lower'] = out['bb_mid'] - offset

    band_span = out['bb_upper'] - out['bb_lower']
    out['bb_pct'] = (out['close'] - out['bb_lower']) / band_span
    out['bb_width'] = band_span / out['bb_mid'] * 100
    return out

def calculate_momentum(df):
    """Return a copy of ``df`` with rate-of-change, MACD and stochastic RSI.

    Adds, in order: ``roc_5``, ``roc_10`` (percentage change of close over 5
    and 10 bars), ``macd`` (EMA12 - EMA26 of close), ``macd_signal`` (EMA9 of
    ``macd``), ``macd_hist`` (their difference) and ``stoch_rsi`` (position
    of RSI inside its own 14-bar min/max range, scaled to 0-100).

    Args:
        df: Frame with a ``close`` column. An existing ``rsi`` column is
            reused; otherwise RSI is computed with ``calculate_rsi``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    close = out['close']

    # Rate of change
    for lag in (5, 10):
        out[f'roc_{lag}'] = (close / close.shift(lag) - 1) * 100

    # MACD
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    out['macd'] = fast_ema - slow_ema
    out['macd_signal'] = out['macd'].ewm(span=9, adjust=False).mean()
    out['macd_hist'] = out['macd'] - out['macd_signal']

    # Stochastic RSI
    if 'rsi' in out.columns:
        rsi = out['rsi']
    else:
        rsi = calculate_rsi(out)['rsi']
    rsi_window = rsi.rolling(14)
    lowest = rsi_window.min()
    highest = rsi_window.max()
    out['stoch_rsi'] = (rsi - lowest) / (highest - lowest) * 100
    return out

def calculate_time_features(df):
    """Return a copy of ``df`` with hour, weekday and session-flag columns.

    When the index is a ``DatetimeIndex`` this adds ``hour``,
    ``day_of_week`` and three 0/1 flags based on the index hour (bounds
    inclusive): ``is_london`` (8-16), ``is_ny`` (13-21) and ``is_overlap``
    (13-16). With any other index type the copy is returned unchanged.

    Args:
        df: Input frame.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    if not isinstance(out.index, pd.DatetimeIndex):
        return out

    out['hour'] = out.index.hour
    out['day_of_week'] = out.index.dayofweek

    # (first hour, last hour) of each session, both inclusive.
    session_hours = {
        'is_london': (8, 16),
        'is_ny': (13, 21),
        'is_overlap': (13, 16),
    }
    for flag, (first_hour, last_hour) in session_hours.items():
        out[flag] = out['hour'].between(first_hour, last_hour).astype(int)
    return out

def calculate_trend(df):
    """Return a copy of ``df`` with moving-average trend direction columns.

    Adds ``trend_20_50`` (+1 when ``ma20`` > ``ma50``, else -1),
    ``trend_50_200`` (+1 when ``ma50`` > ``ma200``, else -1) and
    ``trend_score`` (their sum, so one of -2, 0, +2). A comparison involving
    NaN is False, so rows where an average is still NaN get -1.

    Args:
        df: Frame with ``ma20``, ``ma50`` and ``ma200`` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()

    def direction(fast, slow):
        # +1 when the faster average is above the slower one, otherwise -1.
        return np.where(out[fast] > out[slow], 1, -1)

    out['trend_20_50'] = direction('ma20', 'ma50')
    out['trend_50_200'] = direction('ma50', 'ma200')
    out['trend_score'] = out['trend_20_50'] + out['trend_50_200']
    return out

def calculate_volatility(df):
    """Return a copy of ``df`` with rolling volatility columns.

    Adds ``volatility_24h`` and ``volatility_week`` (rolling standard
    deviation of ``ret_1h`` over 24 and 168 rows) and ``vol_ratio`` (the
    short window divided by the long one).

    Args:
        df: Frame with a ``ret_1h`` column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()
    hourly_returns = out['ret_1h']

    short_vol = hourly_returns.rolling(24).std()
    long_vol = hourly_returns.rolling(168).std()

    out['volatility_24h'] = short_vol
    out['volatility_week'] = long_vol
    out['vol_ratio'] = short_vol / long_vol
    return out

def add_all_indicators(df):
    """Run every indicator function over ``df`` and return the enriched copy.

    The order matters: RSI runs before momentum (which reuses the ``rsi``
    column), moving averages before trend (which reads ``ma20``/``ma50``/
    ``ma200``) and returns before volatility (which reads ``ret_1h``).

    Args:
        df: Frame with the columns the individual indicator functions read
            (``high``, ``low``, ``close``).

    Returns:
        A new DataFrame; each step works on a copy, so the input is not
        modified.
    """
    pipeline = (
        calculate_wavetrend,
        calculate_wolfpack,
        calculate_rsi,
        calculate_atr,
        calculate_moving_averages,
        calculate_returns,
        calculate_bollinger_bands,
        calculate_momentum,
        calculate_time_features,
        calculate_trend,
        calculate_volatility,
    )
    enriched = df
    for step in pipeline:
        enriched = step(enriched)
    return enriched

# Confirmation message so the notebook output shows this cell finished running.
print("Indicator functions defined!")

In [None]:
# ============================================================
# LOAD DATA
# ============================================================

# Read the hourly XAUUSD bars and index them by their parsed `datetime` column.
df = pd.read_csv('data/XAUUSD_KAGGLE_1h.csv', parse_dates=['datetime'], index_col='datetime')

# Everything downstream (rolling indicators, the shift-based look-ahead target
# and the chronological train/test split) silently assumes ascending time
# order, so enforce it here rather than trusting the file.
if not df.index.is_monotonic_increasing:
    df = df.sort_index()

print(f"Loaded {len(df):,} bars")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"\nPrice range: ${df['close'].min():.2f} - ${df['close'].max():.2f}")
df.head()

In [None]:
# ============================================================
# CALCULATE ALL INDICATORS
# ============================================================

# Replace the raw frame with its indicator-enriched copy, then list every
# column that is not one of the raw OHLCV fields.
print("Calculating indicators...")
df = add_all_indicators(df)

raw_columns = ('open', 'high', 'low', 'close', 'volume')
indicator_columns = [name for name in df.columns if name not in raw_columns]

print(f"Total columns: {len(df.columns)}")
print("\nIndicator columns:")
print(indicator_columns)

In [None]:
# ============================================================
# CREATE TARGET VARIABLE
# ============================================================

LOOKAHEAD_HOURS = 24  # look-ahead window length, counted in rows (bars)
TARGET_PCT = 0.5  # 0.5% move = profitable trade

# Future max price in next 24 hours
# rolling(N).max() at row t covers rows t-N+1..t; shifting it back by N rows
# leaves row t holding the highest `high` of rows t+1..t+N (the current bar is
# excluded). The final N rows have no shifted value and are NaN.
df['future_max'] = df['high'].rolling(LOOKAHEAD_HOURS).max().shift(-LOOKAHEAD_HOURS)
# Best-case percentage gain from the current close to that future high.
df['future_return'] = (df['future_max'] - df['close']) / df['close'] * 100

# Target: Did price reach +0.5% in next 24 hours?
# NOTE(review): `NaN >= TARGET_PCT` is False, so the final LOOKAHEAD_HOURS rows
# are labelled 0 instead of NaN. A dropna on `target` alone will not remove
# them; rows with NaN `future_return` must be excluded separately.
df['target'] = (df['future_return'] >= TARGET_PCT).astype(int)

print(f"Target distribution:")
print(df['target'].value_counts(normalize=True))
print(f"\nTarget 1 = price went up {TARGET_PCT}% within {LOOKAHEAD_HOURS} hours")

In [None]:
# ============================================================
# DEFINE FEATURES
# ============================================================

# All available features for pure learning
# Each name must be a column produced by add_all_indicators; the list order is
# the column order of the feature matrix (and of the feature-importance table).
# Not every computed column is used: e.g. `is_london` and `is_ny` are created
# by calculate_time_features but only `is_overlap` is listed here.
FEATURES = [
    # Core momentum
    'wt1', 'wt2', 'wolfpack', 'rsi', 'stoch_rsi',
    
    # Price position
    'price_vs_ma20', 'price_vs_ma50', 'price_vs_ma200',
    'bb_pct', 'bb_width',
    
    # Momentum
    'roc_5', 'roc_10', 'macd_hist',
    
    # Volatility
    'atr_pct', 'volatility_24h', 'vol_ratio',
    
    # Trend
    'trend_score',
    
    # Returns
    'ret_1h', 'ret_4h', 'ret_24h',
    
    # Time
    'hour', 'day_of_week', 'is_overlap'
]

print(f"Using {len(FEATURES)} features:")
print(FEATURES)

In [None]:
# ============================================================
# PREPARE DATA WITH TIME-BASED SPLIT
# ============================================================

# Drop rows with NaN in any feature or in the label.
# `target` is an int column built from a comparison, so it is 0 (never NaN)
# for the final LOOKAHEAD_HOURS bars whose look-ahead window is incomplete.
# Requiring `future_return` as well removes those rows instead of evaluating
# the model on fabricated negatives.
df_clean = df.dropna(subset=FEATURES + ['target', 'future_return'])
print(f"Clean rows: {len(df_clean):,} (dropped {len(df) - len(df_clean):,} rows with NaN)")

# Time-based split: Train on older data, test on newer
# Use 80% for training, 20% for testing (chronological)
split_idx = int(len(df_clean) * 0.8)
split_date = df_clean.index[split_idx]

# Purge the last LOOKAHEAD_HOURS bars before the split from the training set:
# their labels are computed from highs that fall inside the test period, which
# would leak test-period prices into training.
train_end_idx = max(split_idx - LOOKAHEAD_HOURS, 0)

train_df = df_clean.iloc[:train_end_idx]
test_df = df_clean.iloc[split_idx:]

print(f"\nTrain period: {train_df.index.min()} to {train_df.index.max()} ({len(train_df):,} bars)")
print(f"Test period:  {test_df.index.min()} to {test_df.index.max()} ({len(test_df):,} bars)")

X_train = train_df[FEATURES]
y_train = train_df['target']
X_test = test_df[FEATURES]
y_test = test_df['target']

print(f"\nTrain target distribution: {y_train.mean():.1%} positive")
print(f"Test target distribution: {y_test.mean():.1%} positive")

In [None]:
# ============================================================
# SCALE FEATURES
# ============================================================

# Learn the per-feature mean and variance from the training rows only, then
# apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled with StandardScaler")

In [None]:
# ============================================================
# TRAIN PURE MODEL
# ============================================================

print("Training XGBoost model on full dataset...")
print("This is a PURE learning model - no human bias, just learns from data.")
print()

# Early stopping needs a monitoring set. Monitoring the test set would tune the
# number of trees on the very data used for the final evaluation, making the
# reported test metrics optimistic. Hold out the most recent slice of the
# TRAINING period instead (taken from the end to respect time ordering).
VALIDATION_FRACTION = 0.1
val_size = max(int(len(X_train_scaled) * VALIDATION_FRACTION), 1)

X_fit, X_val = X_train_scaled[:-val_size], X_train_scaled[-val_size:]
y_fit, y_val = y_train.iloc[:-val_size], y_train.iloc[-val_size:]

print(f"Fitting on {len(X_fit):,} bars; early stopping monitored on the last {len(X_val):,} training bars")
print()

model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    eval_metric='logloss',
    # Stop adding trees once validation logloss has not improved for 20 rounds.
    early_stopping_rounds=20
)

model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print(f"Model trained! Best iteration: {model.best_iteration}")

In [None]:
# ============================================================
# EVALUATE MODEL
# ============================================================

# Hard class predictions feed the classification report; the positive-class
# probabilities feed ROC-AUC (and the later threshold backtests).
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

separator = "=" * 50
test_start = test_df.index.min().date()
test_end = test_df.index.max().date()
auc = roc_auc_score(y_test, y_pred_proba)

print(separator)
print("MODEL EVALUATION ON TEST SET")
print(f"Test period: {test_start} to {test_end}")
print(separator)
print()
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {auc:.4f}")

In [None]:
# ============================================================
# FEATURE IMPORTANCE
# ============================================================

# Pair each feature name with the model's importance score, highest first.
importance_df = pd.DataFrame(
    {'feature': FEATURES, 'importance': model.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart; the y axis is inverted so the top feature is on top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Pure Learning Model')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 10 Features:")
print(importance_df.head(10).to_string(index=False))

In [None]:
# ============================================================
# BACKTEST WITH THRESHOLD OPTIMIZATION
# ============================================================

def backtest_signals(df, predictions, threshold=0.5, tp_pct=0.5, sl_pct=0.3, max_hold_bars=24):
    """Simulate one long trade per signal bar and return the trade log.

    A trade is opened at the close of every bar whose prediction is at least
    ``threshold`` (NaN predictions never open a trade). Scanning the following
    ``max_hold_bars`` bars in order, it is closed at the first of:

    * take-profit - a bar's high reaches ``entry * (1 + tp_pct / 100)``
    * stop-loss   - a bar's low reaches ``entry * (1 - sl_pct / 100)``
    * time exit   - the close of the last bar in the window

    If both levels are touched inside the same bar the take-profit is assumed
    to have been hit first (an optimistic assumption, since intrabar order is
    unknown). Trades are independent: overlapping positions are allowed and
    no costs are applied. A signal with no following bars is recorded with a
    zero-return exit at the entry price.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns, one row per
            bar in chronological order.
        predictions: One score per row of ``df``.
        threshold: Minimum score required to open a trade.
        tp_pct: Take-profit distance above entry, in percent.
        sl_pct: Stop-loss distance below entry, in percent.
        max_hold_bars: Maximum number of bars a trade is held.

    Returns:
        DataFrame with one row per trade and the columns ``entry_time``,
        ``entry_price``, ``exit_price``, ``pnl_pct``, ``hit_tp``, ``hit_sl``
        and ``confidence``. The columns are present even when there are no
        trades.
    """
    columns = [
        'entry_time', 'entry_price', 'exit_price', 'pnl_pct',
        'hit_tp', 'hit_sl', 'confidence',
    ]

    df = df.copy()
    df['pred'] = predictions  # same alignment/length checks as a column assignment

    # Plain arrays make the per-signal window scan cheap compared with iterrows.
    highs = df['high'].to_numpy(dtype=float)
    lows = df['low'].to_numpy(dtype=float)
    closes = df['close'].to_numpy(dtype=float)
    scores = df['pred'].to_numpy(dtype=float)
    entry_times = df.index

    trades = []
    for i in map(int, np.flatnonzero(scores >= threshold)):
        entry_price = closes[i]
        tp_price = entry_price * (1 + tp_pct / 100)
        sl_price = entry_price * (1 - sl_pct / 100)

        # Bars strictly after the entry bar, truncated at the end of the data.
        window = slice(i + 1, i + 1 + max_hold_bars)
        tp_hits = highs[window] >= tp_price
        sl_hits = lows[window] <= sl_price
        exits = tp_hits | sl_hits

        hit_tp = False
        hit_sl = False
        exit_price = entry_price

        if exits.any():
            first_exit = int(exits.argmax())
            if tp_hits[first_exit]:  # take-profit wins a same-bar tie
                hit_tp = True
                exit_price = tp_price
            else:
                hit_sl = True
                exit_price = sl_price
        elif exits.size:
            # Time exit at the close of the last bar in the window.
            exit_price = closes[i + exits.size]

        pnl_pct = (exit_price - entry_price) / entry_price * 100

        trades.append({
            'entry_time': entry_times[i],
            'entry_price': entry_price,
            'exit_price': exit_price,
            'pnl_pct': pnl_pct,
            'hit_tp': hit_tp,
            'hit_sl': hit_sl,
            'confidence': scores[i],
        })

    return pd.DataFrame(trades, columns=columns)

print("Testing different thresholds...")
print()

# Working copy of the test frame carrying the model's probabilities; the
# per-year analysis reuses it.
test_df_bt = test_df.copy()
test_df_bt['pred'] = y_pred_proba

# Backtest each candidate threshold and collect summary statistics.
# NOTE(review): the Sharpe figure scales per-trade returns by sqrt(252), i.e.
# it treats every trade as one daily return - confirm that is intended.
candidate_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results = []
for threshold in candidate_thresholds:
    trades = backtest_signals(test_df_bt, y_pred_proba, threshold=threshold)
    n_trades = len(trades)
    if n_trades == 0:
        # Thresholds that never trigger are left out of the results table.
        continue

    pnl = trades['pnl_pct']
    pnl_std = pnl.std()

    win_rate = (pnl > 0).mean() * 100
    total_return = pnl.sum()
    avg_return = pnl.mean()
    if pnl_std > 0:
        sharpe = avg_return / pnl_std * np.sqrt(252)
    else:
        sharpe = 0

    results.append({
        'threshold': threshold,
        'trades': n_trades,
        'win_rate': win_rate,
        'avg_return': avg_return,
        'total_return': total_return,
        'sharpe': sharpe,
    })

    print(f"Threshold {threshold}: {n_trades:,} trades, {win_rate:.1f}% win, {total_return:.1f}% total, Sharpe {sharpe:.2f}")

results_df = pd.DataFrame(results)
print()
print(results_df.to_string(index=False))

In [None]:
# ============================================================
# DETAILED ANALYSIS BY YEAR
# ============================================================

test_df_bt['year'] = test_df_bt.index.year

rule = "=" * 60
print("\n" + rule)
print("PERFORMANCE BY YEAR (threshold=0.5)")
print(rule)

# Backtest each calendar year of the test period on its own; groupby yields
# the years in ascending order. Years without any trade print nothing.
for year, year_df in test_df_bt.groupby('year'):
    trades = backtest_signals(year_df, year_df['pred'].values, threshold=0.5)
    if len(trades) == 0:
        continue

    pnl = trades['pnl_pct']
    pnl_std = pnl.std()

    win_rate = (pnl > 0).mean() * 100
    total_return = pnl.sum()
    if pnl_std > 0:
        sharpe = pnl.mean() / pnl_std * np.sqrt(252)
    else:
        sharpe = 0
    print(f"{year}: {len(trades):4d} trades, {win_rate:5.1f}% win, {total_return:8.1f}% return, Sharpe {sharpe:5.2f}")

In [None]:
# ============================================================
# SAVE MODELS
# ============================================================

import os
import json

MODEL_DIR = 'models/v1.0'
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted model together with the scaler it was trained behind;
# both are needed to score new data.
joblib.dump(model, f'{MODEL_DIR}/pure_model.pkl')
joblib.dump(scaler, f'{MODEL_DIR}/scaler.pkl')

# Save config
# The threshold sweep skips thresholds that produced no trades, so the table
# can be empty; fail with a clear message instead of a KeyError on 'sharpe'.
if results_df.empty:
    raise ValueError(
        "No threshold produced any trades; cannot select a best configuration to save."
    )

best_result = results_df.loc[results_df['sharpe'].idxmax()]

# Read the hyper-parameters back from the estimator so the saved config cannot
# drift from (or omit parts of) what was actually trained.
fitted_params = model.get_params()
saved_param_names = (
    'n_estimators', 'max_depth', 'learning_rate', 'subsample',
    'colsample_bytree', 'min_child_weight', 'gamma', 'random_state',
)
model_params = {name: fitted_params[name] for name in saved_param_names}

config = {
    'version': '1.0',
    'date': str(pd.Timestamp.now().date()),
    'training_period': f"{train_df.index.min().date()} to {train_df.index.max().date()}",
    'test_period': f"{test_df.index.min().date()} to {test_df.index.max().date()}",
    'train_bars': len(train_df),
    'test_bars': len(test_df),
    'features': FEATURES,
    'lookahead_hours': LOOKAHEAD_HOURS,
    'target_pct': TARGET_PCT,
    'model_params': model_params,
    'best_threshold': float(best_result['threshold']),
    'performance': {
        'trades': int(best_result['trades']),
        'win_rate': float(best_result['win_rate']),
        'total_return': float(best_result['total_return']),
        'sharpe': float(best_result['sharpe'])
    },
    'all_thresholds': results_df.to_dict('records')
}

with open(f'{MODEL_DIR}/config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("Models saved to models/v1.0/")
print(f"\nBest configuration:")
print(f"  Threshold: {best_result['threshold']}")
print(f"  Trades: {int(best_result['trades']):,}")
print(f"  Win Rate: {best_result['win_rate']:.1f}%")
print(f"  Total Return: {best_result['total_return']:.1f}%")
print(f"  Sharpe: {best_result['sharpe']:.2f}")
In [None]:
# ============================================================
# SUMMARY
# ============================================================

# Closing report: training span, target definition, chosen threshold and the
# five highest-ranked features.
rule = "=" * 60
train_start = train_df.index.min().date()
train_end = train_df.index.max().date()

print("\n" + rule)
print("PURE MODEL v1.0 - SUMMARY")
print(rule)

summary_text = f"""\nThis model was trained on {len(train_df):,} hours of Gold data from {train_start} to {train_end}.

It learns purely from the data without any human-imposed trading rules.
The target is simple: predict if Gold will rise {TARGET_PCT}% within the next {LOOKAHEAD_HOURS} hours.

Key findings:
- Best Sharpe ratio achieved at threshold {best_result['threshold']}
- Model uses {len(FEATURES)} features derived from price action

Top predictive features:"""
print(summary_text)

print(importance_df.head(5).to_string(index=False))

print("\n" + rule)