# Dataset Generator for Size Picker

Generate large datasets of synthetic trades, balanced across the small / mid / big optimal-size categories.

In [None]:
# Setup + Mount Google Drive
import os
import random
import numpy as np
import pandas as pd
import pickle
from datetime import datetime

# Mount Google Drive at /content/drive (the google.colab import is only available inside Colab)
from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted Drive that every dataset/progress pickle in this notebook is read from and written to
SAVE_DIR = '/content/drive/MyDrive/size_picker_data'
os.makedirs(SAVE_DIR, exist_ok=True)  # exist_ok=True: no error when the folder is already there
print(f"Saving to: {SAVE_DIR}")

In [None]:
# Configuration
LOOKBACK = 24  # number of candles kept in front of each trade's entry bar
SIZES = [round(step * 0.15, 2) for step in range(1, 31)]  # 30 candidate sizes: 0.15 to 4.50

# Size categories: three bands split at 1.5 and 3.0, upper bound inclusive
SMALL_SIZES = list(filter(lambda s: s <= 1.5, SIZES))        # 0.15 to 1.50
MID_SIZES = list(filter(lambda s: 1.5 < s <= 3.0, SIZES))    # 1.65 to 3.00
BIG_SIZES = list(filter(lambda s: s > 3.0, SIZES))           # 3.15 to 4.50

# One summary line per band: how many sizes it holds and its first/last value
for _label, _bucket in (("Small", SMALL_SIZES), ("Mid", MID_SIZES), ("Big", BIG_SIZES)):
    print(f"{_label} sizes ({len(_bucket)}): {_bucket[0]} to {_bucket[-1]}")

In [None]:
# Broker simulation
class BrokerSim:
    """Replay a single trade bar by bar against a fixed starting balance.

    Three effects are modelled: a proportional spread applied against the
    trader at entry and exit, a per-unit fee for every calendar day the
    position rolls into, and forced liquidation once intrabar equity falls
    to ``equity * liquidation_pct`` or below.
    """

    def __init__(self, equity=1000, leverage=20, liquidation_pct=0.30,
                 spread_pct=0.0006, overnight_cost_per_oz=1.0, min_size=0.15):
        """Store the account and cost parameters.

        Args:
            equity: Starting balance; every ``run_trade`` call starts from it.
            leverage: Stored on the instance; ``run_trade`` does not read it.
            liquidation_pct: Fraction of ``equity`` at or below which the
                position is force-closed.
            spread_pct: Fraction of the close price applied against the
                trader at both entry and exit.
            overnight_cost_per_oz: Fee per unit of size for each calendar-day
                rollover (name suggests size is in ounces - TODO confirm).
            min_size: Smallest allowed size; smaller requests are raised to it.
        """
        self.initial_equity = equity
        self.leverage = leverage
        self.liquidation_level = equity * liquidation_pct
        self.spread_pct = spread_pct
        self.overnight_cost = overnight_cost_per_oz
        self.min_size = min_size

    def run_trade(self, candles, entry_idx, exit_idx, direction, size):
        """Simulate holding ``size`` from the close of ``entry_idx`` to the close of ``exit_idx``.

        Args:
            candles: DataFrame with ``high``, ``low``, ``close`` and
                ``datetime`` columns, addressed positionally.
            entry_idx: Row position of the entry bar (filled at its close).
            exit_idx: Row position of the exit bar (closed at its close).
            direction: ``'Buy'`` for a long; any other value is treated as a short.
            size: Requested position size; clamped up to ``min_size``.

        Returns:
            dict with ``liquidated`` (bool), ``return_pct`` (P&L as a percent
            of starting equity), ``max_drawdown_pct`` (largest percent drop
            from the running equity peak, measured at bar closes and at the
            liquidation level if the trade is liquidated) and ``size`` (the
            size actually simulated, after clamping).
        """
        if size < self.min_size:
            size = self.min_size

        is_buy = direction == 'Buy'

        # Entry fills at the close, moved against the trader by the spread.
        entry_price = candles.iloc[entry_idx]['close']
        if is_buy:
            entry_price *= (1 + self.spread_pct)
        else:
            entry_price *= (1 - self.spread_pct)

        equity = self.initial_equity
        liquidated = False
        max_drawdown_pct = 0
        total_overnight = 0
        last_date = pd.to_datetime(candles.iloc[entry_idx]['datetime']).date()
        max_equity = equity

        for i in range(entry_idx + 1, exit_idx + 1):
            bar = candles.iloc[i]
            current_date = pd.to_datetime(bar['datetime']).date()

            # Charge one overnight fee per calendar day crossed since the last bar.
            if current_date > last_date:
                days = (current_date - last_date).days
                total_overnight += days * self.overnight_cost * size
                last_date = current_date

            # Worst intrabar excursion is the low for longs and the high for shorts.
            if is_buy:
                unrealized_worst = (bar['low'] - entry_price) * size
                unrealized = (bar['close'] - entry_price) * size
            else:
                unrealized_worst = (entry_price - bar['high']) * size
                unrealized = (entry_price - bar['close']) * size

            equity_at_worst = self.initial_equity + unrealized_worst - total_overnight
            if equity_at_worst <= self.liquidation_level:
                liquidated = True
                equity = self.liquidation_level
                # The fall to the liquidation level is part of the trade's
                # drawdown; without this a first-bar liquidation reports 0%.
                if max_equity > 0:
                    dd = (max_equity - equity) / max_equity * 100
                    max_drawdown_pct = max(max_drawdown_pct, dd)
                break

            # Drawdown is tracked on close-to-close equity against the running peak.
            equity = self.initial_equity + unrealized - total_overnight
            max_equity = max(max_equity, equity)
            dd = (max_equity - equity) / max_equity * 100 if max_equity > 0 else 0
            max_drawdown_pct = max(max_drawdown_pct, dd)

        if not liquidated:
            # Exit fills at the close, again moved against the trader by the spread.
            exit_price = candles.iloc[exit_idx]['close']
            if is_buy:
                exit_price *= (1 - self.spread_pct)
                final_pnl = (exit_price - entry_price) * size
            else:
                exit_price *= (1 + self.spread_pct)
                final_pnl = (entry_price - exit_price) * size
            equity = self.initial_equity + final_pnl - total_overnight

        pnl = equity - self.initial_equity
        return_pct = (pnl / self.initial_equity) * 100

        return {
            'liquidated': liquidated,
            'return_pct': return_pct,
            'max_drawdown_pct': max_drawdown_pct,
            'size': size
        }

# Shared simulator instance with the default account settings
broker = BrokerSim()
print("Broker ready")

In [None]:
# Candle generation
def generate_candles(n_candles, base_price=4400, volatility=0.02):
    """Build a random-walk OHLC series as a DataFrame.

    Each bar opens at the previous bar's close (the first at ``base_price``).
    The open-to-close move is a Gaussian draw scaled by ``price * volatility``;
    the wicks above and below the body are half-scaled absolute Gaussian draws.

    Args:
        n_candles: Number of bars to produce.
        base_price: Open of the first bar.
        volatility: Per-bar standard deviation of the body move, as a
            fraction of the bar's open.

    Returns:
        DataFrame with ``open``, ``high``, ``low``, ``close`` columns and one
        row per bar.
    """
    rows = []
    bar_open = base_price

    for _ in range(n_candles):
        # Three draws per bar, always in this order: body, upper wick, lower wick.
        body_move = bar_open * volatility * random.gauss(0, 1)
        bar_close = bar_open + body_move
        upper_wick = abs(random.gauss(0, 1)) * bar_open * volatility * 0.5
        lower_wick = abs(random.gauss(0, 1)) * bar_open * volatility * 0.5

        body_top = max(bar_open, bar_close)
        body_bottom = min(bar_open, bar_close)
        rows.append(dict(
            open=bar_open,
            high=body_top + upper_wick,
            low=body_bottom - lower_wick,
            close=bar_close,
        ))

        # The next bar starts where this one finished.
        bar_open = bar_close

    return pd.DataFrame(rows)

def generate_trade(lookback=24, trade_length_range=(10, 50), volatility_range=(0.005, 0.03)):
    """Create one random synthetic trade: candles, entry/exit positions and side.

    Args:
        lookback: Number of candles placed before the entry bar.
        trade_length_range: Inclusive (min, max) number of candles after the
            lookback window.
        volatility_range: (min, max) bounds for the uniformly drawn per-bar
            volatility.

    Returns:
        dict with ``candles_df`` (OHLC plus an hourly ``datetime`` column
        starting 2025-01-01), ``entry_idx`` (equal to ``lookback``),
        ``exit_idx`` (the last row), ``direction`` (``'Buy'`` or ``'Sell'``)
        and the drawn ``volatility``.
    """
    # Draw order is volatility, length, candles, direction.
    vol = random.uniform(*volatility_range)
    n_bars = lookback + random.randint(*trade_length_range)

    frame = generate_candles(n_bars, volatility=vol)
    frame['datetime'] = pd.date_range('2025-01-01', periods=n_bars, freq='h')

    side = random.choice(['Buy', 'Sell'])

    return dict(
        candles_df=frame,
        entry_idx=lookback,
        exit_idx=n_bars - 1,
        direction=side,
        volatility=vol,
    )

print("Candle generator ready")

In [None]:
# Optimal size finder
def get_optimal_size(candles_df, entry_idx, exit_idx, direction, sizes, broker):
    """Simulate the trade at every candidate size and pick the best-scoring one.

    A liquidated run scores -10; any other run scores half its ``return_pct``.
    When several sizes tie, the earliest one in ``sizes`` wins.

    Args:
        candles_df, entry_idx, exit_idx, direction: Passed straight through to
            ``broker.run_trade``.
        sizes: Candidate position sizes to try.
        broker: Object exposing ``run_trade(candles, entry_idx, exit_idx,
            direction, size)`` that returns a dict with at least
            ``liquidated`` and ``return_pct``.

    Returns:
        dict with ``optimal_size``, ``optimal_score``, ``category``
        (``'small'`` up to 1.5, ``'mid'`` up to 3.0, otherwise ``'big'``) and
        ``all_results`` (one DataFrame row per size, including ``score``).
    """
    rows = []
    for candidate in sizes:
        outcome = broker.run_trade(candles_df, entry_idx, exit_idx, direction, candidate)
        # Record the requested size, not whatever the broker may have clamped it to.
        outcome['size'] = candidate
        rows.append(outcome)

    table = pd.DataFrame(rows)

    def _score(row):
        if row['liquidated']:
            return -10
        return row['return_pct'] * 0.5

    table['score'] = table.apply(_score, axis=1)

    winner = table['score'].idxmax()
    best_size = table.loc[winner, 'size']
    best_score = table.loc[winner, 'score']

    # Categorize
    category = 'big'
    if best_size <= 1.5:
        category = 'small'
    elif best_size <= 3.0:
        category = 'mid'

    return {
        'optimal_size': best_size,
        'optimal_score': best_score,
        'category': category,
        'all_results': table
    }

print("Optimal size finder ready")

In [None]:
# Dataset generator V2 - stores all_results for two-head model training

def _load_progress(path, label):
    """Return the trade list pickled at ``path``, or ``[]`` when the file does not exist."""
    if not os.path.exists(path):
        return []
    # NOTE: pickle.load can execute arbitrary code; only point this at
    # progress files written by this notebook.
    with open(path, 'rb') as f:
        trades = pickle.load(f)
    print(f"Loaded {len(trades)} existing {label} trades")
    return trades


def generate_all_datasets_v2(target_per_category=100000):
    """
    Generate trades and sort into small/mid/big as we go.
    V2: Stores simulation results for ALL 30 sizes per trade (needed for two-head model).
    Saves progress every 100 kept trades.
    Resumes from existing progress files if they exist.

    Trades whose category has already reached the target are discarded, but
    they still consume a trade_id, so IDs in the saved data can have gaps.

    Args:
        target_per_category: Number of trades to collect for each of the
            three categories before stopping.

    Returns:
        Tuple ``(small_trades, mid_trades, big_trades)`` of lists of trade dicts.
    """
    # Load existing progress if available (use v2 suffix to avoid conflicts)
    buckets = {
        'small': _load_progress(f'{SAVE_DIR}/small_v2_progress.pkl', 'small'),
        'mid': _load_progress(f'{SAVE_DIR}/mid_v2_progress.pkl', 'mid'),
        'big': _load_progress(f'{SAVE_DIR}/big_v2_progress.pkl', 'big'),
    }
    small_trades = buckets['small']
    mid_trades = buckets['mid']
    big_trades = buckets['big']

    total_saved = len(small_trades) + len(mid_trades) + len(big_trades)
    last_save = total_saved

    # Resume the ID counter from the highest stored trade_id rather than from
    # the number of kept trades: earlier runs also spent IDs on discarded
    # trades, so restarting at the kept count would hand out duplicate IDs.
    highest_id = max(
        (t.get('trade_id', 0) for bucket in buckets.values() for t in bucket),
        default=0,
    )
    total_generated = max(total_saved, highest_id)

    print(f"Target: {target_per_category} trades per category")
    print(f"Starting from: small={len(small_trades)}, mid={len(mid_trades)}, big={len(big_trades)}")
    print("Generating (V2 with all_results)...")

    while any(len(bucket) < target_per_category for bucket in buckets.values()):

        # Pass LOOKBACK explicitly so the slice below always starts at row 0,
        # even if LOOKBACK is changed away from generate_trade's default.
        trade_data = generate_trade(lookback=LOOKBACK)
        optimal_info = get_optimal_size(
            trade_data['candles_df'],
            trade_data['entry_idx'],
            trade_data['exit_idx'],
            trade_data['direction'],
            SIZES,
            broker
        )

        total_generated += 1

        # Only keep the trade if its category still needs more samples.
        bucket = buckets[optimal_info['category']]
        if len(bucket) >= target_per_category:
            continue

        candle_start = trade_data['entry_idx'] - LOOKBACK
        trade_candles = trade_data['candles_df'].iloc[candle_start:trade_data['exit_idx']+1][['open', 'high', 'low', 'close']].values

        # Convert all_results DataFrame to list of dicts for storage
        all_results_list = optimal_info['all_results'][['size', 'liquidated', 'return_pct', 'max_drawdown_pct']].to_dict('records')

        bucket.append({
            'trade_id': total_generated,
            'direction': trade_data['direction'],
            'candles': trade_candles,
            'optimal_size': optimal_info['optimal_size'],
            'optimal_score': optimal_info['optimal_score'],
            'category': optimal_info['category'],
            'volatility': trade_data['volatility'],
            # V2: Store results for all 30 sizes
            'all_results': all_results_list
        })

        total_saved = len(small_trades) + len(mid_trades) + len(big_trades)

        # Save every 100 trades added
        if total_saved - last_save >= 100:
            last_save = total_saved
            print(f"Total: {total_generated} | small={len(small_trades)}, mid={len(mid_trades)}, big={len(big_trades)}")

            # Save progress
            save_dataset(small_trades, 'small_v2_progress')
            save_dataset(mid_trades, 'mid_v2_progress')
            save_dataset(big_trades, 'big_v2_progress')

    # Final save
    save_dataset(small_trades, f'small_v2_{target_per_category//1000}k')
    save_dataset(mid_trades, f'mid_v2_{target_per_category//1000}k')
    save_dataset(big_trades, f'big_v2_{target_per_category//1000}k')

    print(f"\nDone! Total generated: {total_generated}")
    print(f"Small: {len(small_trades)}, Mid: {len(mid_trades)}, Big: {len(big_trades)}")

    return small_trades, mid_trades, big_trades

print("Generator ready - use generate_all_datasets_v2() for two-head model data")

In [None]:
# Save/Load functions
def save_dataset(trades, filename):
    """Pickle ``trades`` to ``{SAVE_DIR}/{filename}.pkl``.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the target with ``os.replace``, so an interrupted save (for example a
    dropped Colab session mid-checkpoint) leaves the previous file intact
    instead of a truncated pickle that cannot be loaded on resume.

    Args:
        trades: Object to pickle (a list of trade dicts everywhere in this notebook).
        filename: Dataset name without directory or ``.pkl`` extension.
    """
    filepath = f'{SAVE_DIR}/{filename}.pkl'
    # The temp name does not end in .pkl, so list_datasets() ignores a
    # leftover; the next save of the same dataset simply overwrites it.
    tmp_path = f'{filepath}.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(trades, f)
    os.replace(tmp_path, filepath)
    print(f"Saved {len(trades)} trades to {filepath}")
    
def load_dataset(filename):
    """Unpickle and return the dataset stored at ``{SAVE_DIR}/{filename}.pkl``.

    Args:
        filename: Dataset name without directory or ``.pkl`` extension.

    Returns:
        The unpickled object (a list of trades for files written by
        ``save_dataset``).
    """
    source = '{}/{}.pkl'.format(SAVE_DIR, filename)
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    print(f"Loaded {len(loaded)} trades from {source}")
    return loaded

def list_datasets():
    """Print every ``.pkl`` file in ``SAVE_DIR`` with its trade count.

    Each file is fully unpickled just to count its entries.

    Returns:
        List of the ``.pkl`` file names found, in ``os.listdir`` order.
    """
    files = [name for name in os.listdir(SAVE_DIR) if name.endswith('.pkl')]
    print("Available datasets:")
    for name in files:
        with open(f'{SAVE_DIR}/{name}', 'rb') as handle:
            n_trades = len(pickle.load(handle))
        print(f"  {name}: {n_trades} trades")
    return files

print("Save/Load ready")

---
## Option 1: Convert Existing Datasets to V2 (Faster)

If you already have V1 datasets (balanced_100k, etc.), convert them to V2 by re-running the broker simulation on each stored trade for all 30 sizes. This is much faster than regenerating the trades from scratch.

In [None]:
# Convert V1 dataset to V2 (add all_results)
def convert_to_v2(trades, broker):
    """
    Return V2 copies of V1 trades, each with an added 'all_results' entry.

    For every trade the stored candles are replayed through the broker once
    per size in SIZES. The stored array is taken to begin LOOKBACK bars
    before the entry and to end on the exit bar, so:
    - entry_idx = LOOKBACK (24)
    - exit_idx = len(candles) - 1

    Args:
        trades: Sequence of V1 trade dicts with 'candles' (rows of
            open/high/low/close) and 'direction'.
        broker: Simulator exposing run_trade(candles, entry_idx, exit_idx,
            direction, size).

    Returns:
        New list of shallow-copied trade dicts; the input dicts are not modified.
    """
    ohlc_columns = ['open', 'high', 'low', 'close']
    stored_columns = ['size', 'liquidated', 'return_pct', 'max_drawdown_pct']
    converted = []

    for count, old_trade in enumerate(trades, start=1):
        bars = old_trade['candles']
        side = old_trade['direction']
        n_bars = len(bars)

        # The broker reads a datetime column for its overnight-fee calculation,
        # so rebuild an hourly timeline alongside the OHLC values.
        frame = pd.DataFrame(bars, columns=ohlc_columns)
        frame['datetime'] = pd.date_range('2025-01-01', periods=n_bars, freq='h')

        # One simulation per candidate size, tagged with the requested size.
        per_size = []
        for size in SIZES:
            outcome = broker.run_trade(frame, LOOKBACK, n_bars - 1, side, size)
            outcome['size'] = size
            per_size.append(outcome)

        # Copy the V1 fields and attach the per-size records.
        new_trade = old_trade.copy()
        new_trade['all_results'] = pd.DataFrame(per_size)[stored_columns].to_dict('records')
        converted.append(new_trade)

        if count % 10000 == 0:
            print(f"  Converted {count}/{len(trades)} trades")

    return converted

def convert_dataset_to_v2(input_name, output_name):
    """Load a V1 dataset, convert to V2, and save.

    Args:
        input_name: Name of the dataset to load (no directory or extension).
        output_name: Name to save the converted dataset under.

    Returns:
        The V2 trades. The loaded trades are returned as-is, and nothing is
        saved, when the dataset is empty or every trade already carries
        'all_results'.
    """
    print(f"Loading {input_name}...")
    trades = load_dataset(input_name)

    # An empty dataset has no first trade to inspect and nothing to convert.
    if len(trades) == 0:
        print("  Dataset is empty; nothing to convert.")
        return trades

    # Check every trade, not only the first, so a partially converted
    # dataset is not mistaken for a finished one.
    if all('all_results' in trade for trade in trades):
        print(f"  Already V2 format! Skipping.")
        return trades

    print(f"Converting {len(trades)} trades to V2...")
    v2_trades = convert_to_v2(trades, broker)

    print(f"Saving as {output_name}...")
    save_dataset(v2_trades, output_name)

    return v2_trades

print("Conversion functions ready")

In [None]:
# Convert existing balanced_100k to V2
# Loads 'balanced_100k', runs the broker simulation for all 30 sizes on each trade,
# and saves the result as 'balanced_v2_100k'

balanced_v2 = convert_dataset_to_v2('balanced_100k', 'balanced_v2_100k')

In [None]:
# Optional: Convert reinforcement_200k to V2
# reinforcement_v2 = convert_dataset_to_v2('reinforcement_200k', 'reinforcement_v2_200k')

---
## Option 2: Generate Fresh V2 Datasets

Generate completely new datasets with all_results included from the start.

In [None]:
# Generate 100k of each category (V2 with all_results)
# This runs until all three have 100k, saving progress every 100 kept trades;
# re-running the cell resumes from the saved *_v2_progress files
small_trades, mid_trades, big_trades = generate_all_datasets_v2(100000)

In [None]:
# Create balanced V2 dataset (equal parts small, mid, big)
import random

# Leading slice of each category: 33,333 + 33,333 + 33,334 = 100,000 trades
balanced = [*small_trades[:33333], *mid_trades[:33333], *big_trades[:33334]]
random.shuffle(balanced)  # in-place, so the categories end up interleaved
save_dataset(balanced, 'balanced_v2_100k')
print(f"Created balanced V2 dataset: {len(balanced)} trades")
print(f"  Sample trade has 'all_results': {'all_results' in balanced[0]}")

In [None]:
# Create reinforcement V2 dataset (remaining trades after balanced)
# Everything past the slices used for the balanced set, from all three categories
reinforcement = [*small_trades[33333:], *mid_trades[33333:], *big_trades[33334:]]
random.shuffle(reinforcement)  # in-place
save_dataset(reinforcement, 'reinforcement_v2_200k')
print(f"Created reinforcement V2 dataset: {len(reinforcement)} trades")

In [None]:
# List all datasets: prints each .pkl file in SAVE_DIR with its trade count
list_datasets()

In [None]:
# List all generated datasets (same call as the previous cell; re-run to refresh the listing)
list_datasets()

---
## Combine Datasets

Combine datasets into a training curriculum.

In [None]:
# Example: Create balanced training set
# Load and combine
# small = load_dataset('small_100k')
# mid = load_dataset('mid_100k')
# big = load_dataset('big_100k')
# 
# balanced = small[:33333] + mid[:33333] + big[:33334]
# random.shuffle(balanced)
# save_dataset(balanced, 'balanced_100k')