## Description

Must get 1 minute candles from Binance USDT perps for the symbols in filtered_tokens.csv.
Must take in a parameter which specifies how many symbols to get data from. If this parameter is None then get all of them.
Populate the 1 minute candles according to the Candle dataclass in models.py.
Must respect rate limits. The handling should be sophisticated: read the rate-limit data returned with each response and adjust the request pace accordingly.
Must accept a date range as input. Not all tokens were listed for the entire period, so for each symbol it must work out which part of the requested date range actually has data.
Output to data folder.

In [8]:
# Import required libraries
import pandas as pd
import numpy as np
import requests
import time
import asyncio
import aiohttp
from datetime import datetime, timedelta
from pathlib import Path
import sys
import os
from concurrent.futures import ThreadPoolExecutor
import threading

# Add the backtester module to the path
sys.path.append('/home/zac/back-testing')
from backtester.models import Candle

# Configuration
# Base URL of the Binance futures REST API; the `/fapi/v1/...` endpoint paths
# used by the request helpers below are appended to it.
BINANCE_BASE_URL = "https://fapi.binance.com"
# Directory that holds the input token list and receives one CSV per symbol.
DATA_DIR = Path("/home/zac/back-testing/data")
# CSV listing the symbols to download (read via its `binance_symbol` column).
TOKENS_FILE = DATA_DIR / "filtered_tokens.csv"

# Echo the resolved paths so a misconfigured location is visible immediately.
print("Libraries imported successfully!")
print(f"Data directory: {DATA_DIR}")
print(f"Tokens file: {TOKENS_FILE}")

Libraries imported successfully!
Data directory: /home/zac/back-testing/data
Tokens file: /home/zac/back-testing/data/filtered_tokens.csv


In [9]:
class BinanceRateLimiter:
    """Throttle Binance API calls made from several threads.

    The limiter combines three mechanisms:

    * a semaphore (``acquire``/``release``) capping the number of requests
      that are in flight at once,
    * a minimum spacing of ``min_delay`` seconds between request starts,
    * a back-off that pauses every caller once the used weight reported by
      the server exceeds 80% of ``weight_limit``, until the locally tracked
      60-second window has elapsed.

    All sleeping happens with ``self.lock`` released, so threads that are
    delivering response headers to ``check_rate_limits`` are never blocked
    behind a sleeping caller and cannot overwrite a freshly reset counter
    with stale values once the sleep ends.
    """

    def __init__(self, max_concurrent_requests=10):
        self.weight_limit = 2400  # Default weight limit per minute
        self.current_weight = 0  # Last used weight reported by the server
        self.window_start = time.time()  # Start of the local 60 s window
        self.min_delay = 0.05  # Minimum spacing between request starts (50ms)
        self.last_request_time = 0  # Start time reserved for the latest request
        self.max_concurrent = max_concurrent_requests
        self.semaphore = threading.Semaphore(max_concurrent_requests)
        self.lock = threading.Lock()
        # window_start value for which the back-off message was already
        # printed, so concurrent waiters do not repeat it.
        self._announced_window = None

    def check_rate_limits(self, response_headers):
        """Update the used-weight counter from a response's headers.

        Args:
            response_headers: Mapping of header names to values. A missing or
                non-numeric ``x-mbx-used-weight-1m`` entry leaves the current
                counter untouched.
        """
        try:
            reported = int(response_headers['x-mbx-used-weight-1m'])
        except (KeyError, TypeError, ValueError):
            reported = None

        with self.lock:
            if reported is not None:
                self.current_weight = reported

            # If we're approaching the limit, warn before back-off kicks in
            if self.current_weight > self.weight_limit * 0.7:  # 70% of limit
                print(f"Rate limit warning: {self.current_weight}/{self.weight_limit}")

    def wait_if_needed(self):
        """Block until the calling thread may start its next request."""
        while True:
            announce = False
            with self.lock:
                now = time.time()

                # Reset weight counter every minute
                if now - self.window_start >= 60:
                    self.current_weight = 0
                    self.window_start = now

                backing_off = self.current_weight > self.weight_limit * 0.8
                if backing_off:
                    # Always positive: the window was reset above otherwise.
                    pause = 60 - (now - self.window_start)
                    if self._announced_window != self.window_start:
                        self._announced_window = self.window_start
                        announce = True
                else:
                    # Reserve the next free start slot so that concurrent
                    # callers end up at least min_delay apart.
                    slot = max(now, self.last_request_time + self.min_delay)
                    self.last_request_time = slot
                    pause = slot - now

            # Sleep outside the lock so header updates can still be recorded.
            if backing_off:
                if announce:
                    print(f"Rate limit reached. Waiting {pause:.1f} seconds...")
                time.sleep(pause)
                continue  # Re-evaluate: the window should have rolled over

            if pause > 0:
                time.sleep(pause)
            return

    def acquire(self):
        """Acquire semaphore for concurrent request limiting"""
        self.semaphore.acquire()

    def release(self):
        """Release semaphore"""
        self.semaphore.release()

# Module-wide limiter shared by every request helper below. Its semaphore
# allows at most 4 requests to hold a slot at the same time, regardless of how
# many worker threads the executors start.
rate_limiter = BinanceRateLimiter(max_concurrent_requests=4)  # Conservative concurrent limit
print("Rate limiter initialized with concurrent support")

Rate limiter initialized with concurrent support


In [10]:
# exchangeInfo describes every symbol in a single response, so it is
# downloaded once and shared by all lookups instead of being fetched again for
# each symbol. The lock makes concurrent symbol workers wait for that one
# download rather than each starting their own.
_exchange_symbols_lock = threading.Lock()
_exchange_symbols_cache = {}


def _load_exchange_symbols():
    """Return the cached ``{symbol: exchangeInfo entry}`` map, fetching it on first use."""
    with _exchange_symbols_lock:
        if not _exchange_symbols_cache:
            rate_limiter.wait_if_needed()

            url = f"{BINANCE_BASE_URL}/fapi/v1/exchangeInfo"
            # Same timeout as the klines request so a stalled connection
            # cannot hang a worker forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            rate_limiter.check_rate_limits(response.headers)

            for entry in response.json()['symbols']:
                # setdefault keeps the first entry if a symbol is repeated.
                _exchange_symbols_cache.setdefault(entry['symbol'], entry)

        return _exchange_symbols_cache


def get_symbol_info(symbol):
    """Get symbol information to determine listing date and availability.

    Args:
        symbol: Exchange symbol name, e.g. ``"BTCUSDT"``.

    Returns:
        A dict with keys ``symbol``, ``status``, ``onboardDate`` (``None``
        when the exchange does not report one) and ``contractType``
        (defaults to ``'PERPETUAL'``), or ``None`` when the symbol is not
        listed or the exchange info request fails.
    """
    try:
        symbol_info = _load_exchange_symbols().get(symbol)
    except requests.RequestException as e:
        print(f"Error getting symbol info for {symbol}: {e}")
        return None

    if symbol_info is None:
        return None

    return {
        'symbol': symbol,
        'status': symbol_info['status'],
        'onboardDate': symbol_info.get('onboardDate', None),
        'contractType': symbol_info.get('contractType', 'PERPETUAL')
    }

def get_earliest_valid_timestamp(symbol, start_date):
    """Get the earliest valid timestamp for a symbol, considering listing date.

    Args:
        symbol: Exchange symbol name.
        start_date: Requested start, either a ``"YYYY-MM-DD"`` string or a
            ``datetime`` (naive values are interpreted in local time by
            ``datetime.timestamp()``; timezone-aware values are accepted too).

    Returns:
        Epoch milliseconds of the later of ``start_date`` and the symbol's
        ``onboardDate``, or ``None`` when the symbol is unknown or its status
        is not ``'TRADING'``.
    """
    symbol_info = get_symbol_info(symbol)

    if not symbol_info or symbol_info['status'] != 'TRADING':
        print(f"Symbol {symbol} is not available for trading")
        return None

    # Convert start_date to timestamp
    if isinstance(start_date, str):
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    else:
        start_dt = start_date
    start_ms = int(start_dt.timestamp() * 1000)

    # Compare in integer epoch milliseconds rather than as datetime objects:
    # this works for naive and timezone-aware inputs alike and avoids the
    # float round trip through a naive datetime.
    onboard_ms = symbol_info.get('onboardDate')
    if onboard_ms:
        return max(start_ms, int(onboard_ms))

    return start_ms

print("Symbol info functions defined")

Symbol info functions defined


In [11]:
def get_klines_batch(symbol, start_time, end_time, limit=1000, max_retries=3):
    """Get 1-minute klines for a specific time period with concurrent support.

    Transient failures (HTTP 429/418 throttling responses, connection errors
    and timeouts) are retried instead of silently producing a gap in the data.

    Args:
        symbol: Exchange symbol name.
        start_time: Window start in epoch milliseconds.
        end_time: Window end in epoch milliseconds.
        limit: Maximum number of klines requested.
        max_retries: How many times a transient failure is retried before
            giving up.

    Returns:
        The decoded JSON response (a list of kline rows), or ``[]`` when the
        request ultimately fails.
    """
    url = f"{BINANCE_BASE_URL}/fapi/v1/klines"
    params = {
        'symbol': symbol,
        'interval': '1m',
        'startTime': start_time,
        'endTime': end_time,
        'limit': limit
    }
    # Longest pause this function is willing to sit through. If the server
    # asks for more, give up instead of retrying earlier than requested.
    max_backoff = 120.0

    rate_limiter.acquire()

    try:
        for attempt in range(max_retries + 1):
            rate_limiter.wait_if_needed()

            try:
                response = requests.get(url, params=params, timeout=30)
                rate_limiter.check_rate_limits(response.headers)

                if response.status_code not in (418, 429):
                    response.raise_for_status()
                    return response.json()

                # Throttled: wait for as long as the server asks, if it says.
                error = f"HTTP {response.status_code} (rate limited)"
                try:
                    delay = float(response.headers.get('Retry-After'))
                except (TypeError, ValueError):
                    delay = None
            except (requests.ConnectionError, requests.Timeout) as e:
                error, delay = e, None
            except requests.RequestException as e:
                # Non-transient failure (e.g. HTTP 400): retrying cannot help.
                print(f"Error getting klines for {symbol}: {e}")
                return []

            if delay is None:
                delay = min(2.0 ** attempt, max_backoff)  # Exponential back-off

            if attempt >= max_retries or delay > max_backoff:
                print(f"Error getting klines for {symbol}: {error}")
                return []

            print(f"Retrying klines for {symbol} in {delay:.1f}s after: {error}")
            # The semaphore slot stays held while sleeping, which also lowers
            # overall concurrency while the API is pushing back.
            time.sleep(delay)

        return []
    finally:
        rate_limiter.release()

def convert_kline_to_candle(kline_data, symbol):
    """Build a Candle from one kline row.

    Element 0 of the row is read as the open time in epoch milliseconds and
    becomes a naive local ``datetime``; elements 1-5 are converted to
    ``np.float64`` and used as open, high, low, close and volume.
    """
    opened_at = datetime.fromtimestamp(int(kline_data[0]) / 1000)
    open_price, high_price, low_price, close_price, traded_volume = (
        np.float64(kline_data[position]) for position in range(1, 6)
    )
    return Candle(
        symbol=symbol,
        timestamp=opened_at,
        open=open_price,
        high=high_price,
        low=low_price,
        close=close_price,
        volume=traded_volume,
    )

def get_candle_batch_worker(args):
    """Fetch and convert one batch of klines; meant to be mapped over by an executor.

    Args:
        args: Tuple ``(symbol, start_timestamp, end_timestamp, batch_num)``.

    Returns:
        ``(batch_num, candles, last_open_time)``. On an empty response or any
        error the result is ``(batch_num, [], None)``.
    """
    symbol, start_timestamp, end_timestamp, batch_num = args

    try:
        rows = get_klines_batch(symbol, start_timestamp, end_timestamp)
        if not rows:
            return batch_num, [], None

        converted = [convert_kline_to_candle(row, symbol) for row in rows]
        # The open time of the final row marks how far this batch reached.
        return batch_num, converted, rows[-1][0]
    except Exception as e:
        print(f"Error in batch {batch_num} for {symbol}: {e}")
        return batch_num, [], None

def get_all_candles_for_symbol(symbol, start_date, end_date):
    """Collect every 1-minute candle of a symbol between two dates.

    The range is clipped to the symbol's earliest valid timestamp, split into
    windows of 1000 minutes, and the windows are downloaded by a thread pool.

    Args:
        symbol: Exchange symbol name.
        start_date: ``"YYYY-MM-DD"`` string or ``datetime``.
        end_date: ``"YYYY-MM-DD"`` string or ``datetime``.

    Returns:
        List of candles in batch order; empty when the symbol is unavailable.
    """
    print(f"Getting data for {symbol}...")

    # Clip the requested start to when the symbol actually has data.
    start_timestamp = get_earliest_valid_timestamp(symbol, start_date)
    if not start_timestamp:
        print(f"Skipping {symbol}: not available")
        return []

    end_dt = datetime.strptime(end_date, "%Y-%m-%d") if isinstance(end_date, str) else end_date
    end_timestamp = int(end_dt.timestamp() * 1000)

    print(f"  Date range: {datetime.fromtimestamp(start_timestamp/1000)} to {end_dt}")

    # One window spans 1000 one-minute candles, expressed in milliseconds.
    window_ms = 1000 * 60 * 1000
    batch_tasks = [
        (symbol, window_start, min(window_start + window_ms, end_timestamp), index)
        for index, window_start in enumerate(range(start_timestamp, end_timestamp, window_ms))
    ]

    print(f"  Processing {len(batch_tasks)} batches concurrently...")

    with ThreadPoolExecutor(max_workers=6) as executor:  # Conservative worker count
        batch_results = list(executor.map(get_candle_batch_worker, batch_tasks))

    # Stitch the batches back together in chronological (batch number) order.
    all_candles = []
    for batch_num, candles, _last_seen in sorted(batch_results, key=lambda result: result[0]):
        if not candles:
            continue
        all_candles.extend(candles)
        print(f"  Batch {batch_num}: {len(candles)} candles")

    print(f"  Completed {symbol}: {len(all_candles)} total candles")
    return all_candles

print("Concurrent candle collection functions defined")

Concurrent candle collection functions defined


In [None]:
def save_candles_to_csv(candles, symbol, data_dir):
    """Write candles to ``<data_dir>/<symbol>.csv``, ordered by timestamp.

    Args:
        candles: Objects exposing ``timestamp``, ``open``, ``high``, ``low``,
            ``close`` and ``volume`` attributes.
        symbol: Used as the file name (without extension).
        data_dir: Destination directory as a ``Path``.

    Nothing is written when ``candles`` is empty.
    """
    if not candles:
        print(f"No candles to save for {symbol}")
        return

    columns = ('timestamp', 'open', 'high', 'low', 'close', 'volume')
    rows = [{column: getattr(candle, column) for column in columns} for candle in candles]

    # Chronological order regardless of the order the candles arrived in.
    frame = pd.DataFrame(rows).sort_values('timestamp')

    output_file = data_dir / f"{symbol}.csv"
    frame.to_csv(output_file, index=False)

    print(f"Saved {len(frame)} candles for {symbol} to {output_file}")

def load_filtered_tokens(tokens_file, max_symbols=None):
    """Read the symbol list from the ``binance_symbol`` column of a CSV file.

    Args:
        tokens_file: Path of the CSV file.
        max_symbols: Keep only this many leading symbols; ``None`` keeps all.

    Returns:
        List of symbols in file order.
    """
    symbols = pd.read_csv(tokens_file)['binance_symbol'].tolist()

    if max_symbols is None:
        print(f"Loaded {len(symbols)} symbols from {tokens_file}")
        return symbols

    selected = symbols[:max_symbols]
    print(f"Limited to first {max_symbols} symbols")
    print(f"Loaded {len(selected)} symbols from {tokens_file}")
    return selected

print("Utility functions defined")

Utility functions defined


In [13]:
def process_symbol_worker(args):
    """Download and store the candles of one symbol; meant to be mapped over by an executor.

    Args:
        args: Tuple ``(symbol, start_date, end_date, symbol_index, total_symbols)``;
            the last two values are only used for the progress line.

    Returns:
        ``(symbol, success, candle_count)``. ``success`` is ``False`` and the
        count ``0`` when nothing was collected or an error occurred.
    """
    symbol, start_date, end_date, symbol_index, total_symbols = args

    print(f"\n[{symbol_index}/{total_symbols}] Processing {symbol}")

    try:
        candles = get_all_candles_for_symbol(symbol, start_date, end_date)

        if not candles:
            print(f"  No candles collected for {symbol}")
            return symbol, False, 0

        save_candles_to_csv(candles, symbol, DATA_DIR)
        return symbol, True, len(candles)

    except Exception as e:
        print(f"  Error processing {symbol}: {e}")
        return symbol, False, 0

def collect_candle_data(start_date="2024-01-01", end_date="2024-12-31", max_symbols=None, max_concurrent_symbols=3):
    """
    Download candles for the symbols in the token file and report a summary.

    Symbols are handled in groups of ``max_concurrent_symbols``; each group is
    processed by its own thread pool and groups are separated by a 2 second
    pause.

    Args:
        start_date: Start date in YYYY-MM-DD format or datetime object
        end_date: End date in YYYY-MM-DD format or datetime object
        max_symbols: Maximum number of symbols to process (None for all)
        max_concurrent_symbols: Maximum number of symbols to process concurrently
    """

    print("=== Binance USDT Perpetual Futures Candle Collection (Concurrent) ===")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Max concurrent symbols: {max_concurrent_symbols}")

    symbols = load_filtered_tokens(TOKENS_FILE, max_symbols)

    # The per-symbol CSV files are written here.
    DATA_DIR.mkdir(exist_ok=True)

    symbol_count = len(symbols)
    tasks = [
        (symbol, start_date, end_date, position, symbol_count)
        for position, symbol in enumerate(symbols, 1)
    ]

    # Start offsets of the groups; its length is the number of groups.
    offsets = range(0, len(tasks), max_concurrent_symbols)
    group_total = len(offsets)

    succeeded = 0
    failed = 0
    candle_total = 0

    for group_no, offset in enumerate(offsets, 1):
        print(f"\nProcessing batch {group_no}/{group_total}")

        group = tasks[offset:offset + max_concurrent_symbols]
        with ThreadPoolExecutor(max_workers=max_concurrent_symbols) as executor:
            outcomes = list(executor.map(process_symbol_worker, group))

        for _symbol, ok, candle_count in outcomes:
            if not ok:
                failed += 1
                continue
            succeeded += 1
            candle_total += candle_count

        # Small delay between groups to be respectful; none after the last one.
        if group_no < group_total:
            time.sleep(2)

    print("\n=== Collection Summary ===")
    print(f"Successfully collected: {succeeded}")
    print(f"Failed: {failed}")
    print(f"Total symbols processed: {len(symbols)}")
    print(f"Total candles collected: {candle_total}")
    print(f"Data saved to: {DATA_DIR}")

print("Concurrent main collection function defined")

Concurrent main collection function defined


## Usage Examples

Below are several examples of how to use the candle collection system:

In [14]:
# Example 1: Test with 3 symbols concurrently (last 52 weeks)
# NOTE: running this cell starts the download immediately; it issues live
# requests to the Binance API and writes CSV files into DATA_DIR.
from datetime import datetime, timedelta
# The range ends "now" and reaches back 52 weeks; both bounds are naive
# local-time datetimes.
end_date = datetime.now()
start_date = end_date - timedelta(weeks=52)

# Only the first 3 symbols of the token file are used, processed 2 at a time.
collect_candle_data(start_date=start_date, end_date=end_date, max_symbols=3, max_concurrent_symbols=2)

# Printed only after the collection above has finished.
print("Example 1 ready (test with 3 symbols, last 52 weeks, 2 concurrent)")

=== Binance USDT Perpetual Futures Candle Collection (Concurrent) ===
Date range: 2024-10-18 09:48:06.705662 to 2025-10-17 09:48:06.705662
Max concurrent symbols: 2
Limited to first 3 symbols
Loaded 3 symbols from /home/zac/back-testing/data/filtered_tokens.csv

Processing batch 1/2

[1/3] Processing RLCUSDT
Getting data for RLCUSDT...

[2/3] Processing BANKUSDT
Getting data for BANKUSDT...
  Date range: 2024-10-18 09:48:06.705000 to 2025-10-17 09:48:06.705662
  Processing 525 batches concurrently...
  Date range: 2025-04-18 20:30:00 to 2025-10-17 09:48:06.705662
  Processing 262 batches concurrently...
  Date range: 2024-10-18 09:48:06.705000 to 2025-10-17 09:48:06.705662
  Processing 525 batches concurrently...
  Date range: 2025-04-18 20:30:00 to 2025-10-17 09:48:06.705662
  Processing 262 batches concurrently...
Rate limit reached. Waiting 23.5 seconds...
Rate limit reached. Waiting 23.5 seconds...
Rate limit reached. Waiting 60.0 seconds...
Rate limit reached. Waiting 60.0 seconds