Test notebook for basic functionality

In [1]:
# Imports
import numpy as np
import pandas as pd
import requests
import os
from dataclasses import dataclass
from datetime import datetime, timedelta
import time

In [2]:
# POCOs


@dataclass(slots=True)
class Candle:
    symbol: str
    timestamp: datetime # starting time of the candle UTC
    open: np.float64
    high: np.float64
    low: np.float64
    close: np.float64 | None = None
    volume_base: np.float64 | None = None
    volume_quote: np.float64 | None = None



In [3]:
# Config
symbols = ["BTCUSDT"]
candle_size = "1m"
data_path = "../data"
start_date = "2023-01-01"
end_date = "2023-06-01"


In [4]:
def get_binance_klines(symbol, interval, start_time, end_time, limit=1500):
    """
    Fetch kline data from Binance futures API
    """
    base_url = "https://fapi.binance.com/fapi/v1/klines"
    
    params = {
        'symbol': symbol,
        'interval': interval,
        'startTime': int(start_time.timestamp() * 1000),
        'endTime': int(end_time.timestamp() * 1000),
        'limit': limit
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    return response.json()

def klines_to_dataframe(klines_data):
    """
    Convert Binance klines data to DataFrame
    """
    df = pd.DataFrame(klines_data, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ])
    
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    
    numeric_columns = ['open', 'high', 'low', 'close', 'volume', 'quote_asset_volume']
    df[numeric_columns] = df[numeric_columns].astype(float)
    
    df = df[['timestamp', 'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume']]
    
    return df

def check_existing_data(file_path):
    """
    Check what data already exists and return the latest timestamp
    """
    if not os.path.exists(file_path):
        return None, None
    
    try:
        df = pd.read_csv(file_path)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        latest_timestamp = df['timestamp'].max()
        return df, latest_timestamp
    except Exception as e:
        print(f"Error reading existing data: {e}")
        return None, None

In [5]:
os.makedirs(data_path, exist_ok=True)

start_dt = datetime.strptime(start_date, "%Y-%m-%d")
end_dt = datetime.strptime(end_date, "%Y-%m-%d")

for symbol in symbols:
    print(f"\nProcessing {symbol}...")
    file_path = f"{data_path}/{symbol}.csv"
    
    existing_df, latest_timestamp = check_existing_data(file_path)
    
    if existing_df is not None:
        print(f"Found existing data with {len(existing_df)} rows")
        print(f"Latest timestamp: {latest_timestamp}")
        
        fetch_start = latest_timestamp + timedelta(minutes=1)
        if fetch_start >= end_dt:
            print(f"Data already up to date for {symbol}")
            continue
    else:
        print(f"No existing data found for {symbol}")
        fetch_start = start_dt
    
    print(f"Fetching data from {fetch_start} to {end_dt}")
    
    all_data = []
    current_start = fetch_start
    
    while current_start < end_dt:
        current_end = min(current_start + timedelta(hours=24) - timedelta(minutes=1), end_dt)
        
        try:
            print(f"Fetching {current_start} to {current_end}")
            klines = get_binance_klines(symbol, candle_size, current_start, current_end)
            
            if klines:
                df_chunk = klines_to_dataframe(klines)
                all_data.append(df_chunk)
                print(f"  Fetched {len(df_chunk)} candles")
                
                if len(df_chunk) > 0:
                    last_timestamp = df_chunk['timestamp'].max()
                    current_start = last_timestamp + timedelta(minutes=1)
                else:
                    current_start = current_end + timedelta(minutes=1)
            else:
                current_start = current_end + timedelta(minutes=1)
            
            time.sleep(0.1)
            
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            break
    
    if all_data:
        new_df = pd.concat(all_data, ignore_index=True)
        new_df = new_df.drop_duplicates(subset=['timestamp']).sort_values('timestamp')
        
        if existing_df is not None:
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
            combined_df = combined_df.drop_duplicates(subset=['timestamp']).sort_values('timestamp')
        else:
            combined_df = new_df
        
        combined_df.to_csv(file_path, index=False)
        print(f"Saved {len(combined_df)} total candles to {file_path}")
        print(f"Date range: {combined_df['timestamp'].min()} to {combined_df['timestamp'].max()}")
        
        duplicates = combined_df.duplicated(subset=['timestamp']).sum()
        print(f"Duplicate timestamps after deduplication: {duplicates}")
    else:
        print(f"No new data fetched for {symbol}")

print("\nData fetching completed!")


Processing BTCUSDT...
No existing data found for BTCUSDT
Fetching data from 2023-01-01 00:00:00 to 2023-06-01 00:00:00
Fetching 2023-01-01 00:00:00 to 2023-01-01 23:59:00
  Fetched 1440 candles
Fetching 2023-01-01 22:00:00 to 2023-01-02 21:59:00
  Fetched 1440 candles
Fetching 2023-01-02 22:00:00 to 2023-01-03 21:59:00
  Fetched 1440 candles
Fetching 2023-01-03 22:00:00 to 2023-01-04 21:59:00
  Fetched 1440 candles
Fetching 2023-01-04 22:00:00 to 2023-01-05 21:59:00
  Fetched 1440 candles
Fetching 2023-01-05 22:00:00 to 2023-01-06 21:59:00
  Fetched 1440 candles
Fetching 2023-01-06 22:00:00 to 2023-01-07 21:59:00
  Fetched 1440 candles
Fetching 2023-01-07 22:00:00 to 2023-01-08 21:59:00
  Fetched 1440 candles
Fetching 2023-01-08 22:00:00 to 2023-01-09 21:59:00
  Fetched 1440 candles
Fetching 2023-01-09 22:00:00 to 2023-01-10 21:59:00
  Fetched 1440 candles
Fetching 2023-01-10 22:00:00 to 2023-01-11 21:59:00
  Fetched 1440 candles
Fetching 2023-01-11 22:00:00 to 2023-01-12 21:59:00
  F

In [6]:
# Verify downloaded data
for symbol in symbols:
    file_path = f"{data_path}/{symbol}_{candle_size}.csv"
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        print(f"\n{symbol}:")
        print(f"  Total candles: {len(df):,}")
        print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        print(f"  File size: {os.path.getsize(file_path):,} bytes")