# In [3]:
import requests
import zipfile
import polars as pl
import io
import gc
from datetime import datetime, timedelta
from pathlib import Path

# ============================================================================
# CONFIGURATION
# ============================================================================
# Root URL that archive download paths are built from (no trailing slash).
BASE_URL = "https://data.binance.vision/data/futures/um"
# Symbols processed for every month of the date range.
SYMBOLS = ['BTCUSDT', 'ETHUSDT']
# Inclusive bounds of the date range; the script advances month by month
# starting from START while the current month start is <= END.
START = datetime(2024, 1, 1)
END = datetime(2024, 12, 31)
# Directory that receives the generated Parquet files.
OUTPUT_DIR = Path("./data")

# parents=True keeps this working if OUTPUT_DIR is later set to a nested path.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# In [None]:
class DataLoader:
    """Gère le téléchargement des données Binance"""
    
    def __init__(self, base_url: str):
        self.base_url = base_url
        self.fail = {'url' : []}
    
    def _load_zip(self, url: str) -> bytes | None:
        try:
            r = requests.get(url, timeout=30)
            return r.content if r.status_code == 200 else None
        except e as Exception:
            self.fail['url'].append(url)
            return None
    
    def unzip_file(self, content: bytes, extract_to: Path) -> bool:
        """Décompresse un fichier ZIP"""
        try:
            with zipfile.ZipFile(io.BytesIO(content)) as z:
                z.extractall(extract_to)
            return True
        except:
            return False
        
    def download_data(self, symbol: str, date: datetime) -> bool:
        """Télécharge et décompresse les données pour un symbole et une date donnés"""
        year = date.year
        month = date.month
        day = date.day
        
        file_name = f'{symbol}-aggTrades-{year}-{month:02d}.zip'
        url = f"{url}/{file_name}"
        
    


# In [None]:

# ============================================================================
# CLASSE PROCESSOR
# ============================================================================

class DataProcessor:
    """Preprocess raw trade rows, aggregate them into time bars and save them."""

    def __init__(self, output_dir: Path, bar_window: str = '5m'):
        """Remember where to write and which bar width to use.

        Args:
            output_dir: Directory that receives the Parquet files.
            bar_window: Window width passed to ``group_by_dynamic``.
        """
        self.output_dir = output_dir
        self.bar_window = bar_window

    def preprocess(self, df: pl.DataFrame) -> pl.DataFrame:
        """Add a timestamp, cast price/quantity and derive volume columns.

        Reads the columns ``transact_time``, ``price``, ``quantity`` and
        ``is_buyer_maker``. Adds ``timestamp``, ``buy_volume`` (quantity where
        ``is_buyer_maker`` is false), ``sell_volume`` (quantity where it is
        true) and ``dollar_volume`` (price * quantity).
        """
        # Expressions inside one with_columns call all read the *input*
        # frame, so the casts must land first for the derived columns below
        # to be computed from Float64 values.
        typed = df.with_columns([
            pl.from_epoch(pl.col('transact_time'), time_unit='ms').alias('timestamp'),
            pl.col('price').cast(pl.Float64),
            pl.col('quantity').cast(pl.Float64),
        ])
        is_maker = pl.col('is_buyer_maker')
        quantity = pl.col('quantity')
        return typed.with_columns([
            pl.when(~is_maker).then(quantity).otherwise(0.0).alias('buy_volume'),
            pl.when(is_maker).then(quantity).otherwise(0.0).alias('sell_volume'),
            (pl.col('price') * quantity).alias('dollar_volume'),
        ])

    def aggregate(self, df: pl.DataFrame) -> pl.DataFrame:
        """Aggregate preprocessed rows into bars of width ``self.bar_window``.

        Each bar carries open/high/low/close, total ``volume``, ``buy_volume``
        and ``sell_volume``, ``delta`` (buy - sell), ``imbalance``
        (delta / volume) and ``cvd`` (running sum of delta over the frame
        passed in).
        """
        return (
            df
            # group_by_dynamic needs an ascending index column; the stable
            # sort keeps the original order of rows sharing a timestamp so
            # first()/last() still pick the same open/close.
            .sort('timestamp', maintain_order=True)
            .group_by_dynamic('timestamp', every=self.bar_window)
            .agg([
                pl.col('price').first().alias('open'),
                pl.col('price').max().alias('high'),
                pl.col('price').min().alias('low'),
                pl.col('price').last().alias('close'),
                pl.col('quantity').sum().alias('volume'),
                pl.col('buy_volume').sum().alias('buy_volume'),
                pl.col('sell_volume').sum().alias('sell_volume'),
            ])
            .with_columns([
                (pl.col('buy_volume') - pl.col('sell_volume')).alias('delta'),
            ])
            .with_columns([
                (pl.col('delta') / pl.col('volume')).alias('imbalance'),
                pl.col('delta').cum_sum().alias('cvd'),
            ])
        )

    def save(self, df: pl.DataFrame, symbol: str, year: int, month: int):
        """Write ``df`` as ``<symbol>_<window>_<year>-<month>.parquet``."""
        # Make the class usable even if the caller did not create the folder.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        filename = self.output_dir / f"{symbol}_{self.bar_window}_{year}-{month:02d}.parquet"
        df.write_parquet(filename)
        print(f"→ {filename.name} ({len(df)} barres)")

    def clear(self, *objects):
        """Run a garbage-collection pass.

        ``objects`` is accepted for backward compatibility. A callee can only
        drop its own references, so the objects are freed only once every
        caller has dropped its references as well.
        """
        del objects
        gc.collect()

    def process(self, df: pl.DataFrame, symbol: str, year: int, month: int):
        """Run the full pipeline: preprocess -> aggregate -> save -> clear."""
        df_prep = self.preprocess(df)
        df_agg = self.aggregate(df_prep)
        # The intermediate frame is no longer needed once the bars exist.
        del df_prep
        self.save(df_agg, symbol, year, month)
        # Drop this frame's references before collecting; the caller's own
        # reference to ``df`` still has to go out of scope on its side.
        del df, df_agg
        self.clear()

# ============================================================================
# EXÉCUTION
# ============================================================================

if __name__ == "__main__":
    # Walk the configured date range one calendar month at a time and, for
    # each symbol, load the month and hand it to the processor.
    processor = DataProcessor(OUTPUT_DIR, bar_window='5m')
    loader = DataLoader(BASE_URL)

    current = START

    while current <= END:
        year, month = current.year, current.month

        for symbol in SYMBOLS:
            print(f"\n{symbol} {year}-{month:02d}:", end=" ")

            df = loader.load_month(symbol, year, month)
            if df is None:
                # Nothing available for this symbol/month.
                print("skip")
                continue

            processor.process(df, symbol, year, month)

        # First day of the next month; December rolls over into January of
        # the following year (12 // 12 == 1, 12 % 12 + 1 == 1).
        current = datetime(year + month // 12, month % 12 + 1, 1)

    print("\n\n✓ Terminé !")