In [None]:
import requests
import zipfile
import polars as pl
import io
import gc
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional


In [None]:
class DataLoader_:
    """Gère le téléchargement des données Binance"""
    
    def __init__(self, base_url: str, period: str = "monthly",
                 target: str = "aggTrades", data_dir: str = "./data"):
        self.base_url = base_url
        self.period = period
        self.target = target
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(exist_ok=True)
    
    def _get_filename(self, symbol: str, date: datetime) -> str:
        """Construit le nom de fichier selon le period"""
        if self.period == "monthly":
            return f'{symbol}-{self.target}-{date.year}-{date.month:02d}.csv'
        elif self.period == "daily":
            return f'{symbol}-{self.target}-{date.year}-{date.month:02d}-{date.day:02d}.csv'
        else:
            raise ValueError(f"Period invalide: {self.period}")
    
    def _get_url(self, symbol: str, date: datetime) -> str:
        """Construit l'URL de téléchargement"""
        filename_base = self._get_filename(symbol, date).replace('.csv', '.zip')
        return f"{self.base_url}/{self.period}/{self.target}/{symbol}/{filename_base}"
    
    def _download(self, url: str) -> bytes | None:
        """Télécharge un fichier ZIP"""
        try:
            r = requests.get(url, timeout=30)
            return r.content if r.status_code == 200 else None
        except (requests.RequestException, TimeoutError):
            return None
    
    def _unzip(self, content: bytes, symbol: str, date: datetime) -> Path | None:
        """Décompresse un fichier ZIP et retourne le chemin du CSV"""
        try:
            with zipfile.ZipFile(io.BytesIO(content)) as z:
                z.extractall(self.data_dir)
            
            csv_file = self.data_dir / self._get_filename(symbol, date)
            return csv_file if csv_file.exists() else None
        except Exception as e:
            print(f"Erreur décompression: {e}")
            return None
    
    def load(self, symbol: str, date: datetime) -> Path | None:
        """Charge un fichier (check existing → download → unzip)"""
        # Check existing
        csv_file = self.data_dir / self._get_filename(symbol, date)
        if csv_file.exists():
            return csv_file
        
        # Download
        url = self._get_url(symbol, date)
        content = self._download(url)
        if not content:
            return None
        
        # Unzip
        return self._unzip(content, symbol, date)

In [None]:

class DataLoader:
    """Gère le téléchargement des données Binance"""
    
    def __init__(self, base_url: str, period: str = "monthly",
                 target: str = "aggTrades", data_dir: str = "./data"):
        self.base_url = base_url
        self.period = period
        self.target = target
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(exist_ok=True)
        
        # Fichier JSON pour tracker les téléchargements
        self.tracker_file = self.data_dir / "downloaded_files.json"
        self.downloaded = self._load_tracker()
    
    def _load_tracker(self) -> dict:
        """Charge le fichier JSON de tracking"""
        if self.tracker_file.exists():
            with open(self.tracker_file, 'r') as f:
                return json.load(f)
        return {}
    
    def _save_tracker(self):
        """Sauvegarde le fichier JSON de tracking"""
        with open(self.tracker_file, 'w') as f:
            json.dump(self.downloaded, f, indent=2)
    
    def _mark_downloaded(self, symbol: str, date: datetime):
        """Marque un fichier comme téléchargé"""
        key = f"{symbol}_{date.year}-{date.month:02d}"
        self.downloaded[key] = {
            'symbol': symbol,
            'year': date.year,
            'month': date.month,
            'period': self.period,
            'downloaded_at': datetime.now().isoformat()
        }
        self._save_tracker()
    
    def _is_downloaded(self, symbol: str, date: datetime) -> bool:
        """Vérifie si un fichier a déjà été téléchargé"""
        key = f"{symbol}_{date.year}-{date.month:02d}"
        return key in self.downloaded
    
    def _get_filename(self, symbol: str, date: datetime) -> str:
        """Construit le nom de fichier selon le period"""
        if self.period == "monthly":
            return f'{symbol}-{self.target}-{date.year}-{date.month:02d}.csv'
        elif self.period == "daily":
            return f'{symbol}-{self.target}-{date.year}-{date.month:02d}-{date.day:02d}.csv'
        else:
            raise ValueError(f"Period invalide: {self.period}")
    
    def _get_url(self, symbol: str, date: datetime) -> str:
        """Construit l'URL de téléchargement"""
        filename_base = self._get_filename(symbol, date).replace('.csv', '.zip')
        return f"{self.base_url}/{self.period}/{self.target}/{symbol}/{filename_base}"
    
    def _download(self, url: str) -> bytes | None:
        """Télécharge un fichier ZIP"""
        try:
            r = requests.get(url, timeout=30)
            return r.content if r.status_code == 200 else None
        except (requests.RequestException, TimeoutError):
            return None
    
    def _unzip(self, content: bytes, symbol: str, date: datetime) -> Path | None:
        """Décompresse un fichier ZIP et retourne le chemin du CSV"""
        try:
            with zipfile.ZipFile(io.BytesIO(content)) as z:
                z.extractall(self.data_dir)
            
            csv_file = self.data_dir / self._get_filename(symbol, date)
            return csv_file if csv_file.exists() else None
        except Exception as e:
            print(f"Erreur décompression: {e}")
            return None
    
    def load(self, symbol: str, date: datetime) -> Path | None:
        """Charge un fichier (check tracker → check file → download → unzip)"""
        
        # 1. Check si déjà téléchargé selon le tracker
        if self._is_downloaded(symbol, date):
            # Vérifier quand même si le fichier physique existe
            csv_file = self.data_dir / self._get_filename(symbol, date)
            if csv_file.exists():
                return csv_file
            # Si le fichier n'existe plus mais est dans le tracker, on continue le téléchargement
        
        # 2. Download
        url = self._get_url(symbol, date)
        content = self._download(url)
        if not content:
            return None
        
        # 3. Unzip
        csv_file = self._unzip(content, symbol, date)
        
        # 4. Marquer comme téléchargé
        if csv_file:
            self._mark_downloaded(symbol, date)
        
        return csv_file
    
    

class BarMethods:
    """Aggregate a trade-level frame into OHLCV bars.

    The aggregated frame is built during construction and exposed as
    ``self.df_agg``; the input frame is kept as ``self.df_raw``.
    """

    def __init__(self, df: pl.DataFrame,
                 bar_params: dict = None):
        """Store the inputs and build the bars.

        Args:
            df: Frame with the columns ``timestamp``, ``price``,
                ``quantity``, ``buy_volume`` and ``sell_volume``.
            bar_params: Bar settings. Only the ``'params'`` key (window
                length, e.g. ``'5m'``) is read here; ``'type'`` is not
                consulted and time bars are always produced. Defaults to
                ``{'type': 'time', 'params': '5m'}``.
        """
        # A fresh dict per instance: a dict literal in the signature would be
        # one shared object reachable (and mutable) through every instance.
        if bar_params is None:
            bar_params = {'type': 'time', 'params': '5m'}
        self.df_raw = df
        self.bar_params = bar_params
        self._time_bar()

    def _time_bar(self):
        """Build fixed-duration bars on ``timestamp`` into ``self.df_agg``.

        Per window: open/high/low/close from ``price``, ``volume`` from
        ``quantity``, summed buy/sell volumes, then ``delta`` (buy - sell),
        ``total_volume`` (buy + sell), ``imbalance`` (delta / total_volume)
        and ``cvd`` (running sum of delta).
        """
        # NOTE(review): group_by_dynamic presumably needs `timestamp` sorted
        # ascending; confirm the input order upstream.
        self.df_agg = (
            self.df_raw
            .group_by_dynamic('timestamp', every=self.bar_params.get('params', '5m'))
            .agg([
                pl.col('price').first().alias('open'),
                pl.col('price').max().alias('high'),
                pl.col('price').min().alias('low'),
                pl.col('price').last().alias('close'),
                pl.col('quantity').sum().alias('volume'),
                pl.col('buy_volume').sum().alias('buy_volume'),
                pl.col('sell_volume').sum().alias('sell_volume'),
            ])
            .with_columns([
                (pl.col('buy_volume') - pl.col('sell_volume')).alias('delta'),
                (pl.col('buy_volume') + pl.col('sell_volume')).alias('total_volume')
            ])
            # Second pass because these depend on the columns created above.
            .with_columns([
                (pl.col('delta') / pl.col('total_volume')).alias('imbalance'),
                pl.col('delta').cum_sum().alias('cvd')
            ])
        )

class DataProcessor:
    """Process, aggregate and save trade data.

    Reads a raw CSV, derives per-trade columns, aggregates them into bars
    with ``BarMethods``, writes the bars to Parquet in ``data_dir`` and then
    deletes the source CSV.
    """

    def __init__(self, data_dir: str,
                 bar_params: dict = None):
        """Store the settings and make sure ``data_dir`` exists.

        Args:
            data_dir: Folder receiving the Parquet output.
            bar_params: Settings forwarded to ``BarMethods``; its
                ``'params'`` value is also used in the output file name.
                Defaults to ``{'type': 'time', 'params': '5m'}``.
        """
        # A fresh dict per instance instead of one shared default object.
        if bar_params is None:
            bar_params = {'type': 'time', 'params': '5m'}
        self.data_dir = Path(data_dir)
        # parents=True: accept a nested data_dir that does not exist yet.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.bar_params = bar_params

    def _preprocess(self, df: pl.DataFrame) -> pl.DataFrame:
        """Add the derived per-trade columns to the raw frame.

        Reads ``transact_time``, ``price``, ``quantity`` and
        ``is_buyer_maker``. Adds ``timestamp`` (``transact_time`` read as
        epoch milliseconds), ``buy_volume`` (quantity where
        ``is_buyer_maker`` is false), ``sell_volume`` (quantity where it is
        true), ``dollar_volume`` (price * quantity) and running totals of
        quantity and dollar volume.
        """
        return df.with_columns([
            pl.from_epoch(pl.col('transact_time'), time_unit='ms').alias('timestamp'),
            pl.col('price').cast(pl.Float64),
            pl.col('quantity').cast(pl.Float64),
            pl.when(~pl.col('is_buyer_maker')).then(pl.col('quantity')).otherwise(0).alias('buy_volume'),
            pl.when(pl.col('is_buyer_maker')).then(pl.col('quantity')).otherwise(0).alias('sell_volume'),
            (pl.col('price') * pl.col('quantity')).alias('dollar_volume')
        ]).with_columns([
            # Second pass because dollar_volume is created in the first one.
            pl.col('quantity').cum_sum().alias('cumulative_volume'),
            pl.col('dollar_volume').cum_sum().alias('cumulative_dollar')
        ])

    def _clear(self, filepath: Path, *objects):
        """Delete the source CSV and run a garbage-collection pass.

        ``objects`` is accepted for backward compatibility only. Unbinding
        them here cannot free anything while the caller still holds its own
        references, so callers must ``del`` their names before calling.
        """
        # Remove the source CSV; a file that is already gone is not an error.
        filepath.unlink(missing_ok=True)
        gc.collect()

    def execute(self, filepath: Path, symbol: str, date: datetime):
        """Run the pipeline: read, preprocess, aggregate, save, clear.

        The output is ``{symbol}_{bar}_{YYYY-MM}.parquet`` in ``data_dir``.
        The name carries year and month only, so runs for different days of
        one month write to the same file. ``filepath`` is deleted at the end
        of a successful run.
        """
        # Read
        df = pl.read_csv(filepath, has_header=True)

        # Preprocess
        df_prep = self._preprocess(df)

        # Aggregate
        bar_methods = BarMethods(df_prep, bar_params=self.bar_params)
        df_agg = bar_methods.df_agg

        # Save
        bar_str = self.bar_params.get('params', '5m')
        filename = self.data_dir / f"{symbol}_{bar_str}_{date.year}-{date.month:02d}.parquet"
        df_agg.write_parquet(filename)
        print(f"→ {filename.name} ({len(df_agg)} barres)")

        # Clear: drop our own references first so the collection pass in
        # _clear can actually reclaim the frames.
        del df, df_prep, df_agg, bar_methods
        self._clear(filepath)

In [None]:
# Settings for the single-file run defined in this cell.
SYMBOL = "BTCUSDT"
date = datetime(year=2024, month=1, day=1)
DATA_DIR = "./data"
BASE_URL = "https://data.binance.vision/data/futures/um"



def run_one(symbol, date, data_dir=DATA_DIR, bar_params=None, base_url=BASE_URL):
    """Download one monthly aggTrades file and turn it into bars.

    Args:
        symbol: Trading pair, e.g. ``"BTCUSDT"``.
        date: Any datetime inside the month to fetch.
        data_dir: Folder used for the download and for the Parquet output.
        bar_params: Bar settings passed to ``DataProcessor``; defaults to
            ``{'type': 'time', 'params': '5m'}``.
        base_url: Root URL of the data repository.

    Prints the CSV path and processes it when the file could be loaded,
    otherwise prints a "not available" message. Returns nothing.
    """
    # Resolve the default here, not in the signature, so calls never share
    # one mutable dict.
    if bar_params is None:
        bar_params = {'type': 'time', 'params': '5m'}

    loader = DataLoader(base_url=base_url,
                        period="monthly", target="aggTrades", data_dir=data_dir)
    csv_file = loader.load(symbol=symbol, date=date)
    if csv_file:
        print(csv_file)
        processor = DataProcessor(data_dir=data_dir, bar_params=bar_params)
        processor.execute(filepath=csv_file, symbol=symbol, date=date)
    else:
        print("✗ Fichier non disponible")

    

In [None]:
from datetime import datetime
from pathlib import Path
import gc

# ----------------------------------------------------------------------------
# Configuration
# SYMBOLS, START and END are defined here but not used by the code below.
# ----------------------------------------------------------------------------
SYMBOLS = ['BTCUSDT', 'ETHUSDT']
START = datetime(2024, 1, 1)
END = datetime(2024, 12, 31)

# ----------------------------------------------------------------------------
# Initialisation: settings for the single file processed in this cell
# ----------------------------------------------------------------------------
BASE_URL = "https://data.binance.vision/data/futures/um"
DATA_DIR = "./data"
SYMBOL = "BTCUSDT"
date = datetime(2024, 1, 1)

# Fetch the monthly aggTrades CSV, downloading it if necessary.
loader = DataLoader(
    base_url=BASE_URL,
    period="monthly",
    target="aggTrades",
    data_dir=DATA_DIR,
)
csv_file = loader.load(SYMBOL, date)

if not csv_file:
    print("✗ Fichier non disponible")
else:
    print(csv_file)
    # Aggregate into 5-minute bars and write the Parquet file.
    processor = DataProcessor(data_dir=DATA_DIR, bar_params={'type': 'time', 'params': '5m'})
    processor.execute(filepath=csv_file, symbol=SYMBOL, date=date)


data\BTCUSDT-aggTrades-2024-01.csv
→ BTCUSDT_5m_2024-01.parquet (8928 barres)
