In [25]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import time
import os

class CryptoDataFetcher:
    def __init__(self):
        self.base_url = "https://api.binance.com/api/v3/klines"
        self.trading_pairs = ["BTCUSDT", "ETHUSDT", "SOLUSDT", "BNBUSDT"]
        self.interval = "1m"
        self.output_dir = "crypto_data"
        self.chunk_size = timedelta(days=1)  # Fetch 1 day at a time
        self.max_retries = 3
        
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def get_historical_data(self, symbol, start_time, end_time):
        all_data = []
        current_time = start_time
        
        while current_time < end_time:
            chunk_end = min(current_time + self.chunk_size, end_time)
            
            params = {
                "symbol": symbol,
                "interval": self.interval,
                "startTime": int(current_time.timestamp() * 1000),
                "endTime": int(chunk_end.timestamp() * 1000),
                "limit": 1000
            }
            
            retry_count = 0
            
            while retry_count < self.max_retries:
                try:
                    response = requests.get(self.base_url, params=params)
                    response.raise_for_status()
                    data = response.json()
                    
                    if data:
                        all_data.extend(data)
                        break
                    else:
                        print(f"No data received for {symbol} at {current_time}")
                        break
                        
                except requests.exceptions.RequestException as e:
                    retry_count += 1
                    if retry_count == self.max_retries:
                        print(f"Failed to fetch data for {symbol} after {self.max_retries} retries: {str(e)}")
                        return pd.DataFrame()
                    time.sleep(2 ** retry_count)  # Exponential backoff
            
            current_time = chunk_end
            time.sleep(1)  # Respect rate limits
        
        if not all_data:
            return pd.DataFrame()
            
        # Convert to DataFrame with clear column names
        df = pd.DataFrame(all_data, columns=[
            'timestamp', 'open', 'high', 'low', 'close', 
            'volume', 'close_time', 'quote_volume',
            'trades', 'taker_buy_volume', 'taker_buy_quote_volume', 'ignore'
        ])
        
        # Keep only the columns we need
        df = df[['timestamp', 'open', 'high', 'low', 'close', 'volume', 'trades']]
        
        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        
        # Convert numeric columns to float
        numeric_columns = ['open', 'high', 'low', 'close', 'volume']
        for col in numeric_columns:
            df[col] = df[col].astype(float)
        
        # Convert trades to integer
        df['trades'] = df['trades'].astype(int)
            
        return df

    def fetch_and_save_data(self, start_date_str, end_date_str):
        start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
        end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
        current_date = start_date
        
        while current_date <= end_date:
            print(f"Fetching data for {', '.join(self.trading_pairs)} on {current_date.strftime('%Y-%m-%d')}")
            
            for symbol in self.trading_pairs:
                try:
                    # Fetch data for the symbol
                    df = self.get_historical_data(symbol, current_date, current_date + self.chunk_size)
                    
                    if df.empty:
                        continue
                    
                    # Check for missing timestamps
                    expected_timestamps = pd.date_range(start=current_date, end=current_date + self.chunk_size, freq='1min')
                    actual_timestamps = set(df['timestamp'])
                    missing_timestamps = [ts for ts in expected_timestamps if ts not in actual_timestamps]
                    
                    if missing_timestamps:
                        print(f"Warning: Found {len(missing_timestamps)} missing timestamps for {symbol}")
                        print(f"First few missing timestamps: {missing_timestamps[:5]}")
                    
                    # Save to a single file per trading pair
                    filename = f"{symbol}.csv"
                    filepath = os.path.join(self.output_dir, filename)
                    df.to_csv(filepath, index=False, mode='a', header=not os.path.exists(filepath))
                    print(f"Appended {len(df)} rows to {filename}")
                
                except Exception as e:
                    print(f"Error processing data for {symbol} on {current_date.strftime('%Y-%m-%d')}: {str(e)}")
            
            current_date += self.chunk_size
            time.sleep(1)  # Respect rate limits

def main():
    fetcher = CryptoDataFetcher()
    
    # Get yesterday's date as end date to ensure complete data
    end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    
    print("Starting data fetch...")
    fetcher.fetch_and_save_data("2022-01-01", end_date)
    print("Data fetch complete!")


In [26]:
if __name__ == "__main__":
    main()

Starting data fetch...
Fetching data for BTCUSDT, ETHUSDT, SOLUSDT, BNBUSDT on 2022-01-01
First few missing timestamps: [Timestamp('2022-01-01 13:40:00'), Timestamp('2022-01-01 13:41:00'), Timestamp('2022-01-01 13:42:00'), Timestamp('2022-01-01 13:43:00'), Timestamp('2022-01-01 13:44:00')]
Appended 1000 rows to BTCUSDT.csv
First few missing timestamps: [Timestamp('2022-01-01 13:40:00'), Timestamp('2022-01-01 13:41:00'), Timestamp('2022-01-01 13:42:00'), Timestamp('2022-01-01 13:43:00'), Timestamp('2022-01-01 13:44:00')]
Appended 1000 rows to ETHUSDT.csv
First few missing timestamps: [Timestamp('2022-01-01 13:40:00'), Timestamp('2022-01-01 13:41:00'), Timestamp('2022-01-01 13:42:00'), Timestamp('2022-01-01 13:43:00'), Timestamp('2022-01-01 13:44:00')]
Appended 1000 rows to SOLUSDT.csv
First few missing timestamps: [Timestamp('2022-01-01 13:40:00'), Timestamp('2022-01-01 13:41:00'), Timestamp('2022-01-01 13:42:00'), Timestamp('2022-01-01 13:43:00'), Timestamp('2022-01-01 13:44:00')]
Appe