In [36]:
import numpy as np
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor
import datetime as dt
import os
from functools import reduce
from fetch_symbols import get_symbols


class Data:
    def __init__(self, symbols, interval, start_time, end_time):
        self.symbols = symbols
        self.interval = interval
        self.start_time = start_time
        self.end_time = end_time
        self.available_symbols = self.binance_symbols()
        self.df = self.get_data()

    def binance_symbols(self):
        """Fetch available symbols from Binance API."""
        response = requests.get("https://api.binance.com/api/v3/exchangeInfo")
        exchange_info = response.json()
        valid_symbols = {s['symbol'] for s in exchange_info['symbols']}
        return [s for s in self.symbols if s in valid_symbols]

    def fetch_symbol_data(self, symbol, date_list, url, limit):
        """Fetch kline data for a single symbol."""
        all_data = []
        for i in range(len(date_list) - 1):
            params = {
                'symbol': symbol,
                'interval': self.interval,
                'startTime': int(date_list[i].timestamp() * 1000),
                'endTime': int((date_list[i + 1] - dt.timedelta(seconds=1)).timestamp() * 1000),
                'limit': limit,
            }
            response = requests.get(url, params=params)
            data = response.json()
            if isinstance(data, list):
                all_data.extend(data)
        return symbol, all_data

    def get_binance_klines(self, limit=1000):
        """Fetch historical kline data for all symbols in parallel."""
        url = "https://api.binance.com/api/v3/klines"
        date_list = pd.date_range(start=self.start_time, end=self.end_time, freq='D').tolist()

        # Use ThreadPoolExecutor for parallel fetching
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = executor.map(
                lambda symbol: self.fetch_symbol_data(symbol, date_list, url, limit),
                self.available_symbols,
            )

        # Process and combine results
        data_frames = {}
        for symbol, data in results:
            if not data:
                continue
            df = pd.DataFrame(data)
            df = df.iloc[:, 0:6]
            df.columns = ['Open Time', 'open', 'high', 'low', 'close', 'volume']
            df.index = pd.to_datetime(df['Open Time'], unit='ms')
            df.drop('Open Time', axis=1, inplace=True)
            data_frames[symbol] = df

        if not data_frames:
            return None

        combined_df = pd.concat(data_frames, axis=1)
        combined_df = combined_df.swaplevel(axis=1).sort_index(axis=1)
        combined_df = combined_df.apply(pd.to_numeric, errors='coerce')

        return combined_df

    def prepare_data(self, df):
        """Prepare data for analysis."""
        _df = df.copy()
        for coin in df.columns.levels[1]:
            _df['returns', coin] = _df['close', coin].pct_change()
            _df['log_return', coin] = np.log(_df['returns', coin] + 1)
            _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
            _df['price', coin] = _df['close', coin]
            _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]

        df = _df.stack(future_stack=True)
        df.sort_index(axis=1, inplace=True)
        df.index.names = ['date', 'coin']
        df.dropna(inplace=True)

        return df

    def upload_data(self, df, filename):
        """Save data to a CSV file."""
        df.to_csv(filename)

    def get_data(self):
        """Main function to fetch, prepare, and save data."""
        df = self.get_binance_klines()
        if df is not None:
            df = self.prepare_data(df)
            self.upload_data(df, 'data.csv')
        return df
    



class CSV_Data:
    def __init__(self, folder_path, symbols):
        self.folder_path = folder_path
        self.symbols = symbols
        self.df = self.process_folder(folder_path, symbols)
        self.df = self.prepare_data()
        self.upload_data_to_csv(self.df)
        
    
    def prepare_data(self):
        df = self.df.copy()
        for coin in df.columns.levels[1]:
            df['returns', coin] = df['close', coin].pct_change()
            df['log_return', coin] = np.log(df['returns', coin])
            df["creturns", coin] = df["log_return", coin].cumsum().apply(np.exp)
            df['price', coin] = df['close', coin]
            df['volume_in_dollars', coin] = df['close', coin] * df['volume', coin]

        df = df.stack(level=1, future_stack=True)
        df.sort_index(axis=1, inplace=True)
        df.index.names = ['date', 'coin']
        df.dropna(inplace=True)

        return df
    
    def get_data(self, file_path, symbols):
        df = pd.read_csv(file_path)
        df = df.drop(columns = df.columns[-1]).reset_index()
        df.drop(columns = df.columns[0], inplace = True)
        df.drop(index = 0, inplace = True)
        df.columns = ['date', 'coin', 'open', 'high', 'low', 'close', 'volume', 'volume_in_dollars']

        if not df['coin'].iloc[0] in symbols:
            return
        # Clean the date column by stripping whitespace
        df['date'] = df['date'].str.strip()
        # Parse the date column with mixed format
        df['date'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')
        
        df.set_index([df.columns[0], df.columns[1]], inplace = True)
        df = df.unstack()
        return df
    
    def process_folder(self, folder_path, symbols):
        # Get all CSV files in the folder
        csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
        
        dfs = []
        
        for file in csv_files:
            file_path = os.path.join(folder_path, file)
            df = self.get_data(file_path, symbols)
            if df is not None:
                dfs.append(df)
        

        # Get the union of all indices (dates) to align the data
        all_dates = reduce(pd.Index.union, [df.index.get_level_values(0) for df in dfs])

        # Reindex all DataFrames to the same set of dates (adding NaNs where data is missing)
        dfs_aligned = [df.reindex(all_dates, level=0, fill_value=None) for df in dfs]

        # Concatenate all DataFrames
        concatenated_df = pd.concat(dfs_aligned, axis=1)
        concatenated_df = concatenated_df.sort_index(axis=1)
        concatenated_df = concatenated_df.apply(pd.to_numeric, errors='coerce', downcast='float') #Essential to perform calculations
        
        return concatenated_df

    def upload_data_to_csv(self, df):
        # Upload the data to CSV file
        df.to_csv('all_data.csv')
    

    



# Example usage: download one week of hourly klines for every known symbol.
# symbols = ['BTCUSD', 'ETHUSD']
symbols = get_symbols()
# Append 'T' to every symbol name (e.g. 'BTCUSD' -> 'BTCUSDT').
updated_symbols = [s + 'T' for s in symbols]
interval = '1h'
start_time = dt.datetime(year=2020, month=1, day=1)
end_time = dt.datetime(year=2020, month=1, day=7)
# Constructing Data runs the whole pipeline; the prepared frame is in .df.
df = Data(
    symbols=updated_symbols,
    interval=interval,
    start_time=start_time,
    end_time=end_time,
).df
print(df)


#Use the below for uploading full data (uploaded to csv)
# symbols = get_symbols()
# binance_symbols = Data(symbols)
# folder_path = r'C:\Users\yassi\OneDrive\Documents\Trading\Algo Trading Projects\Algo Business\data\Binance Data (CSV)'
# df = CSV_Data(folder_path, symbols).df

  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coi

                                  close  creturns       high  log_return  \
date                coin                                                   
2020-01-01 01:00:00 ADAUSDT    0.032990  1.006406   0.033030    0.006386   
                    ALGOUSDT   0.217500  1.003692   0.217500    0.003685   
                    ANKRUSDT   0.001452  1.008333   0.001462    0.008299   
                    ATOMUSDT   4.290000  1.017070   4.292000    0.016926   
                    BANDUSDT   0.221600  1.000903   0.221600    0.000903   
...                                 ...       ...        ...         ...   
2020-01-06 23:00:00 WINUSDT    0.000095  1.028139   0.000095    0.004219   
                    XMRUSDT   58.910000  1.321741  59.270000    0.008695   
                    XTZUSDT    1.304700  0.967591   1.305300    0.012106   
                    ZECUSDT   32.860000  1.197958  32.970000    0.012864   
                    ZRXUSDT    0.195800  1.082366   0.197300    0.009236   

           

In [48]:
# Re-run the pipeline for a single day, reusing the symbols and interval
# defined in the earlier cell.
start_time = dt.datetime(year=2020, month=1, day=1)
end_time = dt.datetime(year=2020, month=1, day=2)
df = Data(
    symbols=updated_symbols,
    interval=interval,
    start_time=start_time,
    end_time=end_time,
).df
df

  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coin]
  _df['returns', coin] = _df['close', coin].pct_change()
  _df['log_return', coin] = np.log(_df['returns', coin] + 1)
  _df["creturns", coin] = _df["log_return", coin].cumsum().apply(np.exp)
  _df['price', coin] = _df['close', coin]
  _df['volume_in_dollars', coin] = _df['close', coin] * _df['volume', coi

Unnamed: 0_level_0,Unnamed: 1_level_0,close,creturns,high,log_return,low,open,price,returns,volume,volume_in_dollars
date,coin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-01 01:00:00,ADAUSDT,0.032990,1.006406,0.033030,0.006386,0.032760,0.032770,0.032990,0.006406,1.560752e+06,51489.201882
2020-01-01 01:00:00,ALGOUSDT,0.217500,1.003692,0.217500,0.003685,0.216700,0.216800,0.217500,0.003692,2.287337e+04,4974.957975
2020-01-01 01:00:00,ANKRUSDT,0.001452,1.008333,0.001462,0.008299,0.001440,0.001440,0.001452,0.008333,9.391210e+05,1363.603692
2020-01-01 01:00:00,ATOMUSDT,4.290000,1.017070,4.292000,0.016926,4.223000,4.223000,4.290000,0.017070,1.918447e+04,82301.359140
2020-01-01 01:00:00,BANDUSDT,0.221600,1.000903,0.221600,0.000903,0.218600,0.219900,0.221600,0.000903,2.713665e+04,6013.481640
...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01 23:00:00,WINUSDT,0.000093,1.006494,0.000093,-0.001075,0.000093,0.000093,0.000093,-0.001074,1.461924e+09,135958.940928
2020-01-01 23:00:00,XMRUSDT,45.820000,1.028046,45.880000,-0.000436,45.760000,45.850000,45.820000,-0.000436,6.313433e+02,28928.150006
2020-01-01 23:00:00,XTZUSDT,1.370000,1.016019,1.374500,-0.002187,1.365200,1.372500,1.370000,-0.002185,2.710982e+04,37140.453400
2020-01-01 23:00:00,ZECUSDT,28.070000,1.023332,28.100000,0.000000,27.980000,28.060000,28.070000,0.000000,1.933481e+03,54272.799881
