In [159]:
# Install python-binance library and import crypto data from Binance API.

In [160]:
from binance.client import Client
import pandas as pd
import config

In [161]:
api_key = config.api_key
api_secret = config.api_secret

In [162]:
client = Client(api_key, api_secret)

In [163]:
# Fetch the data

import requests

# Function to get top 100 cryptocurrencies from CoinGecko
def get_top_200_cryptos():
    url = 'https://api.coingecko.com/api/v3/coins/markets'
    params = {
        'vs_currency': 'usd',
        'order': 'market_cap_desc',
        'per_page': 200,
        'page': 1,
        'sparkline': False
    }
    response = requests.get(url, params=params)
    data = response.json()
    top_200_cryptos = {coin['name']: coin['symbol'].upper() + 'USDT' for coin in data}
    return top_200_cryptos

# Function to fetch historical data from Binance
def fetch_historical_data(symbol, start_date):
    interval = Client.KLINE_INTERVAL_1DAY
    klines = client.get_historical_klines(symbol, interval, start_date)
    df = pd.DataFrame(klines, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_asset_volume',
        'number_of_trades', 'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    return df

In [164]:
# Main execution

start_date = "1 January 2019"
top_200_cryptos = get_top_200_cryptos()
historical_data = {}

for name, symbol in top_200_cryptos.items():
    try:
        df = fetch_historical_data(symbol, start_date)
        historical_data[name] = df
        print(f"Fetched data for {name} ({symbol})")
    except Exception as e:
        print(f"Could not fetch data for {name} ({symbol}): {e}")

# Print sample data from the first cryptocurrency
first_crypto_name = next(iter(historical_data))
print(historical_data[first_crypto_name].head())

Fetched data for Bitcoin (BTCUSDT)
Fetched data for Ethereum (ETHUSDT)
Could not fetch data for Tether (USDTUSDT): APIError(code=-1121): Invalid symbol.
Fetched data for BNB (BNBUSDT)
Fetched data for Solana (SOLUSDT)
Fetched data for USDC (USDCUSDT)
Could not fetch data for Lido Staked Ether (STETHUSDT): APIError(code=-1121): Invalid symbol.
Fetched data for XRP (XRPUSDT)
Could not fetch data for Toncoin (TONUSDT): APIError(code=-1121): Invalid symbol.
Fetched data for Dogecoin (DOGEUSDT)
Fetched data for Cardano (ADAUSDT)
Fetched data for Shiba Inu (SHIBUSDT)
Fetched data for Avalanche (AVAXUSDT)
Fetched data for TRON (TRXUSDT)
Fetched data for Wrapped Bitcoin (WBTCUSDT)
Fetched data for Chainlink (LINKUSDT)
Fetched data for Bitcoin Cash (BCHUSDT)
Fetched data for Polkadot (DOTUSDT)
Fetched data for NEAR Protocol (NEARUSDT)
Fetched data for Polygon (MATICUSDT)
Fetched data for Litecoin (LTCUSDT)
Fetched data for Internet Computer (ICPUSDT)
Fetched data for Uniswap (UNIUSDT)
Fetched d

In [165]:
len(historical_data.keys())

140

In [166]:
from datetime import datetime

count = 0
for df in historical_data.values():
    first_cell = df.iloc[0, 0]
    print(type(first_cell), first_cell)  # Debugging output
    if isinstance(first_cell, datetime):
        first_cell = first_cell.strftime('%Y-%m-%d %H:%M:%S')
    if first_cell == '2019-01-01 00:00:00':
        count += 1
count

<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2020-08-11 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-07-05 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2021-05-10 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2020-09-22 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2023-04-28 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-01-16 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 2019-11-28 00:00:00
<class

16

In [179]:
# Count how many crypto we will have with changing start datetime.

count = 0
matching_keys = []
threshold_date = datetime.strptime('2021-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')

for key, df in historical_data.items():
    first_cell = df.iloc[0, 0]
    # Convert first cell to datetime if it is not already
    if isinstance(first_cell, str):
        first_cell = datetime.strptime(first_cell, '%Y-%m-%d %H:%M:%S')
    elif isinstance(first_cell, pd.Timestamp):
        first_cell = first_cell.to_pydatetime()

    if first_cell > threshold_date:
        count += 1
        matching_keys.append(key)

print("Count:", count)
print("Matching keys:", matching_keys)


Count: 0
Matching keys: []


In [180]:
import pandas as pd
from datetime import datetime

# Set the threshold dates for filtering and trimming
threshold_start_date = datetime.strptime('2021-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
threshold_end_date = datetime.strptime('2024-05-15 00:00:00', '%Y-%m-%d %H:%M:%S')

# Step 1: Filter DataFrames that do not start after the threshold date
filtered_historical_data = {}
for key, df in historical_data.items():
    if not df.empty:
        first_cell = df.iloc[0, 0]
        if (isinstance(first_cell, str) and datetime.strptime(first_cell, '%Y-%m-%d %H:%M:%S') <= threshold_start_date) or (isinstance(first_cell, pd.Timestamp) and first_cell.to_pydatetime() <= threshold_start_date):
            filtered_historical_data[key] = df

# Step 2: Trim DataFrames to start from the threshold date if they have older data
for key, df in filtered_historical_data.items():
    if not df.empty:
        # Ensure the first column is a datetime object
        if isinstance(df.iloc[0, 0], str):
            df.iloc[:, 0] = pd.to_datetime(df.iloc[:, 0])

        # Filter rows to start from the threshold date
        filtered_historical_data[key] = df[df.iloc[:, 0] >= threshold_start_date]

# Step 3: Remove DataFrames with incorrect start or end dates or length not equal to 1231
final_historical_data = {}
for key, df in filtered_historical_data.items():
    if not df.empty:
        start_date = df.iloc[0, 0]
        end_date = df.iloc[-1, 0]
        if start_date == threshold_start_date and end_date == threshold_end_date and len(df) == 1231:
            final_historical_data[key] = df

# Update the original dictionary to be the final filtered one
historical_data = final_historical_data

# Output the remaining keys to confirm
print("Remaining keys:", list(historical_data.keys()))
print(len(historical_data.keys()))

# Optional: Print start and end dates and length of each DataFrame to verify
for key, df in historical_data.items():
    if not df.empty:
        print(f'{key} - start: {df.iloc[0, 0]} to end: {df.iloc[-1, 0]}, length: {len(df)}')


Remaining keys: []
0


In [175]:
for df in historical_data.values():
    print(len(df))

In [176]:
# Optional: Print start and end dates of each DataFrame to verify
for key, df in historical_data.items():
    print(f'{key} - start: {df.iloc[0, 0]} to end: {df.iloc[-1, 0]}')

In [177]:
# Assuming you have a dictionary of DataFrames where each DataFrame represents data for a coin
# The dictionary keys represent the coin names

# Initialize an empty dictionary to store integral of paths for each coin
daily_percent_change_dict = {}

# Compute the daily change for each coin
for coin, df in historical_data.items():
    daily_percent_changes_list = []  # Create a new list for each coin
    for i in range(len(df)-1):
        # Assuming the 5th column is index 4
        daily_change = (float(df.iloc[i+1, 4]) - float(df.iloc[i, 4])) / float(df.iloc[i, 4]) * 100
        daily_percent_changes_list.append(round(daily_change, 2))  # Round to 2 decimal places
    daily_percent_change_dict[coin] = daily_percent_changes_list

In [178]:
daily_percent_change_dict['Solana']

KeyError: 'Solana'

In [None]:
daily_percent_change_dict['BNB']

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Assuming daily_change_dict is your dictionary with coin names as keys and daily changes list as values

# Determine the maximum length of daily change lists
max_length = max(len(changes) for changes in daily_percent_change_dict.values())

# Pad or truncate daily change lists to the maximum length
padded_daily_changes = []
for changes in daily_percent_change_dict.values():
    padded_changes = changes + [0] * (max_length - len(changes))  # Pad with zeros
    # Alternatively, you can truncate longer lists by uncommenting the line below
    # padded_changes = changes[:max_length]
    padded_daily_changes.append(padded_changes)

# Prepare data
coins = list(daily_percent_change_dict.keys())
X = np.array(padded_daily_changes)  # Convert values to numpy array

# Choose the number of clusters (k)
k = 5  # You can adjust this number based on your data and requirements

# Initialize and fit KMeans model
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)

# Get cluster labels
cluster_labels = kmeans.labels_

# Print clustering results
for i in range(k):
    cluster_coins = [coins[j] for j in range(len(coins)) if cluster_labels[j] == i]
    print(f'Cluster {i+1}: {cluster_coins}')
