In [1]:
from utils.helper_clustering_functions import KMeanClustering ,kmeans_with_smape_ts ,kmeans_with_min_distance
from utils.helper_similarity_metrics import calculate_dtw_distance , calculate_error_metrics ,calculate_cosine_similarity_char ,CoinCrossMappingSimilarity ,smape,smape_distance_metric
from utils.helper_visualization_functions import plot_and_save , cluster_visualization_of_time_series
import pandas as pd
import numpy as np
import os 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import csv


In [2]:
import csv

directory_names = {
    "preprocessed_data_dir_name":"ProcessedData",
    "visualization_data_dir_name":"VisualizationData",
    "testing_garbage_dir_name" :"TestingGarbage",
    "ResultsDirectory":"SimilarityResults",
    "cluster_dir_path":"ClusterResultsVisualization"
}
for key , value in directory_names.items():
    print(f"Creating Directories : {value}")
    os.makedirs(directory_names[key], exist_ok=True)
    
files_path = {
    'raw_price_data' : os.path.join("Datasets","raw_datasets","prices.csv"),
    'raw_token_names' : os.path.join("Datasets","raw_datasets","token_names.csv"),
    "token_names":os.path.join("TestingGarbage","token_names.csv"),
    "similarity_results_file_path":os.path.join("SimilarityResults","similarity_results_version_0.1.csv")
}

Creating Directories : ProcessedData
Creating Directories : VisualizationData
Creating Directories : TestingGarbage
Creating Directories : SimilarityResults
Creating Directories : ClusterResultsVisualization


# Helping function

In [3]:


if __name__=="__main__":
    # step-1 Reading price data 
    cols_to_ignore = ['Unnamed: 0']
    raw_price_df = pd.read_csv(
                        files_path['raw_price_data'],
                        compression='gzip',
                        usecols=lambda col: col not in cols_to_ignore)
    
    # raw_price_df =  raw_price_df.head(1000*20)
    # step-2 Reading Token data
    token_names_df = pd.read_csv(files_path['raw_token_names'] )

    # token_names_df = token_names_df.head(1000)
    number_of_unique_token =  len(token_names_df['id'].unique())
    
    print(f"Shape of price data : {raw_price_df.shape}")
    print(f"shap of token names data : {token_names_df.shape}")
    print(f"Number of Unique Token : {number_of_unique_token}")

    # step-3 Merging the data
    merged_df = raw_price_df.merge(token_names_df, left_on=['network_id', 'base_currency'], right_on=['network_id', 'id'])
    # merged_df = raw_price_df.merge(token_names_df, left_on=['base_currency'], right_on=['id'])
    
    number_of_unique_token_after_merging =  len(merged_df['base_currency'].unique())
    print(f"Number of Unique Token after merging : {number_of_unique_token_after_merging}")

    merged_df['token_id'] = merged_df['base_currency'].astype(str) + '_' + merged_df['network_id'].astype(str)
    selective_base_currency = [1166,15390,1146,15467,2012,15593,13049,16668,162796,168956]

    # selective_base_currency = list(merged_df['base_currency'].unique())[:-1]

    merged_df = merged_df[ merged_df['base_currency'].isin(selective_base_currency)  ]

    price_pivot = merged_df.pivot_table(index='timestamp_utc', columns='token_id', values='open')
    price_pivot = price_pivot.fillna(method='ffill').fillna(method='bfill')

    
    scaler = StandardScaler()
    price_scaled = scaler.fit_transform(price_pivot.T)  # Transpose so each row is a token


    # labels = KMeanClustering(n_clusters=10,
    #                 price_scaled=price_scaled,
    #                metric = "dtw",
    #                 max_iter=50)
    
    # cluster_results = pd.DataFrame({'token_id': price_pivot.columns, 'cluster': labels})

    # results_with_cluster_id = pd.merge(merged_df_filtered,cluster_results,on='token_id')

    # results_with_cluster_id['timestamp_utc'] = pd.to_datetime(results_with_cluster_id['timestamp_utc'])
    # results_with_cluster_id = results_with_cluster_id.sort_values(by='timestamp_utc')

    # cluster_visualization_of_time_series(results_with_cluster_id=results_with_cluster_id,cluster_dir_path = directory_names['cluster_dir_path'])


        
    # Perform DBSCAN clustering
    
    print("Clustering .....")
    dbscan = DBSCAN(eps=10, min_samples=2, metric=smape_distance_metric)
    
    labels = dbscan.fit_predict(price_pivot.T.values)
    
    cluster_results = pd.DataFrame({'token_id': price_pivot.columns, 'cluster': labels})
    
    results_with_cluster_id = pd.merge(merged_df,cluster_results,on='token_id')
    
    results_with_cluster_id['timestamp_utc'] = pd.to_datetime(results_with_cluster_id['timestamp_utc'])
    results_with_cluster_id = results_with_cluster_id.sort_values(by='timestamp_utc')
    
    cluster_visualization_of_time_series(results_with_cluster_id=results_with_cluster_id,cluster_dir_path = directory_names['cluster_dir_path'])
    
    price_pivot_df = results_with_cluster_id.pivot_table(index='timestamp_utc', columns='token_id', values='open')
    
    
    total_detected_similar_tokens = CoinCrossMappingSimilarity(results_with_cluster_id=results_with_cluster_id, price_pivot_df=price_pivot_df,
                                                               files_path = files_path,
                                                               directory_names=directory_names)
    
    

Shape of price data : (22021830, 4)
shap of token names data : (55787, 4)
Number of Unique Token : 55787
Number of Unique Token after merging : 16369


  price_pivot = price_pivot.fillna(method='ffill').fillna(method='bfill')


Clustering .....
Cluster ID: -1
Cluster id : [-1]
Number of tokens in this cluster: 2
file saved at path : ClusterResultsVisualization/cluster_-1.png
Cluster ID: 0
Cluster id : [0]
Number of tokens in this cluster: 2
file saved at path : ClusterResultsVisualization/cluster_0.png
Cluster ID: 1
Cluster id : [1]
Number of tokens in this cluster: 2
file saved at path : ClusterResultsVisualization/cluster_1.png
Cluster ID: 2
Cluster id : [2]
Number of tokens in this cluster: 2
file saved at path : ClusterResultsVisualization/cluster_2.png
Processing Cluster ID: -1
Number of tokens in this cluster: 2


Cluster -1:   0%|                                                                                                                                                                              | 0/2 [00:00<?, ?it/s]

Token1: 15593_2 and Token2: 1166_1


Cluster -1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.32s/it]


Processing Cluster ID: 0
Number of tokens in this cluster: 2


Cluster 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2095.58it/s]


Processing Cluster ID: 1
Number of tokens in this cluster: 2


Cluster 1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2513.06it/s]


Processing Cluster ID: 2
Number of tokens in this cluster: 2


Cluster 2: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3462.08it/s]


In [6]:
cluster_results['cluster'].unique()

array([ 0, -1,  1,  2])

In [11]:
selct_id = list(cluster_results['token_id'].unique())

In [8]:
merged_df[ merged_df['base_currency'].isin(['1146_1',13049_1]) ]

Unnamed: 0,network_id,base_currency,timestamp_utc,open,id,display_name,full_name,token_id


In [15]:
merged_df [merged_df['token_id'].isin(selct_id)]

Unnamed: 0,network_id,base_currency,timestamp_utc,open,id,display_name,full_name,token_id
2364291,1,1146,2024-01-01 00:00:00+00:00,2276.749208,1146,WETH,Wrapped Ether,1146_1
2364292,1,1146,2024-01-01 01:00:00+00:00,2289.556378,1146,WETH,Wrapped Ether,1146_1
2364293,1,1146,2024-01-01 02:00:00+00:00,2299.143545,1146,WETH,Wrapped Ether,1146_1
2364294,1,1146,2024-01-01 03:00:00+00:00,2294.922573,1146,WETH,Wrapped Ether,1146_1
2364295,1,1146,2024-01-01 04:00:00+00:00,2278.497918,1146,WETH,Wrapped Ether,1146_1
...,...,...,...,...,...,...,...,...
16761953,1,168956,2024-02-24 12:00:00+00:00,0.137452,168956,TRX,TRON,168956_1
16762089,1,13049,2024-02-24 11:00:00+00:00,102.432491,13049,SOL,Wrapped SOL (Wormhole),13049_1
16762090,1,13049,2024-02-24 12:00:00+00:00,102.364262,13049,SOL,Wrapped SOL (Wormhole),13049_1
16767848,2,16668,2024-02-24 11:00:00+00:00,102.235546,16668,SOL,SOLANA,16668_2


In [17]:
merged_df [merged_df['token_id'].isin(selct_id)][['id','display_name','full_name']].drop_duplicates()

Unnamed: 0,id,display_name,full_name
2364291,1146,WETH,Wrapped Ether
3499503,13049,SOL,Wrapped SOL (Wormhole)
4097584,168956,TRX,TRON
5235248,1166,WBTC,Wrapped BTC
6587779,15593,DOGE,Dogecoin
6711678,15467,ETH,Ethereum Token
6962070,16668,SOL,SOLANA
8746995,162796,TRX,TRON


In [18]:
price_pivot_df

token_id,1146_1,1166_1,13049_1,15467_2,15593_2,162796_2,16668_2,168956_1
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-01-01 00:00:00+00:00,2276.749208,42162.625718,100.916692,2281.253582,0.089342,0.107731,101.418230,0.108514
2024-01-01 01:00:00+00:00,2289.556378,42399.402490,101.956201,2292.825732,0.089803,0.107740,101.934136,0.108514
2024-01-01 02:00:00+00:00,2299.143545,42569.818279,103.860506,2301.390643,0.089944,0.107971,103.950880,0.108514
2024-01-01 03:00:00+00:00,2294.922573,42491.951509,103.682483,2296.545059,0.089828,0.107685,103.467932,0.108419
2024-01-01 04:00:00+00:00,2278.497918,42259.274694,103.213321,2275.818070,0.089042,0.107665,102.701461,0.108201
...,...,...,...,...,...,...,...,...
2024-02-29 20:00:00+00:00,3397.205293,61378.093237,129.589618,3392.073114,0.125975,0.141934,129.755330,0.137118
2024-02-29 21:00:00+00:00,3403.926460,61992.963496,130.680054,3402.962132,0.126949,0.141704,130.398961,0.136933
2024-02-29 22:00:00+00:00,3339.562272,61063.669207,126.776973,3338.367171,0.119694,0.140772,127.172046,0.136233
2024-02-29 23:00:00+00:00,3323.797192,61067.145910,123.808898,3324.228389,0.116662,0.139945,124.337706,0.136233
