In [1]:
from utils.helper_clustering_functions import KMeanClustering ,kmeans_with_smape_ts ,kmeans_with_min_distance
from utils.helper_similarity_metrics import calculate_dtw_distance , calculate_error_metrics ,calculate_cosine_similarity_char ,CoinCrossMappingSimilarity ,smape,smape_distance_metric
from utils.helper_visualization_functions import plot_and_save , cluster_visualization_of_time_series
import pandas as pd
import numpy as np
import os 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import csv
import os 
import glob
import tqdm
import gc

In [2]:
import csv

# Output directories used throughout the notebook, keyed by a logical name.
directory_names = {
    "preprocessed_data_dir_name": "ProcessedData",
    "visualization_data_dir_name": "VisualizationData",
    "testing_garbage_dir_name": "TestingGarbage",
    "ResultsDirectory": "SimilarityResults",
    "cluster_dir_path": "ClusterResultsVisualization",
}

# Create every output directory up front; exist_ok makes re-running the cell safe.
for dir_name in directory_names.values():
    print(f"Creating Directories : {dir_name}")
    os.makedirs(dir_name, exist_ok=True)

# Input and output file locations. Paths that live inside one of the output
# directories are derived from `directory_names`, so renaming a directory
# there cannot silently desynchronise these paths.
raw_datasets_dir = os.path.join("Datasets", "raw_datasets")
files_path = {
    'raw_price_data': os.path.join(raw_datasets_dir, "prices.csv"),
    'raw_token_names': os.path.join(raw_datasets_dir, "token_names.csv"),
    "token_names": os.path.join(directory_names["testing_garbage_dir_name"], "token_names.csv"),
    "similarity_results_file_path": os.path.join(
        directory_names["ResultsDirectory"], "similarity_results_version_0.1.csv"
    ),
}

Creating Directories : ProcessedData
Creating Directories : VisualizationData
Creating Directories : TestingGarbage
Creating Directories : SimilarityResults
Creating Directories : ClusterResultsVisualization


# Helper functions

In [3]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import csv
import gc
import glob
from tqdm import tqdm

def process_pair(token1, token2, price_pivot_df, directory_names, results_file, smape_threshold=15):
    """Score the similarity of two tokens' price series, log it, and optionally plot it.

    Each series is stripped of missing values and both are then restricted to
    the timestamps they share. If any shared timestamps remain, the project
    helpers compute the DTW distance, the character-level cosine similarity of
    the token names and the (MAE, RMSE, MAPE, SMAPE) error metrics, and one row
    is appended to ``results_file``. A comparison plot is requested only when
    SMAPE is strictly below ``smape_threshold``.

    Args:
        token1: Label of the first token; also names the per-token plot folder.
        token2: Label of the second token.
        price_pivot_df: Object that returns a pandas Series when indexed with a
            token label (e.g. a pivoted DataFrame with one column per token).
        directory_names: Mapping that contains 'visualization_data_dir_name'.
        results_file: Path of the CSV file the metrics row is appended to. The
            header is not written here; the caller is expected to create it.
        smape_threshold: Pairs whose SMAPE is below this value are plotted.
            Defaults to 15, the value that used to be hard-coded.

    Returns:
        None. Any exception is caught and printed so that one failing pair
        does not abort a whole batch.
    """
    try:
        token_folder_name = os.path.join(directory_names['visualization_data_dir_name'], token1)
        os.makedirs(token_folder_name, exist_ok=True)

        plot_filename = f"{token1}_vs_{token2}_dtw.png"
        saving_file_path = os.path.join(token_folder_name, plot_filename)

        # An existing plot marks the pair as already processed.
        # NOTE(review): pairs that were scored but not plotted leave no marker,
        # so a re-run scores them again and appends another CSV row.
        if os.path.exists(saving_file_path):
            return

        # Keep only the timestamps where both tokens have a price.
        ts1 = price_pivot_df[token1].dropna()
        ts2 = price_pivot_df[token2].dropna()
        common_index = ts1.index.intersection(ts2.index)
        if len(common_index) == 0:
            return

        ts1_common = ts1.loc[common_index]
        ts2_common = ts2.loc[common_index]

        # Similarity and error metrics (delegated to the project helpers).
        dtw_distance = calculate_dtw_distance(ts1_common, ts2_common)
        cosine_sim = calculate_cosine_similarity_char(token1, token2)
        # Named `smape_value` so it does not shadow the imported `smape` helper.
        mae, rmse, mape, smape_value = calculate_error_metrics(ts1_common, ts2_common)
        similarity_score = {
            "mae": mae,
            "rmse": rmse,
            "mape": mape,
            "smape": smape_value,
            "dtw": dtw_distance,
            "cosine_sim": cosine_sim
        }

        # Plot only sufficiently similar pairs, i.e. SMAPE *below* the threshold.
        if smape_value < smape_threshold:
            plot_and_save(ts1=ts1_common,
                          ts2=ts2_common,
                          token1=token1,
                          token2=token2,
                          similarity_score=similarity_score,
                          saving_file_path=saving_file_path)

        # Append the metrics row.
        # NOTE(review): several worker processes may append to this file at the
        # same time; nothing here serialises those writes.
        with open(results_file, mode='a', newline='') as file:
            csv.writer(file).writerow(
                [token1, token2, dtw_distance, mae, rmse, mape, smape_value, cosine_sim]
            )

        # Release the aligned series promptly.
        del ts1_common, ts2_common
        gc.collect()

    except Exception as e:
        print(f"Error processing tokens {token1} and {token2}: {e}")

def CoinCrossMappingSimilarity_multiprocessing(results_with_cluster_id=None, price_pivot_df=None, files_path=None, directory_names=None,max_workers=2):
    """Score every unordered token pair inside each cluster using worker processes.

    For each group of ``results_with_cluster_id`` sharing a 'cluster' value,
    all unordered pairs of its unique 'token_id' values are submitted to
    ``process_pair`` on a ``ProcessPoolExecutor``. The results CSV is created
    with a header row if it does not exist yet.

    Args:
        results_with_cluster_id: DataFrame with 'cluster' and 'token_id' columns.
        price_pivot_df: DataFrame with one price column per token id.
        files_path: Mapping that contains 'similarity_results_file_path'.
        directory_names: Mapping that contains 'visualization_data_dir_name'.
        max_workers: Number of worker processes per cluster.

        The first four arguments default to None only syntactically; all of
        them are required.

    Returns:
        int: Number of ``*.png`` files found one level below the visualization
        directory after processing. This counts files already on disk from
        earlier runs as well.
    """
    results_file = files_path['similarity_results_file_path']

    # Write the header once; later runs append to the existing file.
    if not os.path.exists(results_file):
        with open(results_file, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Token1', 'Token2', 'DTW_Distance', 'MAE', 'RMSE', 'MAPE', 'SMAPE', "cosine_sim"])

    # Iterate through each cluster
    for cluster_id, cluster_group in results_with_cluster_id.groupby('cluster'):
        print(f"Processing Cluster ID: {cluster_id}")
        tokens = list(cluster_group['token_id'].unique())
        print(f"Number of tokens in this cluster: {len(tokens)}")

        # Look up each token's price column once. Arguments handed to
        # executor.submit are pickled for every task, so passing the whole
        # pivot table would serialise it once per pair.
        token_prices = {
            token: price_pivot_df[token]
            for token in tokens
            if token in price_pivot_df.columns
        }

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for i, token1 in tqdm(enumerate(tokens), total=len(tokens), desc=f'Cluster {cluster_id}'):
                # Only pair with later tokens so each unordered pair is submitted once.
                for token2 in tokens[i + 1:]:
                    if token1 not in token_prices or token2 not in token_prices:
                        print(f"Error processing tokens {token1} and {token2}: no price column found")
                        continue
                    # process_pair only indexes its price argument by token
                    # label, so a two-entry mapping of column Series is enough.
                    pair_prices = {token1: token_prices[token1], token2: token_prices[token2]}
                    futures.append(
                        executor.submit(process_pair, token1, token2, pair_prices, directory_names, results_file)
                    )

            # Wait for all tasks; result() re-raises pool-level failures.
            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Token Pairs"):
                future.result()

    gc.collect()
    total_detected_similar_tokens = len(glob.glob(f"{directory_names['visualization_data_dir_name']}/*/*.png"))
    return total_detected_similar_tokens


In [None]:


# Driver cell: load prices and token metadata, cluster tokens with DBSCAN on a
# SMAPE-based distance, save per-cluster plots, then score every token pair
# inside each cluster in parallel.
if __name__=="__main__":
    # step-1 Reading price data
    # The file is read with gzip decompression forced, skipping the
    # 'Unnamed: 0' column.
    cols_to_ignore = ['Unnamed: 0']
    raw_price_df = pd.read_csv(
                        files_path['raw_price_data'],
                        compression='gzip',
                        usecols=lambda col: col not in cols_to_ignore)
    
    # raw_price_df =  raw_price_df.head(1000*20)
    # step-2 Reading Token data
    token_names_df = pd.read_csv(files_path['raw_token_names'] )

    # token_names_df = token_names_df.head(1000)
    number_of_unique_token =  len(token_names_df['id'].unique())
    
    print(f"Shape of price data : {raw_price_df.shape}")
    print(f"shap of token names data : {token_names_df.shape}")
    print(f"Number of Unique Token : {number_of_unique_token}")

    # step-3 Merging the data
    # Join on (network_id, base_currency) == (network_id, id). No `how` is
    # given, so pandas' default inner join applies and unmatched rows on either
    # side are dropped.
    merged_df = raw_price_df.merge(token_names_df, left_on=['network_id', 'base_currency'], right_on=['network_id', 'id'])
    # merged_df = raw_price_df.merge(token_names_df, left_on=['base_currency'], right_on=['id'])
    
    number_of_unique_token_after_merging =  len(merged_df['base_currency'].unique())
    
    print(f"Number of Unique Token after merging : {number_of_unique_token_after_merging}")

    # Token identifier used from here on: "<base_currency>_<network_id>".
    merged_df['token_id'] = merged_df['base_currency'].astype(str) + '_' + merged_df['network_id'].astype(str)

    # selective_base_currency = list(merged_df['base_currency'].unique())[:20]
    # merged_df = merged_df[ merged_df['base_currency'].isin(selective_base_currency)  ]

    # Wide table: one row per timestamp, one column per token, values = 'open'.
    # dropna() then removes every timestamp where any token has no price.
    price_pivot = merged_df.pivot_table(index='timestamp_utc', columns='token_id', values='open')
    price_pivot = price_pivot.dropna()

    # NOTE(review): `price_scaled` is not used by the DBSCAN step below, which
    # clusters the raw `price_pivot` values; only the commented-out KMeans call
    # consumes it. Confirm whether this step is still needed.
    scaler = StandardScaler()
    price_scaled = scaler.fit_transform(price_pivot.T)  # Transpose so each row is a token


    # Earlier KMeans experiment, kept for reference.
    # NOTE(review): it refers to `merged_df_filtered`, which is not defined in
    # this cell.
    # labels = KMeanClustering(n_clusters=10,
    #                 price_scaled=price_scaled,
    #                metric = "dtw",
    #                 max_iter=50)
    
    # cluster_results = pd.DataFrame({'token_id': price_pivot.columns, 'cluster': labels})

    # results_with_cluster_id = pd.merge(merged_df_filtered,cluster_results,on='token_id')

    # results_with_cluster_id['timestamp_utc'] = pd.to_datetime(results_with_cluster_id['timestamp_utc'])
    # results_with_cluster_id = results_with_cluster_id.sort_values(by='timestamp_utc')

    # cluster_visualization_of_time_series(results_with_cluster_id=results_with_cluster_id,cluster_dir_path = directory_names['cluster_dir_path'])


        
    # Perform DBSCAN clustering
    # One sample per token (rows of the transposed pivot), compared with the
    # project's `smape_distance_metric`. DBSCAN labels noise points -1, so the
    # "-1" group downstream is not a real cluster.
    # NOTE(review): eps=100 and min_samples=3 are unexplained magic numbers.
    dbscan = DBSCAN(eps=100, min_samples=3, metric=smape_distance_metric)
    
    labels = dbscan.fit_predict(price_pivot.T.values)
    
    # One row per clustered token: its id and its DBSCAN label.
    cluster_results = pd.DataFrame({'token_id': price_pivot.columns, 'cluster': labels})
    
    # Attach the cluster label to every price row (default inner join on token_id).
    results_with_cluster_id = pd.merge(merged_df,cluster_results,on='token_id')
    
    results_with_cluster_id['timestamp_utc'] = pd.to_datetime(results_with_cluster_id['timestamp_utc'])
    results_with_cluster_id = results_with_cluster_id.sort_values(by='timestamp_utc')
    
    cluster_visualization_of_time_series(results_with_cluster_id=results_with_cluster_id,cluster_dir_path = directory_names['cluster_dir_path'])
    
    # Rebuild the wide price table from the labelled rows. No dropna() is
    # applied here; missing values are handled per pair in process_pair.
    price_pivot_df = results_with_cluster_id.pivot_table(index='timestamp_utc', columns='token_id', values='open')


    # Score all token pairs within each cluster using two worker processes.
    total_similar_tokens = CoinCrossMappingSimilarity_multiprocessing(results_with_cluster_id, price_pivot_df, files_path, directory_names, max_workers=2)
    print(f"Total detected similar tokens: {total_similar_tokens}")

    

Shape of price data : (22021830, 4)
shap of token names data : (1000, 4)
Number of Unique Token : 1000
Number of Unique Token after merging : 541
Cluster ID: -1
Cluster id : [-1]
Number of tokens in this cluster: 5
file saved at path : ClusterResultsVisualization/cluster_-1.png
Cluster ID: 0
Cluster id : [0]
Number of tokens in this cluster: 15
file saved at path : ClusterResultsVisualization/cluster_0.png
Processing Cluster ID: -1
Number of tokens in this cluster: 5


Cluster -1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 87.17it/s]
Processing Token Pairs: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.24s/it]


Processing Cluster ID: 0
Number of tokens in this cluster: 15


Cluster 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 327.18it/s]
Processing Token Pairs:  58%|███████████████████████████████████████████████████████████████████████████████████████▋                                                               | 61/105 [01:35<01:13,  1.66s/it]