In [1]:
import pandas as pd
import os
import glob 
import matplotlib.pyplot as plt
import pickle

In [2]:
def read_crypto_data(path):
    """
    Load one crypto metrics csv file into a dataframe using a fixed column type scheme

    Parameters:
    path (string): Path to a csv file

    Returns:
    dataframe: Content of the csv file, with 'Date' read as object and the listed metric columns read as float
    """
    #Metric columns that are always parsed as float
    float_columns = [
        'Price',
        'Daily Active Addresses',
        'Price Volatility 1w',
        'RSI 1d',
        'Exchange Flow Balance',
        'Percent of Stablecoin Total Supply held by Whales with more than 5 million USD',
        'Whale Transaction Count (>1m USD)',
        'Age Consumed',
        'Circulation (90d)',
        'The Ratio of Daily On-Chain Transaction Volume in Profit to Loss',
        'Mean Coin Age',
        'Mean Dollar Invested Age',
        'MVRV Long/Short Difference',
        'MVRV Ratio (Z score)',
        'Percent of Total Supply in Profit',
    ]

    #'Date' is kept as a plain object column; it is not parsed into timestamps at this stage
    column_types = {'Date': object}
    column_types.update(dict.fromkeys(float_columns, float))

    return pd.read_csv(path, dtype=column_types)

In [3]:
def read_dir_of_csv(dictionary, path):
    """
    This function reads all the csv files in a given directory. Each file is stored in the given dictionary, where the key is the filename without its type extension and the value is the content of the csv file as a pandas dataframe.

    Only the final extension is removed from the filename, so 'eth.v1.csv' is stored under the key 'eth.v1'.

    Parameters:
    dictionary (dict): Target dictionary for the data to be stored (modified in place)
    path (string): Path of the directory where the csv files can be found

    Returns:
    dict: The same dictionary, with the filenames as keys and the content of the according csv as dataframes
    """

    #use glob to retrieve the csv files in the given folder; sorted so the insertion order is reproducible
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

    #loop over the list of csv files
    for file in csv_files:

        #read the csv file
        df = read_crypto_data(file)

        #strip the directory and only the last extension; splitting on the first '.' would
        #truncate names that contain dots and let different files overwrite each other
        filename = os.path.splitext(os.path.basename(file))[0]

        #save dataframe in dictionary with filename as key
        dictionary[filename] = df

    return dictionary

In [4]:
def drop_nan_prices(df):
    """
    Remove every row of the given dataframe whose 'Price' value is NaN. The dataframe is modified in place.

    Parameter:
    df (dataframe): the dataframe whose rows without a price are to be deleted

    Returns:
    df (dataframe): the same dataframe object, without the rows that had a NaN price
    """

    #only the price column decides whether a row is kept
    required_columns = ["Price"]
    df.dropna(subset=required_columns, inplace=True)

    return df

In [5]:
def clean_dfs_prices(dictionary):
    """
    Apply drop_nan_prices to every dataframe stored in a dictionary

    Parameters:
    dictionary (dict): dictionary containing dataframes

    Returns:
    dictionary (dict): the same dictionary, after drop_nan_prices has been called on each of its dataframes
    """

    #the return value of drop_nan_prices is not stored, so the entries are not reassigned here
    for frame in dictionary.values():
        drop_nan_prices(frame)

    return dictionary

In [6]:
def convert_timestamps(dictionary):
    """
    Turn the 'Date' column of every dataframe into day-level timestamps and use it as the index

    The 'Date' column of each stored dataframe is overwritten, then the dictionary entry is replaced by a dataframe that is indexed by 'Date'.

    Parameters:
    dictionary (dictionary): all crypto currencies in a given directory

    Returns:
    dictionary (dict): the same dictionary, containing dataframes indexed by the preprocessed timestamps
    """

    #loop over currencies
    for currency in dictionary:
        frame = dictionary[currency]

        #parse the raw values, then round-trip through a 'YYYY-MM-DD' string so that only the day is kept
        parsed = pd.to_datetime(frame['Date'])
        day_strings = parsed.dt.strftime('%Y-%m-%d')
        frame['Date'] = pd.to_datetime(day_strings)

        #set_index returns a new dataframe, which replaces the dictionary entry
        dictionary[currency] = frame.set_index('Date')

    return dictionary

In [7]:
def get_horizons(prices, delta):
    """
    For every index entry, look up the index entry that lies the given delta ahead

    Parameters:
    prices (series or dataframe): data whose index is searched; the lookup assumes a sorted index
    delta: offset that is added to the index (presumably a pd.Timedelta for a timestamp index)

    Returns:
    horizon (series): the horizon index entries, indexed by the leading entries of the original index. Entries whose shifted value lies beyond the end of the index have no horizon and are left out
    """

    timeline = prices.index

    #position of the first index entry that is not smaller than each shifted entry
    positions = timeline.searchsorted(timeline + delta)

    #positions equal to the length of the data point past the last entry and are discarded
    reachable = positions[positions < prices.shape[0]]
    targets = timeline[reachable]

    #pair the remaining horizons with the first entries of the index
    return pd.Series(targets, index=timeline[:targets.shape[0]])

In [8]:
def assign_horizons_old(dictionary, delta):
    """
    Add a 'horizon' column to every currency dataframe and drop all rows that contain missing values afterwards

    Parameters:
    dictionary (dictionary): all crypto currencies in a given directory
    delta (pd.Timedelta): Timedelta of the window to calculate returns

    Returns:
    dictionary (dict): the same dictionary, containing new dataframes with an extra column for horizons
    """

    #loop over currencies
    for currency, frame in dictionary.items():
        horizons = get_horizons(frame, delta)
        with_horizon = frame.assign(horizon=horizons)

        #dropna removes every row with a missing value in any column, not only in 'horizon'
        dictionary[currency] = with_horizon.dropna()

    return dictionary

In [9]:
def assign_horizons(dictionary, delta):
    """
    Add a 'horizon' column, computed by get_horizons, to every currency dataframe

    Parameters:
    dictionary (dictionary): all crypto currencies in a given directory
    delta (pd.Timedelta): Timedelta of the window to calculate returns

    Returns:
    dictionary (dict): the same dictionary, containing new dataframes with an extra column for horizons
    """

    #loop over currencies
    for currency, frame in dictionary.items():
        horizons = get_horizons(frame, delta)

        #assign returns a new dataframe, which replaces the dictionary entry
        dictionary[currency] = frame.assign(horizon=horizons)

    return dictionary

In [10]:
def allign_data_tail_old(data, delta):

    """
    Cap every horizon that lies behind the last index entry at that last index entry. The dataframe is modified in place.

    Parameters:
    data (dataframe): dataframe containing the prices and horizons of the asset
    delta (integer): not used by this function

    Returns:
    data (dataframe): the same dataframe with adjusted tail
    """

    last_label = data.index[-1]

    for label, record in data.iterrows():
        #horizons that do not exceed the last index entry stay untouched
        if not record["horizon"] > last_label:
            continue
        data.loc[label, "horizon"] = last_label

    return data

In [11]:
def allign_data_tail(data, delta):

    """
    This function alligns the data tail to the horizon in order for the calculations not to fail.
    Every row whose index lies behind the entry at position -(delta + 1) gets the last index entry as its horizon. The dataframe is modified in place.

    An empty dataframe is returned unchanged. If delta is at least as large as the number of rows, the tail covers the whole dataframe and every horizon is set to the last index entry.

    Parameters:
    data (dataframe): dataframe containing the prices and horizons of the asset
    delta (integer): number of positions at the end of the dataset whose horizon needs to be adjusted

    Returns:
    data (dataframe): the same dataframe with adjusted tail
    """

    #nothing to allign, and data.index[-1] would raise an IndexError
    if len(data.index) == 0:
        return data

    last_index = data.index[-1]

    #there is no row in front of the tail, so data.index[-(delta + 1)] would raise an IndexError
    if delta >= len(data.index):
        data["horizon"] = last_index
        return data

    max_index = data.index[-(delta + 1)]

    #label based comparison, identical to the row by row check 'index > max_index'
    in_tail = data.index > max_index

    #skip the assignment when no row qualifies, so a missing 'horizon' column is not created
    if in_tail.any():
        data.loc[in_tail, "horizon"] = last_index

    return data

In [12]:
def allign_all_tails(dictionary, delta):
    """
    Run allign_data_tail on the dataframe of every currency and store the result under the same key

    Parameters:
    dictionary (dictionary): all crypto currencies in a given directory
    delta (int): number of postions in the tail to be alligned

    Returns:
    dictionary (dict): the same dictionary with dataframes containing modified tails
    """

    #loop over currencies
    for currency, frame in dictionary.items():
        alligned = allign_data_tail(frame, delta)
        dictionary[currency] = alligned

    return dictionary

In [13]:
def save_dictionary(path, data):
    """
    This function saves an object to a file with pickle. The directory of the file is created if it does not exist yet.

    Parameters:
    path (string): Path of the file to be written
    data (object): Object to be pickled

    Returns:
    None
    """

    # Ensure the directory exists. A bare filename has an empty dirname,
    # and os.makedirs('') raises a FileNotFoundError, so that case is skipped
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(data, file)

In [14]:
def load_dictionary(path):
    """
    This function loads a pickled object from a file

    Parameters:
    path (string): Path of the pickle file to be read

    Returns:
    object: The unpickled content of the file
    """
    #NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    with open(path, 'rb') as source:
        return pickle.load(source)