In [1]:
import pandas as pd
import numpy as np
from helper_functions import Helper

In [2]:
def cleanCrosswalk(cw_df: pd.DataFrame) -> pd.DataFrame:
    """ Cleans a crosswalk DataFrame by ensuring that both asset IDs are unique, 
    keeping only the two id columns (in a fixed order), sorting by the
    'asset_cm' column, and resetting the index.

    Args:
        cw_df: A pandas DataFrame representing the crosswalk. Must contain the
            columns 'asset_cm' and 'asset_cg'; any other columns are dropped.

    Returns:
        A new pandas DataFrame with columns ['asset_cm', 'asset_cg'], sorted by
        'asset_cm', with a fresh 0..n-1 index. The input is not modified.

    Raises:
        AssertionError: If either of the asset IDs are not unique.
        KeyError: If either id column is missing.
    """
    # Ensure both asset ids are unique. Raised explicitly (rather than via
    # `assert`) so the validation is not stripped when running with -O.
    for id_col in ('asset_cg', 'asset_cm'):
        if not cw_df[id_col].is_unique:
            raise AssertionError(f"{id_col} must be unique")

    # Keep only the id columns, in a fixed order
    cw_df = cw_df[['asset_cm', 'asset_cg']]

    # Sort by cm asset name and reset index
    cw_df = cw_df.sort_values(by='asset_cm').reset_index(drop=True)

    return cw_df

In [3]:
def _fillAssetDateGaps(asset_df: pd.DataFrame, asset, max_gap_days: int = 31) -> pd.DataFrame:
    """Insert rows for short runs of missing days in one asset's history and forward fill.

    Args:
        asset_df: Rows for a single asset, with a timezone-naive datetime
            'date' column and an 'asset_cg' column.
        asset: The asset id written into 'asset_cg' for the inserted rows.
        max_gap_days: Largest gap, in whole days between two consecutive
            observations, that is filled. Longer gaps are left as they are.

    Returns:
        A new DataFrame sorted by date with a fresh 0..n-1 index, in which
        every inserted day carries the values of the most recent earlier row.
    """
    # Gaps are only meaningful between chronologically consecutive rows, so
    # order the rows first instead of trusting the caller's row order.
    asset_df = asset_df.sort_values(by='date', kind='stable')
    dates = asset_df['date'].to_numpy()

    # Whole days between each observation and the next one
    gap_days = np.diff(dates).astype('timedelta64[D]').astype(int)
    fillable = (gap_days > 1) & (gap_days <= max_gap_days)

    # For a gap of g days starting at `start`, add start+1d ... start+(g-1)d
    new_dates = [start + np.timedelta64(24 * step, 'h')
                 for start, gap in zip(dates[:-1][fillable], gap_days[fillable])
                 for step in range(1, int(gap))]

    # Only concatenate when there is something to add: an empty frame would
    # contribute an object-dtype 'date' column to the dtype resolution.
    if new_dates:
        new_rows = pd.DataFrame({'date': new_dates, 'asset_cg': asset})
        asset_df = pd.concat((asset_df, new_rows))

    asset_df = asset_df.sort_values(by='date', kind='stable', ignore_index=True)

    # Inserted rows are all-missing apart from date/asset; carry values forward
    return asset_df.ffill()


def cleanPanel(df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """ Cleans the raw asset-date panel.

    Steps, in order: drop 'alexa_rank' and rename known columns; take the
    absolute value of the GitHub code deletions; parse 'date' to timezone-naive
    datetimes; forward fill 'usd_mcap_cg' within each asset (in date order);
    cast every column other than 'date' and 'asset_cg' to float32; validate ids
    against the crosswalk; restrict to 2016-07-01 through 2023-01-02; drop rows
    at or above the price/volume/mcap thresholds; within each asset, replace
    missing values with zero in columns that have at least one observation;
    insert rows for date gaps of 2 to 31 days and forward fill them; order the
    columns and suffix every non-'date' column with '_cg'.

    Args:
        df: The raw panel. Must contain 'date', 'asset_cg', 'alexa_rank',
            'usd_per_token_cg', 'usd_volume_cg', 'usd_mcap_cg' and
            'code_deletions_4_weeks'. The input is not modified.
        cw_df: The crosswalk; its 'asset_cg' column lists the valid asset ids.

    Returns:
        A new pandas DataFrame sorted by date then asset with a fresh index and
        columns ['date', 'asset_cg', 'usd_per_token_cg', 'usd_mcap_cg',
        'usd_volume_24h_cg'] followed by the remaining columns alphabetically.

    Raises:
        AssertionError: If a date or asset id is missing, the crosswalk ids are
            not unique, an asset id is absent from the crosswalk, a price,
            volume or mcap value is missing, any numeric value is negative, or
            a (date, asset) pair is duplicated.
        KeyError: If a required column is missing.
    """
    id_cols = ['date', 'asset_cg']
    lead_cols = ['usd_per_token_cg', 'usd_mcap_cg', 'usd_volume_24h_cg']
    start_date, end_date = '2016-07-01', '2023-01-02'

    # Fix columns with known issues
    df = df.drop(columns='alexa_rank')
    # NOTE(review): the 'pull_requests_contributors' key is left exactly as it
    # was. collapsePanel refers to 'pull_request_contributors_cg', which
    # suggests the raw column is spelled with a singular 'request' and that
    # this entry never matches - confirm against the raw data before changing
    # either side.
    df = df.rename(columns={'usd_volume_cg': 'usd_volume_24h_cg',
                            'forks': 'github_forks',
                            'stars': 'github_stars',
                            'subscribers': 'github_subscribers',
                            'total_issues': 'github_total_issues',
                            'closed_issues': 'github_closed_issues',
                            'pull_requests_merged': 'github_pull_requests_merged',
                            'pull_requests_contributors': 'github_pull_requests_contributors',
                            'code_additions_4_weeks': 'github_code_additions_4_weeks',
                            'code_deletions_4_weeks': 'github_code_deletions_4_weeks',
                            'commit_count_4_weeks': 'github_commit_count_4_weeks'})
    df['github_code_deletions_4_weeks'] = np.abs(df['github_code_deletions_4_weeks'])

    # Convert date column to datetime format and remove timezone information
    df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_localize(None)

    # Put each asset's rows in chronological order so that the forward fill
    # below carries the previous day's value rather than the previous row's.
    df = df.sort_values(by=['asset_cg', 'date'], kind='stable')
    df['usd_mcap_cg'] = df.groupby('asset_cg')['usd_mcap_cg'].ffill()

    # Convert all non-id columns to float32
    cols = [col for col in df.columns if col not in id_cols]
    df = df.astype({col: 'float32' for col in cols})

    # Confirm no missing dates nor asset ids
    assert 0 == df['date'].isnull().sum(), "date must not be missing"
    assert 0 == df['asset_cg'].isnull().sum(), "asset_cg must not be missing"

    # Confirm all asset ids are in cw
    assert cw_df['asset_cg'].is_unique, "crosswalk asset_cg must be unique"
    assert df['asset_cg'].isin(cw_df['asset_cg'].values).all(), \
        "every asset_cg must appear in the crosswalk"

    # Cut down to relevant dates between July 1, 2016 and January 2, 2023
    df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # For price, volume, and mcap columns, ensure no missing obs and below thresholds
    thresholds = {'usd_per_token_cg': 1e9, 'usd_volume_24h_cg': 1e11, 'usd_mcap_cg': 1e13}
    for col, thresh in thresholds.items():
        assert 0 == df[col].isnull().sum(), f"{col} must not be missing"
        df = df[df[col] < thresh]

    # Work on an independent frame so the assignments below do not write into
    # a filtered view of an earlier frame.
    df = df.copy()

    # For each asset_cg, replace missing values with zero if there's at least
    # one non-missing observation in the column
    for col in cols:
        has_nonmissing = df[col].notna().groupby(df['asset_cg']).transform('any')
        df.loc[has_nonmissing, col] = df.loc[has_nonmissing, col].fillna(0)

    # Ensure no negative numbers
    assert 0 == (df[cols] < 0).sum().sum(), "numeric columns must be non-negative"

    # For every asset, add any missing days and forward fill them. Frames are
    # collected and concatenated once instead of growing a DataFrame per asset.
    frames = [_fillAssetDateGaps(asset_df, asset)
              for asset, asset_df in df.groupby('asset_cg', sort=True)]
    if frames:
        df = pd.concat(frames, ignore_index=True)

    # Set column order: ids, then price/mcap/volume, then the rest alphabetically
    other_cols = sorted(col for col in cols if col not in lead_cols)
    df = df[id_cols + lead_cols + other_cols]

    # Ensure all columns have cg in name
    df.columns = [col if col == 'date' or col.endswith('_cg') else col + '_cg' for col in df.columns]

    # Ensure no duplicates by date and asset
    assert not df.duplicated(subset=['date', 'asset_cg']).any(), \
        "duplicate (date, asset_cg) rows"

    # Sort by date then asset and reset index
    df = df.sort_values(by=['date', 'asset_cg']).reset_index(drop=True)

    return df

In [4]:
def collapsePanel(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the GitHub and the Reddit activity columns of `df` into one
    activity column each.

    Missing values are first replaced with zero. Then, for each group, every
    member column is passed through `Helper.xsecNormalizeToMinusOneOne`, the
    row-wise mean of the members is stored in the group's activity column,
    that column is normalized the same way, and the member columns are dropped.

    NOTE(review): `Helper.xsecNormalizeToMinusOneOne` is defined outside this
    file; its name suggests a cross-sectional rescaling to [-1, 1] - confirm
    against the helper's implementation.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the columns to be collapsed and an
        'asset_cg' column.

    Returns
    -------
    pd.DataFrame
        The DataFrame with 'github_activity_cg' and 'reddit_activity_cg' in
        place of the individual GitHub and Reddit columns.
    """
    id_col = 'asset_cg'

    # activity column to create -> the columns that are folded into it
    activity_groups = (
        ('github_activity_cg', [
            'github_closed_issues_cg',
            'github_code_additions_4_weeks_cg',
            'github_code_deletions_4_weeks_cg',
            'github_commit_count_4_weeks_cg',
            'github_forks_cg',
            'github_pull_requests_merged_cg',
            'github_stars_cg',
            'github_subscribers_cg',
            'github_total_issues_cg',
            'pull_request_contributors_cg',
        ]),
        ('reddit_activity_cg', [
            'reddit_accounts_active_48h_cg',
            'reddit_average_comments_48h_cg',
            'reddit_average_posts_48h_cg',
            'reddit_subscribers_cg',
        ]),
    )

    # treat missing observations as zero
    panel = df.fillna(0)

    for activity_col, member_cols in activity_groups:
        # normalize each member, average them, then normalize the average
        for member in member_cols:
            panel = Helper.xsecNormalizeToMinusOneOne(panel, member, id_col)
        panel[activity_col] = panel[member_cols].mean(axis=1)
        panel = Helper.xsecNormalizeToMinusOneOne(panel, activity_col, id_col)

        # the members are now summarized by the activity column
        panel = panel.drop(columns=member_cols)

    return panel

In [5]:
if __name__ == "__main__":
    # file locations: raw inputs are read from, derived outputs are written to
    PANEL_IN_FP = '../data/raw/coingecko_panel.pkl'
    PANEL_OUT_FP = '../data/derived/cg_panel.pkl'
    CW_IN_FP = '../data/raw/coingecko_coinmetrics_cw.pkl'
    CW_OUT_FP = '../data/derived/cg_cm_cw.pkl'

    # load the crosswalk and the panel
    cw_df = pd.read_pickle(CW_IN_FP)
    df = pd.read_pickle(PANEL_IN_FP)

    # the crosswalk is cleaned first because the panel is validated against it;
    # the cleaned panel then has its activity columns collapsed
    cw_df = cleanCrosswalk(cw_df)
    df = collapsePanel(cleanPanel(df, cw_df))

    # write the derived crosswalk and panel
    cw_df.to_pickle(CW_OUT_FP)
    df.to_pickle(PANEL_OUT_FP)