In [1]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
def loading(path: str, name: str) -> pd.DataFrame:
    """
    Load a price CSV and keep only the date, close price and volume.

    The file must provide ``Date``, ``Close`` and ``Volume`` columns; every
    other column (``Adj Close`` included, when present) is discarded.

    args:
        - path: path to the CSV file
        - name: name of the index; used as the column name of the close
          price and as the prefix of the derived columns
    return:
        - pandas DataFrame with columns ``date``, ``<name>``,
          ``<name>_vol``, ``<name>_chg`` and ``<name>_vol_chg``. The two
          ``*_chg`` columns hold the percentage change versus the previous
          row, rounded to 2 decimals, with the first row set to 0.
    raises:
        - KeyError: if ``Close`` or ``Volume`` is missing from the file
    """

    renaming_dict = {
        'Date': 'date', 'Close': name, 'Volume': name + '_vol'
    }

    df = pd.read_csv(path, parse_dates=['Date']).rename(columns=renaming_dict)

    # Selecting the renamed columns already discards everything else, so no
    # separate drop of 'Adj Close' is needed (and files without it still load).
    df = df[list(renaming_dict.values())].copy()

    # Text-encoded numbers such as "1,234.5" must be converted BEFORE the
    # percentage changes are derived: pct_change() cannot divide strings.
    for col in df.columns[1:]:
        try:
            cleaned = df[col].str.replace(',', '', regex=False)
        except AttributeError:
            # Column is not text (already numeric): leave it untouched.
            continue
        df[col] = cleaned.astype('double')

    df[name+'_chg'] = df[name].pct_change().fillna(0).multiply(100).round(2)
    df[name+'_vol_chg'] = df[name+'_vol'].pct_change().fillna(0).multiply(100).round(2)

    return df

In [None]:
def loading_v2(path: str, name: str) -> pd.DataFrame:
    """
    Load an intraday price CSV keyed by a ``Datetime`` column.

    Same output layout as a daily load, but the timestamp comes from the
    ``Datetime`` column and any timezone information is stripped from it.

    args:
        - path: path to the CSV file; must provide ``Datetime``, ``Close``
          and ``Volume`` columns (other columns are discarded)
        - name: name of the index; used as the column name of the close
          price and as the prefix of the derived columns
    return:
        - pandas DataFrame with columns ``date`` (timezone-naive),
          ``<name>``, ``<name>_vol``, ``<name>_chg`` and ``<name>_vol_chg``.
          The two ``*_chg`` columns hold the percentage change versus the
          previous row, rounded to 2 decimals, with the first row set to 0.
    raises:
        - KeyError: if ``Close`` or ``Volume`` is missing from the file
    """
    renaming_dict = {
        'Datetime': 'date', 'Close': name, 'Volume': name + '_vol'
    }

    df = pd.read_csv(path, parse_dates=['Datetime']).rename(columns=renaming_dict)

    # Selecting the renamed columns already discards everything else, so no
    # separate drop of 'Adj Close' is needed (and files without it still load).
    df = df[list(renaming_dict.values())].copy()

    # Remove the timezone while keeping the wall-clock time as written.
    # NOTE(review): this relies on read_csv having produced a datetime column;
    # confirm behaviour for files whose rows mix different UTC offsets.
    df['date'] = df['date'].dt.tz_localize(None)

    # Text-encoded numbers such as "1,234.5" must be converted BEFORE the
    # percentage changes are derived: pct_change() cannot divide strings.
    for col in df.columns[1:]:
        try:
            cleaned = df[col].str.replace(',', '', regex=False)
        except AttributeError:
            # Column is not text (already numeric): leave it untouched.
            continue
        df[col] = cleaned.astype('double')

    df[name+'_chg'] = df[name].pct_change().fillna(0).multiply(100).round(2)
    df[name+'_vol_chg'] = df[name+'_vol'].pct_change().fillna(0).multiply(100).round(2)

    return df

In [None]:
def loading_v3(path: str) -> pd.DataFrame:
    """
    Load a price CSV, prefer the adjusted close, and add change columns.

    When the file has an ``Adj Close`` column it replaces ``Close`` (the raw
    ``Close`` column is dropped and ``Adj Close`` is renamed to ``Close``).
    All other columns are kept as they are.

    args:
        - path: path to the CSV file (or to an archive pandas can read);
          must provide ``Volume`` and ``Close`` and/or ``Adj Close``
    return:
        - pandas DataFrame with the file's columns plus ``chg`` and
          ``vol_chg``: the percentage change of ``Close`` and ``Volume``
          versus the previous row, rounded to 2 decimals, first row set to 0.
          The first column is left exactly as read from the file.
    raises:
        - KeyError: if ``Close`` (after the rename) or ``Volume`` is missing
    """
    # 'keep_date_col' was dropped from this call: it only matters when several
    # columns are combined into one date, and it is deprecated in recent pandas.
    df = pd.read_csv(path, parse_dates=True)

    if 'Adj Close' in df.columns:
        # errors='ignore' lets files that only carry 'Adj Close' load as well.
        df = (
            df.drop(columns=['Close'], errors='ignore')
              .rename(columns={'Adj Close': 'Close'})
        )

    # Text-encoded numbers such as "1,234.5" must be converted BEFORE the
    # percentage changes are derived: pct_change() cannot divide strings.
    # The first column is skipped, as in the other loaders.
    for col in df.columns[1:]:
        try:
            cleaned = df[col].str.replace(',', '', regex=False)
        except AttributeError:
            # Column is not text (already numeric): leave it untouched.
            continue
        df[col] = cleaned.astype('double')

    # fillna(0) leaves no NaN behind, so no dropna on these columns is needed.
    df['chg'] = df['Close'].pct_change().fillna(0).multiply(100).round(2)
    df['vol_chg'] = df['Volume'].pct_change().fillna(0).multiply(100).round(2)

    return df

In [None]:
def get_data(dir: str, filename: str, compress=False):
    """
    Load ``<dir>/<filename>.csv`` (or ``.zip`` when ``compress`` is true).

    args:
        - dir: folder that holds the file
        - filename: file name without extension
        - compress: look for a ``.zip`` file instead of a ``.csv`` file
    return:
        - the DataFrame produced by ``loading_v3`` when the file exists,
          otherwise an empty DataFrame
    raises:
        - any exception raised while locating or loading the file; it is
          printed first and then re-raised unchanged
    """
    ext = '.zip' if compress else '.csv'

    try:
        candidate = os.path.join(dir, filename+ext)
        # Guard clause: a missing file is not an error, it yields an empty frame.
        if not os.path.isfile(candidate):
            return pd.DataFrame()
        return loading_v3(candidate)
    except Exception as e:
        print(f"Error loading file {dir} {filename+ext}: {e}")
        raise

In [None]:
def create_store_folder(store_dir = '_data_store'):
    """
    Make sure the storage folder exists.

    args:
        - store_dir: folder to create (missing parents are created too);
          nothing happens if it already exists
    """
    os.makedirs(store_dir, exist_ok=True)

def store_to_file(data, filename, store_dir = '_data_store', compress=False):
    """
    Write a DataFrame to ``<store_dir>/<filename>.csv`` or, when compressed,
    to ``<store_dir>/<filename>.zip`` containing ``<filename>.csv``.

    args:
        - data: pandas DataFrame to store (its index is written as well)
        - filename: target file name without extension
        - store_dir: target folder; created when it does not exist
        - compress: write a zip archive instead of a plain CSV file
    """
    # Create the folder that was actually requested; calling the helper with
    # no argument only ever created the default '_data_store' folder.
    create_store_folder(store_dir)

    file_path = os.path.join(store_dir, filename)
    if compress:
        compression_opts = dict(method='zip', archive_name=filename+'.csv')
        file_path += '.zip'
    else:
        compression_opts = None
        file_path += '.csv'

    data.to_csv(file_path, compression=compression_opts)

In [None]:
def store_parquet_file(data, filename, store_dir = '_data_store', compress=False):
    """
    Write a DataFrame to ``<store_dir>/<filename>.parquet``.

    args:
        - data: pandas DataFrame to store
        - filename: target file name without extension
        - store_dir: target folder; created when it does not exist
        - compress: use snappy compression when true, no compression otherwise
    """
    # Create the folder that was actually requested; calling the helper with
    # no argument only ever created the default '_data_store' folder.
    create_store_folder(store_dir)
    file_path = store_dir+'/'+filename+'.parquet'
    table = pa.Table.from_pandas(data)
    # write_table compresses with snappy by default, so "no compression" has
    # to be requested explicitly for the flag to have any effect.
    codec = 'snappy' if compress else 'NONE'
    pq.write_table(table, file_path, compression=codec)


In [None]:
def read_parquet_file(filename, store_dir = '_data_store'):
    """
    Read ``<store_dir>/<filename>.parquet`` back into a pandas DataFrame.

    args:
        - filename: file name without the ``.parquet`` extension
        - store_dir: folder that holds the file
    return:
        - pandas DataFrame built from the stored table
    """
    parquet_path = '/'.join((store_dir, filename)) + '.parquet'
    return pq.read_table(parquet_path).to_pandas()

In [None]:
# # validate the loaded values against the historical data published at
# # https://www.investing.com/equities/apple-computer-inc-historical-data
# import os
# f = os.path.join('crypto_data', 'BTC-USD.csv')
# df = loading_v2(f, 'BTC-USD')

# df.tail(5)