# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import os

from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()

from tqdm.autonotebook import tqdm

import oil.utils.datetime_utils as du
import vendors.first_rate.utils as fru

%matplotlib inline

In [None]:
from pylab import rcParams
# Make every figure drawn from here on default to a wide 20 x 5 canvas.
rcParams.update({'figure.figsize': (20, 5)})

In [None]:
import infra.helpers.telegram_notify.telegram_notify as tgn
# Notifier instance; its `notify` method is called once the per-file
# summary has been collected (presumably it sends a Telegram message --
# confirm against the helper's implementation).
tn = tgn.TelegramNotify()

In [None]:
def find_price_col_bug(df, lower=0.98, upper=1.2):
    """
    Detect price columns whose header does not match their content
    (e.g. the header row was shifted relative to the data).

    For every column among ['open', 'high', 'low', 'close', 'settle'] that
    is present in `df`, the column mean is divided by the average of the
    other price columns' means. A column whose ratio falls outside the
    open interval (`lower`, `upper`) is considered mislabelled.

    NOTE(review): the default band is asymmetric (-2% / +20%); confirm
    that this is intentional.

    :param df: data frame holding the price data
    :param lower: exclusive lower bound for the accepted mean ratio
    :param upper: exclusive upper bound for the accepted mean ratio
    :return: True if at least one price column looks mislabelled,
        False otherwise (including when fewer than two price columns
        exist, because there is nothing to compare against)
    """
    price_cols = df.columns.intersection(
        ['open', 'high', 'low', 'close', 'settle'])
    # With a single price column the mean of the "other" columns is NaN,
    # every comparison against NaN is False, and a bug would be reported
    # spuriously.
    if len(price_cols) < 2:
        return False
    mean_price = df[price_cols].mean()
    for price_col in price_cols:
        mean_other_cols = mean_price.drop(price_col).mean()
        ratio = mean_price.loc[price_col] / mean_other_cols
        if not (lower < ratio < upper):
            return True
    return False


def verify(pq_path):
    """
    Build the summary row for one parquet file.

    The summary holds the first and last value of the file's first column
    (reported as start / end date), the row count, the result of the
    price-column sanity check and the number of missing market days.

    NOTE(review): this assumes the first column holds the row dates in
    ascending order and that `du.CBD` is a business-day offset usable as
    a `pd.date_range` frequency -- confirm against the data and `du`.

    :param pq_path: path of the parquet file to inspect
    :return: pd.Series indexed by `summary_cols`
    """
    equity = pd.read_parquet(pq_path)
    # Mixed value types (dates, ints, bools) -> keep the series as object.
    summary_series = pd.Series(index=summary_cols, dtype=object)
    # First column: first row is the start, last row is the end.
    start_date = equity.iloc[0, 0]
    end_date = equity.iloc[-1, 0]
    summary_series.loc['start_date'] = start_date
    summary_series.loc['end_date'] = end_date
    summary_series.loc['n_rows'] = len(equity)
    summary_series.loc['price_col_bug'] = find_price_col_bug(equity)
    # Market days expected between start and end; normalised to midnight
    # so they can be compared with calendar days below.
    bdays = pd.date_range(start_date, end_date, freq=du.CBD, normalize=True)
    has_timestamp = 'timestamp' in equity.columns
    summary_series.loc['missing_timestamp_col'] = not has_timestamp
    if has_timestamp:
        # Compare on calendar days: drop the time-of-day of each timestamp.
        # NOTE(review): assumes timestamps and the first column share the
        # same timezone awareness -- confirm.
        observed_days = pd.DatetimeIndex(equity['timestamp']).normalize()
        n_missing_days = len(bdays.difference(observed_days))
    else:
        # Without timestamps only the row count can be compared with the
        # number of expected market days.
        n_missing_days = len(bdays) - len(equity)
    summary_series.loc['n_missing_market_days'] = n_missing_days
    return summary_series

In [None]:
# Root directory of the parquet data: the file-listing cell expects one
# sub-directory per category here, each containing the files to verify.
PQ_DIR = '/data/first_rate/pq'

# Get file list

In [None]:
# Collect the path of every entry found one level below a category
# directory of PQ_DIR. Names are visited in sorted order because
# `os.listdir` returns them in arbitrary order, which would make the
# resulting list (and the summary index built from it) differ between runs.
pq_files = []
for category_dir in sorted(os.listdir(PQ_DIR)):
    category_dir_path = os.path.join(PQ_DIR, category_dir)
    # A stray file at the top level is not a category; listing it would
    # raise NotADirectoryError.
    if not os.path.isdir(category_dir_path):
        continue
    for file_name in sorted(os.listdir(category_dir_path)):
        file_path = os.path.join(category_dir_path, file_name)
        pq_files.append(file_path)

In [None]:
# Number of files that will be verified.
len(pq_files)

# Collect a summary

In [None]:
# Names of the statistics collected for every parquet file; used both as
# the index of the per-file summary series and as the summary table columns.
summary_cols = [
    'start_date',
    'end_date',
    'n_missing_market_days',
    'n_rows',
    'missing_timestamp_col',
    'price_col_bug',
]

In [None]:
# Empty result table: one row per parquet file (keyed by its path) and one
# column per summary statistic.
summary = pd.DataFrame(
    index=pd.Index(pq_files, name='file_name'),
    columns=summary_cols,
)

In [None]:
# Preview the first rows of the summary table.
summary.head()

In [None]:
# Fill the summary table one file at a time; tqdm reports the progress.
for pq_path in tqdm(pq_files):
    summary.loc[pq_path] = verify(pq_path)

In [None]:
# Persist the collected summary as CSV in the parent of the parquet directory.
summary.to_csv('/data/first_rate/file_summary.csv')

In [None]:
# Signal through the notifier that the collection step has finished.
tn.notify("Collected stats for each pq")

# Stats

In [None]:
# Number of files flagged as lacking a 'timestamp' column.
summary['missing_timestamp_col'].sum()

In [None]:
# Share of all files flagged as lacking a 'timestamp' column.
summary['missing_timestamp_col'].sum() / len(summary)

In [None]:
# Missing market days relative to the number of rows present, per file.
summary['n_missing_market_days'] / summary['n_rows']

In [None]:
# Distribution of the row counts over all files.
rows_ax = sns.distplot(summary['n_rows'])
rows_ax.set_title('Number of rows per equity')
plt.show()

In [None]:
# Distribution of missing market days relative to the rows present, per file.
missing_ratio = summary['n_missing_market_days'] / summary['n_rows']
ratio_ax = sns.distplot(missing_ratio)
ratio_ax.set_title('Proportion of missing market days per equity')
plt.show()