# Get Point-in-time datasets
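Find the dates on which companies first published their periodic reports. The requests below use annual periods (`fa_period_type='A'`); change that argument if quarterly reports are needed.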

In [1]:
import bql
import pandas as pd
import helper
import boto3
import json
from s3fs import S3FileSystem
import os

from ipywidgets import IntProgress
from IPython.display import display

In [2]:
# Reload the local helper module so that edits to helper.py are picked up
# by this session
import importlib
importlib.reload(helper)

<module 'helper' from '/project/helper.py'>

In [2]:
# BQL service handle; every request in this notebook is executed through it
bq = bql.Service()

In [3]:
# Index whose point-in-time membership defines the universe
index = 'INDU Index'

# Rebalance dates for the index: every calendar quarter end from 2024-12-31
# back to 2019-03-31, newest first
rebalance_dates = [
    f'{year}-{quarter_end}'
    for year in range(2024, 2018, -1)
    for quarter_end in ('12-31', '09-30', '06-30', '03-31')
]


# Calculate the Rebalance dates

For each rebalance date, get the members of the index at that time and look up the dates on which their reports were published.

In [4]:
# Module-level results container. Note that this name is rebound to the dict
# returned by update_financial_data() further down in the notebook.
all_data = []

def get_reporting_dates_per_rebalance(date):
    """Fetch revenue records for the index members as of ``date``.

    Requests ``sales_rev_turn`` over the ``-5Y`` to ``0D`` range for the
    members of the module-level ``index`` on ``date``, drops rows containing
    missing values and keeps one row per (ID, PERIOD_END_DATE) pair.

    Args:
        date: Membership date handed to ``bq.univ.members``.

    Returns:
        DataFrame sorted by PERIOD_END_DATE (ascending), with the response's
        index moved into a regular column and duplicate
        (ID, PERIOD_END_DATE) rows removed, keeping the first occurrence.
    """
    members = bq.univ.members(index, dates=date)
    # NOTE(review): 'A' presumably selects annual periods (the saved output
    # file is named "annual"); change it if another periodicity is needed.
    revenue = bq.data.sales_rev_turn(
        dates=bq.func.range('-5Y', '0D'),
        fa_period_type='A',
    )
    response = bq.execute(bql.Request(members, revenue))
    reported = response[0].df().dropna()
    ordered = reported.sort_values('PERIOD_END_DATE', ascending=True)
    flat = ordered.reset_index()
    return flat.drop_duplicates(subset=['ID', 'PERIOD_END_DATE'], keep='first')

In [5]:
def get_rebalance_dates():
    """Collect the reporting dates for every entry of ``rebalance_dates``.

    Calls ``get_reporting_dates_per_rebalance`` once per rebalance date,
    concatenates the results and keeps a single row per
    (ID, PERIOD_END_DATE) pair.

    The per-date frames are gathered in a local list rather than the
    module-level ``all_data``. That makes the function safe to re-run: it no
    longer accumulates frames from earlier runs, and it no longer fails once
    ``all_data`` has been rebound to the results dict later in the notebook.

    Returns:
        DataFrame indexed by (AS_OF_DATE, ID) holding PERIOD_END_DATE,
        ordered by AS_OF_DATE.

    Raises:
        ValueError: from ``pd.concat`` if ``rebalance_dates`` is empty.
    """
    frames = []
    for date in rebalance_dates:
        frames.append(get_reporting_dates_per_rebalance(date))
        print("Complete for ", date)

    combined = pd.concat(frames)[['ID', 'AS_OF_DATE', 'PERIOD_END_DATE']]
    # AS_OF_DATE is a secondary sort key so that keep='first' deterministically
    # retains the earliest as-of date for each (ID, PERIOD_END_DATE) pair;
    # sorting on PERIOD_END_DATE alone leaves the order of ties unspecified.
    ordered = combined.sort_values(['PERIOD_END_DATE', 'AS_OF_DATE'], ascending=True)
    unique_reports = ordered.drop_duplicates(subset=['ID', 'PERIOD_END_DATE'], keep='first')
    return unique_reports.set_index(['AS_OF_DATE', 'ID']).sort_values(['AS_OF_DATE'])

In [6]:
# Build the (AS_OF_DATE, ID)-indexed table of reporting dates across all
# rebalance dates; progress is printed one line per rebalance date
df_rebalance_dates = get_rebalance_dates()

Complete for  2024-12-31
Complete for  2024-09-30
Complete for  2024-06-30
Complete for  2024-03-31
Complete for  2023-12-31
Complete for  2023-09-30
Complete for  2023-06-30
Complete for  2023-03-31
Complete for  2022-12-31
Complete for  2022-09-30
Complete for  2022-06-30
Complete for  2022-03-31
Complete for  2021-12-31
Complete for  2021-09-30
Complete for  2021-06-30
Complete for  2021-03-31
Complete for  2020-12-31
Complete for  2020-09-30
Complete for  2020-06-30
Complete for  2020-03-31
Complete for  2019-12-31
Complete for  2019-09-30
Complete for  2019-06-30
Complete for  2019-03-31


## Request all of the data and format

Request financial-statement and price data for each as-of date and the securities reported on that date.

In [7]:
# Convert the request into correct format for data frame
def format_request_to_df(data, fields):
    """Reshape a multi-field response into one row per (index entry, field).

    Each response item is pivoted so that its PERIOD_END_DATE values become
    columns, the per-field frames are joined side by side, and the result is
    rearranged so that rows are (original index, field) pairs and columns are
    period end dates. Columns and rows that contain only zeros are dropped
    (missing values are treated as zero), and only the last six columns are
    kept.

    Args:
        data: Indexable response; ``data[i].df()`` must return the frame for
            the i-th key of ``fields``, with a PERIOD_END_DATE column and a
            column named after that key.
        fields: Mapping whose keys are the requested field names, in the same
            order as the items of ``data``.

    Returns:
        DataFrame with columns ``t, t-1, ..., t-5`` (the reverse of the
        pivoted column order) and all-zero rows removed.

    Note:
        The relabelling step expects exactly six columns to remain after
        trimming; pandas raises if that is not the case.
    """
    period_labels = ['t-5', 't-4', 't-3', 't-2', 't-1', 't']
    field_names = list(fields.keys())

    pivoted = []
    for position, name in enumerate(field_names):
        frame = data[position].df()
        frame = frame[frame['PERIOD_END_DATE'] != 0]
        pivoted.append(
            frame.pivot(columns='PERIOD_END_DATE', values=[name]).fillna(0)
        )

    wide = pd.concat(pivoted, axis=1)
    # Same reshaping chain as before, split in two for readability
    reshaped = wide.stack().transpose().stack()
    reshaped = reshaped.unstack(level=0).transpose().fillna(0)
    populated = reshaped.loc[:, (reshaped != 0).any(axis=0)]

    # Drop leading columns so that only the last six remain, then replace the
    # date headers with relative period labels
    surplus = len(populated.columns) - 6
    if surplus != 0:
        populated = populated.drop(columns=populated.columns[0:surplus])
    relabelled = populated.set_axis(period_labels, axis='columns')

    # Reverse the column direction so that 't' comes first
    reversed_cols = relabelled[relabelled.columns[::-1]]
    return reversed_cols.loc[(reversed_cols != 0).any(axis=1)]

In [8]:
# Convert datasets to dictionary
def convert_to_dict(securities, df_is, df_bs, df_px):
    """Serialise the per-security frames into a JSON-friendly dict.

    Args:
        securities: Iterable of index labels present in all three frames.
        df_is: Frame looked up by security and stored under the ``'is'`` key
            (presumably income-statement data).
        df_bs: Frame looked up by security and stored under the ``'bs'`` key
            (presumably balance-sheet data).
        df_px: Frame looked up by security with ``DATE`` and ``Price``
            columns; stored under the ``'px'`` key, keyed by ``DATE``.

    Returns:
        Dict mapping each security to its own ``{'is', 'bs', 'px'}`` dict.
        Every value is encoded twice (``to_json()`` followed by
        ``json.dumps``), so readers must call ``json.loads`` twice.
    """
    by_security = {}
    for security in securities:
        # Build a fresh dict for each security. Reusing one dict across
        # iterations made every security point at the same object, so all
        # entries ended up holding the last security's data.
        by_security[security] = {
            'is': json.dumps(df_is.loc[security].to_json()),
            'bs': json.dumps(df_bs.loc[security].to_json()),
            'px': json.dumps(
                df_px.loc[security].set_index('DATE')[['Price']].to_json()
            ),
        }
    return by_security


def process_single_date(securities, fields):
    """Execute one BQL request and return its result as a DataFrame.

    Args:
        securities: Universe passed to ``bql.Request``.
        fields: Sized collection of requested fields.

    Returns:
        The reshaped frame from ``format_request_to_df`` when more than one
        field was requested; otherwise the raw frame of the first response
        item.
    """
    response = bq.execute(bql.Request(securities, fields))
    if len(fields) <= 1:
        return response[0].df()
    return format_request_to_df(response, fields)


# main function for requesting the datasets
def update_financial_data(dates_and_securities):
    """Request statement and price data for every as-of date after the first.

    For each unique AS_OF_DATE the securities stored under that date are
    looked up, the request definitions are built by ``helper.setup_request``
    and the three result sets are fetched and serialised with
    ``convert_to_dict``. A progress bar is displayed while the loop runs.

    Args:
        dates_and_securities: DataFrame whose index carries an AS_OF_DATE
            level followed by the security IDs (the table returned by
            ``get_rebalance_dates``).

    Returns:
        Dict mapping ``'YYYY-MM-DD'`` strings to the per-security dict built
        by ``convert_to_dict``. Dates whose request or formatting failed are
        left out and reported on stdout with the reason.
    """
    results = {}
    dates = dates_and_securities.reset_index()['AS_OF_DATE'].unique()
    progress = IntProgress(min=0, max=len(dates))  # instantiate the bar
    display(progress)

    # Loop through each date and extract securities
    for position, date in enumerate(dates):
        if position == 0:
            # NOTE(review): the first as-of date has always been skipped; the
            # reason is not visible here. It still counts towards the
            # progress bar so that the bar can reach its maximum.
            progress.value += 1
            continue

        as_of_date = str(date)[0:10]
        securities = list(dates_and_securities.loc[as_of_date].index)
        univ, is_fields, bs_fields, price = helper.setup_request(securities, as_of_date)
        try:
            df_is = process_single_date(securities, is_fields)
            df_bs = process_single_date(securities, bs_fields)
            df_px = process_single_date(securities, price)
            results[as_of_date] = convert_to_dict(securities, df_is, df_bs, df_px)
        except Exception as exc:
            # Best effort: keep going, but say why the date failed. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt a long run.
            print(as_of_date, repr(exc))
        progress.value += 1
    return results
        

In [9]:
# Request the data for every as-of date. This rebinds all_data to the
# returned dict; as-of dates that fail are printed by update_financial_data.
all_data = update_financial_data(df_rebalance_dates)

IntProgress(value=0, max=152)

2024-01-30


## Save the PIT data to S3

Store the point-in-time data in the project's data bucket so that the next step can retrieve it.

In [10]:
## Save to S3
# Bucket and user name come from the environment; a missing variable raises
# KeyError here, before anything is written
user_bucket_name = os.environ['BQUANT_SANDBOX_USER_BUCKET']
bqnt_username = os.environ['BQUANT_USERNAME']

# The file is written through s3fs (S3FileSystem) rather than a boto3 resource

path_to_s3 = f's3://{user_bucket_name}/{bqnt_username}/tmp/fs/data_annual_pit_indu.json'
s3 = S3FileSystem()

# Write the whole results dict as a single JSON document
with s3.open(path_to_s3, 'w') as file:
    json.dump(all_data, file)

In [None]:
# Spot-check one stored entry. Values are JSON-encoded twice by
# convert_to_dict (to_json, then json.dumps), so they are decoded twice:
# once here and once more when rebuilding the frame.
data = json.loads(all_data['2020-01-31']['AON UN Equity']['is'])
pd.DataFrame(json.loads(data))

In [None]:
# Show the second decode of the same entry as a plain Python object
json.loads(data)

## Unused Code

In [91]:
# As-of dates recorded as failed
failed_dates = [
    '2020-08-17',
    '2020-09-21',
    '2021-08-16',
    '2021-09-24',
    '2022-09-28',
    '2022-11-25',
    '2023-10-25',
    '2024-01-08',
    '2024-06-17',
    '2024-11-08',
    '2025-01-08',
]

In [8]:
# Pull out the unique AS_OF_DATE values from the reporting-date table; these
# are the dates used to look up the securities for each request
dates = df_rebalance_dates.reset_index()['AS_OF_DATE'].unique()

In [71]:
# Hard-coded as-of date for a manual run; the trailing comment keeps the
# earlier lookup from the dates array
as_of_date = '2020-03-11'#str(dates[12])[0:10]
# Securities stored under that as-of date in the reporting-date table
securities = list(df_rebalance_dates.loc[as_of_date].index)
as_of_date

'2020-03-11'

In [73]:
# Build the request pieces for this date through the project helper
# (presumably universe, income-statement fields, balance-sheet fields and
# price field; see helper.setup_request)
univ, is_fields, bs_fields, price = helper.setup_request(securities, as_of_date)

In [74]:
# Execute the is_fields request on its own so the raw response can be inspected
req = bql.Request(securities,is_fields)
data = bq.execute(req)

In [80]:
# Convert the request into correct format for data frame
def format_request_to_df(data, fields):
    """Reshape a multi-field response into one row per (index entry, field).

    Each response item is pivoted so that its PERIOD_END_DATE values become
    columns, the per-field frames are joined side by side, and the result is
    rearranged so that rows are (original index, field) pairs and columns are
    period end dates. Columns and rows that contain only zeros are dropped
    (missing values are treated as zero), and only the last six columns are
    kept.

    Args:
        data: Indexable response; ``data[i].df()`` must return the frame for
            the i-th key of ``fields``, with a PERIOD_END_DATE column and a
            column named after that key.
        fields: Mapping whose keys are the requested field names, in the same
            order as the items of ``data``.

    Returns:
        DataFrame with columns ``t, t-1, ..., t-5`` (the reverse of the
        pivoted column order) and all-zero rows removed.

    Note:
        The relabelling step expects exactly six columns to remain after
        trimming; pandas raises if that is not the case.
    """
    period_labels = ['t-5', 't-4', 't-3', 't-2', 't-1', 't']
    field_names = list(fields.keys())

    pivoted = []
    for position, name in enumerate(field_names):
        frame = data[position].df()
        frame = frame[frame['PERIOD_END_DATE'] != 0]
        pivoted.append(
            frame.pivot(columns='PERIOD_END_DATE', values=[name]).fillna(0)
        )

    wide = pd.concat(pivoted, axis=1)
    # Same reshaping chain as before, split in two for readability
    reshaped = wide.stack().transpose().stack()
    reshaped = reshaped.unstack(level=0).transpose().fillna(0)
    populated = reshaped.loc[:, (reshaped != 0).any(axis=0)]

    # Drop leading columns so that only the last six remain, then replace the
    # date headers with relative period labels
    surplus = len(populated.columns) - 6
    if surplus != 0:
        populated = populated.drop(columns=populated.columns[0:surplus])
    relabelled = populated.set_axis(period_labels, axis='columns')

    # Reverse the column direction so that 't' comes first
    reversed_cols = relabelled[relabelled.columns[::-1]]
    return reversed_cols.loc[(reversed_cols != 0).any(axis=1)]

In [90]:
# Test run for a single as-of date: execute the is_fields and bs_fields
# requests separately, then reshape each response with format_request_to_df
req = bql.Request(securities, is_fields)
data_is = bq.execute(req)
req = bql.Request(securities, bs_fields)
data_bs = bq.execute(req)
df_is = format_request_to_df(data_is, is_fields)
df_bs = format_request_to_df(data_bs, bs_fields)

In [75]:
# Step-by-step copy of the first half of format_request_to_df, run at cell
# level so that each intermediate frame (df_all, df2, df3, df4) can be inspected
fields = list(is_fields.keys())
# One pivoted frame per field: PERIOD_END_DATE values become columns
df_all = [data[index].df()[data[index].df()['PERIOD_END_DATE'] != 0]
              .pivot(columns='PERIOD_END_DATE', values=[fields[index]])
              .fillna(0) 
              for index in range(0,len(fields))]
df2 = pd.concat(df_all, axis=1)
df3 = df2.stack().transpose().stack().unstack(level=0).transpose().fillna(0)
# Keep only the columns that contain at least one non-zero value
df4 = df3.loc[:, (df3 != 0).any(axis=0)]