# Get Point-in-time datasets
Find all of the dates when companies first published their quarterly reports

In [1]:
import bql
import pandas as pd
import utils.data_request_helper as helper
import boto3
import json
from s3fs import S3FileSystem
import os

from ipywidgets import IntProgress
from IPython.display import display

In [2]:
import importlib
importlib.reload(helper)

<module 'utils.dataRequestHelper' from '/project/utils/dataRequestHelper.py'>

In [3]:
bq = bql.Service()

In [26]:
# Index to use for point in time firms
index = 'INDU Index'
filename = 'data_quarterly_pit_indu_refresh_blended.json'
reporting_period = 'Q'

# rebalance dates for the index
rebalance_dates = ['2024-12-31',
        '2024-09-30',
        '2024-06-30',
        '2024-03-31',
        '2023-12-31',
        '2023-09-30',
        '2023-06-30',
        '2023-03-31',
        '2022-12-31',
        '2022-09-30',
        '2022-06-30',
        '2022-03-31',
        '2021-12-31',
        '2021-09-30',
        '2021-06-30',
        '2021-03-31',
        '2020-12-31',
        '2020-09-30',
        '2020-06-30',
        '2020-03-31',
        '2019-12-31',
        '2019-09-30',
        '2019-06-30',
        '2019-03-31',]


# Calculate the Rebalance dates

For each rebalance date, get the members of the index at that time and look up their reporting dates

In [6]:
#all_data = []

def get_reporting_dates_per_rebalance(date, index):
    univ = bq.univ.members(index, dates=date)
    field = bq.data.sales_rev_turn(dates=bq.func.range('-5Y','0D'), fa_period_type=reporting_period) # Change to Annual if needed
    req = bql.Request(univ, field)
    data = bq.execute(req)
    df = data[0].df().dropna()
    return df.sort_values('PERIOD_END_DATE', ascending=True).reset_index().drop_duplicates(subset=['ID','PERIOD_END_DATE'], keep='first')

In [7]:
def get_rebalance_dates(index, rebalance_dates):
    all_data = []
    for date in rebalance_dates:
        all_data.append(get_reporting_dates_per_rebalance(date, index))
        print("Complete for ", date)
    df = pd.concat(all_data)
    df_concat = df[['ID','AS_OF_DATE','PERIOD_END_DATE']].sort_values('PERIOD_END_DATE', ascending=True).drop_duplicates(subset=['ID','PERIOD_END_DATE'], keep='first')
    return df_concat.set_index(['AS_OF_DATE','ID']).sort_values(['AS_OF_DATE'])

In [7]:
df_rebalance_dates = get_rebalance_dates(index, rebalance_dates)

Complete for  2024-12-31
Complete for  2024-09-30
Complete for  2024-06-30
Complete for  2024-03-31
Complete for  2023-12-31
Complete for  2023-09-30
Complete for  2023-06-30
Complete for  2023-03-31
Complete for  2022-12-31
Complete for  2022-09-30
Complete for  2022-06-30
Complete for  2022-03-31
Complete for  2021-12-31
Complete for  2021-09-30
Complete for  2021-06-30
Complete for  2021-03-31
Complete for  2020-12-31
Complete for  2020-09-30
Complete for  2020-06-30
Complete for  2020-03-31
Complete for  2019-12-31
Complete for  2019-09-30
Complete for  2019-06-30
Complete for  2019-03-31


## Request all of the data and format

Request financial statement and price points for each of the dates and securities. 

In [8]:
# Convert the request into correct format for data frame
def format_request_to_df(data, fields):
    fields = list(fields.keys())
    df_all = [data[index].df()[data[index].df()['PERIOD_END_DATE'] != 0]
                  .pivot(columns='PERIOD_END_DATE', values=[fields[index]])
                  .fillna(0) 
                  for index in range(0,len(fields))]
    df2 = pd.concat(df_all, axis=1)
    df3 = df2.stack().transpose().stack().unstack(level=0).transpose().fillna(0)
    df4 = df3.loc[:, (df3 != 0).any(axis=0)]
    # Reformat the columns to remove dates
    if len(df4.columns) == 6:
        df5 = df4.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
    else:
        df5 = df4.drop(columns=df4.columns[0:(len(df4.columns)-6)])
        df5 = df5.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
    # Reverse the direction of the dataset
    df6 = df5[df5.columns[::-1]]
    return df6.loc[(df6!=0).any(axis=1)]

In [9]:
# Convert datasets to dictionary
def convert_to_dict(securities, df_is, df_bs, df_px):
    date = {}
    for security in securities:
        # Convert DF to JSON
        data = {}
        df_is_sec = df_is.loc[security].to_json()
        df_bs_sec = df_bs.loc[security].to_json()
        df_px_sec = df_px.loc[security].set_index('DATE')[['Price']].to_json()
        # Convert to string and store
        data['is'] = json.dumps(df_is_sec)
        data['bs'] = json.dumps(df_bs_sec)
        data['px'] = json.dumps(df_px_sec)
        date[security] = data
    return date


def process_single_date(securities, fields):
    req = bql.Request(securities, fields)
    data = bq.execute(req)
    if len(fields) > 1:
        return format_request_to_df(data, fields)
    else:
        return data[0].df()


# main function for requesting the datasets
def update_financial_data(dates_and_securities):
    all_data = {}
    is_first = True
    dates = dates_and_securities.reset_index()['AS_OF_DATE'].unique()
    max_count = len(dates)
    f = IntProgress(min=0, max=max_count) # instantiate the bar
    display(f)
    # Loop through each date and extract securities
    for date in dates:
        if is_first:
            is_first=False
        else:
            as_of_date = str(date)[0:10]
            securities = list(dates_and_securities.loc[as_of_date].reset_index()['ID'])
            univ, is_fields, bs_fields, price = helper.setup_request(securities, as_of_date) 
            try:
                df_is = process_single_date(univ, is_fields)
                df_bs = process_single_date(univ, bs_fields)
                df_px = process_single_date(univ, price)
                all_data[as_of_date] = convert_to_dict(securities, df_is, df_bs, df_px)
            except:
                print(as_of_date)
            f.value += 1
    return all_data
        

In [39]:
df_rebalance_dates.loc['2024-10-22']

Unnamed: 0_level_0,PERIOD_END_DATE
ID,Unnamed: 1_level_1
VZ UN Equity,2024-09-30
MMM UN Equity,2024-09-30
SHW UN Equity,2024-09-30
RTX UN Equity,2024-09-30


In [40]:
# request the data
all_data = update_financial_data(df_rebalance_dates)

IntProgress(value=0, max=428)

2020-04-14
2020-11-05
2022-07-19
2022-11-01
2023-04-18
2024-07-17
2024-07-30
2025-02-04


## Save the PIT data to S3

Store this in the data bucket for the project to retrieve for the next step

In [55]:
## Save to S3
user_bucket_name = os.environ['BQUANT_SANDBOX_USER_BUCKET']
bqnt_username = os.environ['BQUANT_USERNAME']

#s3 = boto3.resource("s3")

path_to_s3 = f's3://{user_bucket_name}/{bqnt_username}/tmp/fs/{filename}'
s3 = S3FileSystem()

with s3.open(path_to_s3, 'w') as file:
     json.dump(all_data, file)

## Request datasets for Training
Use the S&P 500 and select random securities to generate the prompts that do not appear in the INDU Index. This will be used to train the data

In [4]:
# select the index
training_index = 'SPX Index'

filename = 'data_quarterly_pit_spx_refresh_blended.json'
reporting_period = 'Q'

# rebalance dates for the index
rebalance_dates = ['2024-12-31',
        '2024-09-30',
        '2024-06-30',
        '2024-03-31',
        '2023-12-31',
        '2023-09-30',
        '2023-06-30',
        '2023-03-31',
        '2022-12-31',
        '2022-09-30',
        '2022-06-30',
        '2022-03-31',
        '2021-12-31',
        '2021-09-30',
        '2021-06-30',
        '2021-03-31',
        '2020-12-31',
        '2020-09-30',
        '2020-06-30',
        '2020-03-31',
        '2019-12-31',
        '2019-09-30',
        '2019-06-30',
        '2019-03-31',]



In [10]:
df_rebalance_dates = get_rebalance_dates(training_index, rebalance_dates)

Complete for  2024-12-31
Complete for  2024-09-30
Complete for  2024-06-30
Complete for  2024-03-31
Complete for  2023-12-31
Complete for  2023-09-30
Complete for  2023-06-30
Complete for  2023-03-31
Complete for  2022-12-31
Complete for  2022-09-30
Complete for  2022-06-30
Complete for  2022-03-31
Complete for  2021-12-31
Complete for  2021-09-30
Complete for  2021-06-30
Complete for  2021-03-31
Complete for  2020-12-31
Complete for  2020-09-30
Complete for  2020-06-30
Complete for  2020-03-31
Complete for  2019-12-31
Complete for  2019-09-30
Complete for  2019-06-30
Complete for  2019-03-31


In [11]:
all_data_training = update_financial_data(df_rebalance_dates)

IntProgress(value=0, max=1056)

2020-05-08
2020-06-29
2020-07-30
2020-09-21
2020-10-29
2020-12-16
2021-01-04
2021-02-11
2021-05-06
2021-05-12
2021-06-21
2021-08-12
2021-11-11
2022-01-12
2022-03-21
2022-05-12
2022-08-04
2022-11-25
2022-12-14
2023-01-09
2023-01-30
2023-06-02
2023-07-20
2023-10-25
2023-10-26
2024-02-28
2024-04-23
2024-04-25
2024-05-09
2024-06-17
2024-07-15
2024-07-24
2024-08-08
2024-09-24
2024-09-25
2024-10-23
2024-10-30
2024-11-07
2024-11-08
2024-12-16
2025-01-08
2025-02-04
2025-02-06


In [12]:
## Save to S3
user_bucket_name = os.environ['BQUANT_SANDBOX_USER_BUCKET']
bqnt_username = os.environ['BQUANT_USERNAME']

#s3 = boto3.resource("s3")

path_to_s3 = f's3://{user_bucket_name}/{bqnt_username}/tmp/fs/{filename}'
s3 = S3FileSystem()

with s3.open(path_to_s3, 'w') as file:
     json.dump(all_data_training, file)

In [31]:
#all_data['2024-10-22']['MMM UN Equity']a

## Unused Code

In [32]:
univ = ['MMM UN Equity', 'SHW UN Equity', 'VZ UN Equity', 'RTX UN Equity']
#univ, is_fields, bs_fields, price = helper.setup_request(univ, '2025-01-21') 
univ, is_fields, bs_fields, price = helper.setup_request(univ, '2024-10-22') 


In [33]:
req = bql.Request(univ, is_fields)
data = bq.execute(req)

In [36]:
fields = is_fields
fields = list(fields.keys())
df_all = [data[index].df()[data[index].df()['PERIOD_END_DATE'] != 0]
              .pivot(columns='PERIOD_END_DATE', values=[fields[index]])
              .fillna(0) 
              for index in range(0,len(fields))]
df2 = pd.concat(df_all, axis=1)
df3 = df2.stack().transpose().stack().unstack(level=0).transpose().fillna(0)
df4 = df3.loc[:, (df3 != 0).any(axis=0)]
# Reformat the columns to remove dates
if len(df4.columns) == 6:
      df5 = df4.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
else:
      df5 = df4.drop(columns=df4.columns[0:(len(df4.columns)-6)])
      df5 = df5.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
 # Reverse the direction of the dataset
# df6 = df5[df5.columns[::-1]]

In [37]:
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,t-5,t-4,t-3,t-2,t-1,t
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MMM UN Equity,01 Revenue (Adj),3.101200e+10,2.866300e+10,3.268100e+10,3.265300e+10,3.058300e+10,2.856500e+10
MMM UN Equity,02 Sales and Services Revenues (Adj),3.101200e+10,2.866300e+10,3.268100e+10,3.265300e+10,3.058300e+10,2.856500e+10
MMM UN Equity,03 Financing Revenue (Adj),0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
MMM UN Equity,04 Other Revenue (Adj),0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
MMM UN Equity,05 Cost of Revenue (Adj),1.765400e+10,1.664200e+10,1.847700e+10,1.819300e+10,1.715800e+10,1.622500e+10
...,...,...,...,...,...,...,...
VZ UN Equity,47 Basic EPS from Continuing Operations,4.796038e+00,4.699173e+00,4.567779e+00,4.508337e+00,4.446360e+00,4.421884e+00
VZ UN Equity,48 Diluted Weighted Average Shares,4.208750e+09,4.211750e+09,4.213500e+09,4.215500e+09,4.217500e+09,4.219750e+09
VZ UN Equity,49 Diluted EPS,5.000000e+00,4.960000e+00,2.760000e+00,2.680000e+00,2.670000e+00,2.320000e+00
VZ UN Equity,50 Diluted EPS from Continuing Operations,5.000000e+00,4.960000e+00,2.760000e+00,2.680000e+00,2.670000e+00,2.320000e+00


In [13]:
df_is = process_single_date(univ, is_fields)
df_bs = process_single_date(univ, bs_fields)
df_px = process_single_date(univ, price)

ValueError: Length mismatch: Expected axis has 1 elements, new values have 6 elements

In [19]:
df_is

Unnamed: 0_level_0,Unnamed: 1_level_0,t,t-1,t-2,t-3,t-4,t-5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MMM UN Equity,01 Revenue (Adj),6.294000e+09,6.255000e+09,8.003000e+09,8.013000e+09,6.270000e+09,6.283000e+09
MMM UN Equity,02 Sales and Services Revenues (Adj),6.294000e+09,6.255000e+09,8.003000e+09,8.013000e+09,6.270000e+09,6.283000e+09
MMM UN Equity,05 Cost of Revenue (Adj),3.647000e+09,3.571000e+09,4.329000e+09,4.678000e+09,3.716000e+09,3.728000e+09
MMM UN Equity,06 Cost of Goods & Services Sold (Adj),3.647000e+09,3.571000e+09,4.329000e+09,4.678000e+09,3.716000e+09,3.728000e+09
MMM UN Equity,08 Gross Profit (Adj),2.647000e+09,2.684000e+09,3.674000e+09,3.335000e+09,2.554000e+09,2.555000e+09
...,...,...,...,...,...,...,...
VZ UN Equity,47 Basic EPS from Continuing Operations,1.159716e+00,1.113879e+00,1.110558e+00,1.037731e+00,1.184192e+00,1.175856e+00
VZ UN Equity,48 Diluted Weighted Average Shares,4.225000e+09,4.221000e+09,4.219000e+09,4.214000e+09,4.216000e+09,4.213000e+09
VZ UN Equity,49 Diluted EPS,7.800000e-01,1.090000e+00,1.090000e+00,-6.400000e-01,1.130000e+00,1.100000e+00
VZ UN Equity,50 Diluted EPS from Continuing Operations,7.800000e-01,1.090000e+00,1.090000e+00,-6.400000e-01,1.130000e+00,1.100000e+00


In [1]:

#convert_to_dict(['MMM UN Equity', 'SHW UN Equity', 'VZ UN Equity', 'RTX UN Equity'],df_is,df_bs,df_px)
#json.dumps(df_is.loc['MMM UN Equity'].to_json())

In [91]:
# Failed Dates
failed_dates = ['2020-08-17',
'2020-09-21',
'2021-08-16',
'2021-09-24',
'2022-09-28',
'2022-11-25',
'2023-10-25',
'2024-01-08',
'2024-06-17',
'2024-11-08',
'2025-01-08']

In [8]:
# # pull out the list of as of dates - will use this to loop through the securities
# dates = df_rebalance_dates.reset_index()['AS_OF_DATE'].unique()

In [1]:
# as_of_date = '2020-03-11'#str(dates[12])[0:10]
# securities = list(df_rebalance_dates.loc[as_of_date].index)
# as_of_date

In [73]:
# univ, is_fields, bs_fields, price = helper.setup_request(securities, as_of_date)

In [74]:
# req = bql.Request(securities,is_fields)
# data = bq.execute(req)

In [80]:
# # Convert the request into correct format for data frame
# def format_request_to_df(data, fields):
#     fields = list(fields.keys())
#     df_all = [data[index].df()[data[index].df()['PERIOD_END_DATE'] != 0]
#                   .pivot(columns='PERIOD_END_DATE', values=[fields[index]])
#                   .fillna(0) 
#                   for index in range(0,len(fields))]
#     df2 = pd.concat(df_all, axis=1)
#     df3 = df2.stack().transpose().stack().unstack(level=0).transpose().fillna(0)
#     df4 = df3.loc[:, (df3 != 0).any(axis=0)]
#     # Reformat the columns to remove dates
#     if len(df4.columns) == 6:
#         df5 = df4.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
#     else:
#         df5 = df4.drop(columns=df4.columns[0:(len(df4.columns)-6)])
#         df5 = df5.set_axis(['t-5','t-4','t-3','t-2','t-1', 't'], axis='columns')
#     # Reverse the direction of the dataset
#     df6 = df5[df5.columns[::-1]]
#     return df6.loc[(df6!=0).any(axis=1)]

In [90]:
# # test run
# req = bql.Request(securities, is_fields)
# data_is = bq.execute(req)
# req = bql.Request(securities, bs_fields)
# data_bs = bq.execute(req)
# df_is = format_request_to_df(data_is, is_fields)
# df_bs = format_request_to_df(data_bs, bs_fields)

In [75]:
# fields = list(is_fields.keys())
# df_all = [data[index].df()[data[index].df()['PERIOD_END_DATE'] != 0]
#               .pivot(columns='PERIOD_END_DATE', values=[fields[index]])
#               .fillna(0) 
#               for index in range(0,len(fields))]
# df2 = pd.concat(df_all, axis=1)
# df3 = df2.stack().transpose().stack().unstack(level=0).transpose().fillna(0)
# df4 = df3.loc[:, (df3 != 0).any(axis=0)]