Replicating Main Results From **Intraday Residual Reversal in the U.S. Stock Market** Brogaard, Han, Kim

Notes to self for my implementation

Page 10 - Data.

1) Select the last bid and ask quote price every 30 minutes between the hours of 10:00AM and 4:00PM EST 

* using the TAQ database from July 1996 to Dec 2022 and S&P based equities

2) This results in a total of 13 quote price obersvations per day for each stock
3) Forward fill cases where there was not a quote on a given 30 minute interval
4) Calculate the midpoint of the bid and ask quote to get a return measure
5) use CFACPR to adjust share price
6) winsorize 30 min returns at 0.5% level to adjust for remaining large outliers


Page 11/12 - RET, RESIDUAL and RISK intro.

1) Return or RET is composed of RISK and RESIDUAL, where RISK is return attributable to factors and RESIDUAL is noise left from subtracting RISK from RET
2) Page 55 begins to list the 15 anomalies used

In [None]:
# Getting the universe we want to query for intraday data
import pandas as pd
from pathlib import Path

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *

# Load in raw crsp data to refilter 
data_path = get_raw_data_dir(source='wrds', dataset='crsp')
df = pd.read_parquet(get_file_path(data_path, 'crsp_dsf_cleaned.parquet'))

# Going to apply row level filters to filter the data set
# since wrds no longer provides historical S&P constituents I think this makes sense, I could be a little more accurate and use S&P documentation on eligibility.
# Filter for quality
df_filtered = df[
    (df['adj_volume'] > 10_000_000) &
    (df['adj_close'] > 5) &
    (df['sharetype'] == 'NS') &
    (df['primaryexch'].isin(['A', 'Q', 'N'])) &
    (df['tradingstatusflg'] == 'A')
]

# Convert to datetime BEFORE creating universe_df
df_filtered['dlycaldt'] = pd.to_datetime(df_filtered['dlycaldt'])

# Filter for date range
df_filtered = df_filtered[df_filtered['dlycaldt'] >= pd.to_datetime('2015-01-01')]

# NOW create universe_df
universe_df = df_filtered[['ticker', 'dlycaldt', 'permno']]

universe_df.to_parquet(f'{data_path}/universe.parquet')

In [None]:
# Compustat fundamentals
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *

# Load in raw compustat data to concatenate into one 
compustat_file = get_raw_data_dir(source='wrds', dataset='compustat_fundamentals_q')
data_path = compustat_file / 'fundq_combined.parquet'

comp_df = pd.read_parquet(data_path)
# print(comp_df)

comp_df.cust.describe()
contains_iid = any(['iid' == col_name for col_name in comp_df.columns])
contains_permno = any(['permno' == col_name for col_name in comp_df.columns])

print(f"contains permno {contains_permno}")
print(f"contains iid {contains_iid}")



BASIC INFORMATION
Dimensions: 476,593 rows × 395 columns
------------------------------------------------------------
Column Data Types:
gvkey       string[python]
iid         string[python]
datadate    string[python]
fyearq               Int64
fqtr                 Int64
                 ...      
dvpsxq             Float64
mkvaltq            Float64
prccq              Float64
prchq              Float64
prclq              Float64
Length: 395, dtype: object

MEMORY USAGE
Total memory: 2208.30 MB

MISSING VALUES ANALYSIS
column_name  missing_count  percent_missing
      gvkey              0         0.000000
        iid             30         0.006295
   datadate              0         0.000000
     fyearq              0         0.000000
       fqtr            125         0.026228
       conm              0         0.000000
        tic            131         0.027487
      cusip            119         0.024969
        cik         150799        31.641044
     indfmt              0        

In [None]:
# CCM Link
from bearplanes.utils import get_raw_data_dir
from bearplanes.data.wrds.compustat.fields import OUTPUT_DIR
from bearplanes.utils import get_processed_data_dir
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *
from bearplanes.data.wrds.crsp.crsp_compustat_link import *

# Load in raw taq data to refilter
ccm_link = get_raw_data_dir(source='wrds', dataset='crsp_compustat_link')
data_path = ccm_link / 'ccm_link.parquet'

df = pd.read_parquet(data_path)
df.cust.describe()
print(df)

Loading library list...
Done
Downloaded 39,232 linkages
Columns: gvkey, permno, permco, linktype, linkprim, linkdt, linkenddt, iid

BASIC INFORMATION
Dimensions: 39,232 rows × 8 columns
------------------------------------------------------------
Column Data Types:
gvkey        string[python]
permno              Float64
permco              Float64
linktype     string[python]
linkprim     string[python]
linkdt       string[python]
linkenddt    string[python]
iid          string[python]
dtype: object

MEMORY USAGE
Total memory: 12.81 MB

MISSING VALUES ANALYSIS
column_name  missing_count  percent_missing
      gvkey              0         0.000000
     permno              0         0.000000
     permco              0         0.000000
   linktype              0         0.000000
   linkprim              0         0.000000
     linkdt              0         0.000000
  linkenddt          10386        26.473287
        iid              0         0.000000

EXACT DUPLICATES
Found 0 duplicate ro

In [3]:
# TAQ processed data
from bearplanes.utils import get_raw_data_dir
from bearplanes.data.wrds.compustat.fields import OUTPUT_DIR
from bearplanes.utils import get_processed_data_dir
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *
from bearplanes.data.wrds.crsp.crsp_compustat_link import *

# Load in taq data
taq_files = get_processed_data_dir(source='wrds', dataset='taq')
data_path = taq_files / '30_min_combined.parquet'

df = pd.read_parquet(data_path)

df.cust.describe()
print(df)




BASIC INFORMATION
Dimensions: 8,292,549 rows × 15 columns
------------------------------------------------------------
Column Data Types:
time_m         string[python]
ex             string[python]
sym_root       string[python]
sym_suffix     string[python]
bid                   Float64
bidsiz                  Int64
ask                   Float64
asksiz                  Int64
qu_cond        string[python]
qu_seqnum               Int64
qu_source      string[python]
trade_date     string[python]
permno                  Int64
bucket_time    string[python]
midpoint              Float64
dtype: object

MEMORY USAGE
Total memory: 3961.69 MB

MISSING VALUES ANALYSIS
column_name  missing_count  percent_missing
     time_m              0         0.000000
         ex              0         0.000000
   sym_root              0         0.000000
 sym_suffix        7458885        89.946831
        bid              0         0.000000
     bidsiz              0         0.000000
        ask              

In [4]:
# CRSP
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *

# Load in crsp data for share price adjustment
crsp_data = get_processed_data_dir(source='wrds', dataset='crsp')
crsp_data_path = crsp_data / 'crsp_dsf_combined.parquet'

crsp_df = pd.read_parquet(crsp_data_path)
print(crsp_df)
crsp_df.cust.describe()

          permno  permco  hdrcusip     cusip ticker   shrout  siccd  \
0          10001    7953  36720410  29269V10   EGAS     4361   4925   
1          10002    7954  05978R10  05978R10   BTFG    17634   6020   
2          10025    7975  00103110  00103110   AEPI     6852   3081   
3          10026    7976  46603210  46603210   JJSF    18374   2052   
4          10028    7978  29402E10  23323G10   DGSE     9834   5094   
...          ...     ...       ...       ...    ...      ...    ...   
29132302   93426   53443  92835K10  92835K10    VPG    12216   3676   
29132303   93427   53445  G3323L10  G3323L10     FN    36268   3826   
29132304   93429   53447  12503M10  12503M10   CBOE   104686   6211   
29132305   93434   53427  78513530  78513530   SANW     2139    139   
29132306   93436   53453  88160R10  88160R10   TSLA  3216517   3711   

            dlycaldt sharetype securitytype  ... tradingstatusflg  \
0         2010-01-04        NS         EQTY  ...                A   
1        

In [1]:
# Merge ccm into TAQ
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *

# Load in TAQ data
taq_data = get_processed_data_dir(source='wrds', dataset='taq')
taq_data_path = taq_data / '30_min_combined.parquet'
taq_df = pd.read_parquet(taq_data_path)

# Load in ccm link data
ccm_data = get_raw_data_dir(source='wrds', dataset='crsp_compustat_link')
ccm_data_path = ccm_data / 'ccm_link.parquet'
ccm_df = pd.read_parquet(ccm_data_path)

# Load in crsp data for share price adjustment
crsp_data = get_processed_data_dir(source='wrds', dataset='crsp')
crsp_data_path = crsp_data / 'crsp_dsf_combined.parquet'
crsp_df = pd.read_parquet(crsp_data_path)
# Confirm dlycaldt is a datetime
crsp_df['dlycaldt'] = pd.to_datetime(crsp_df['dlycaldt'])

## CCM ##
# pre-filter ccm
ccm_df = ccm_df[
    (ccm_df['linktype'].isin(['LC', 'LU'])) &
    (ccm_df['linkprim'] == 'P')
]

# Convert ccm dates 
ccm_df['linkdt'] = pd.to_datetime(ccm_df['linkdt'])
ccm_df['linkenddt'] = pd.to_datetime(ccm_df['linkenddt']).fillna(pd.to_datetime('2099-12-31'))

## TAQ ##
# Convert TAQ dates
taq_df['trade_date'] = pd.to_datetime(taq_df['trade_date'])

## Merge ## 
taq_merged = pd.merge(taq_df, ccm_df, how='left', on='permno')

# Check coverage
total_rows = len(taq_merged)
rows_with_gvkey = taq_merged['gvkey'].notna().sum()
rows_with_iid = taq_merged['iid'].notna().sum()

# Filter to keep ONLY rows with valid CCM links where the TAQ trade date is within the linkdt and linkenddt and the 
# gvkey and iid is there (not na), this turns out to be 87% of the TAQ data we have selected, ~7m rows
taq_merged = taq_merged[
    (taq_merged['trade_date'] >= taq_merged['linkdt']) &
    (taq_merged['trade_date'] <= taq_merged['linkenddt']) &
    (taq_merged['gvkey'].notna()) &
    (taq_merged['iid'].notna())
].copy()

# Just rename the CRSP column to match TAQ
crsp_df = crsp_df.rename(columns={'dlycaldt': 'trade_date'})

# Then merge (no need to rename taq_merged at all)
taq_crsp_merge = pd.merge(taq_merged, crsp_df, how='left', on=['permno', 'trade_date'])

print(taq_crsp_merge)
print(taq_crsp_merge.columns())

# Adjust volume, shares, and price amounts with adjustment function
# crsp_df = adjust_with_cumfacshr(crsp_df)
# crsp_selected_columns = ['dlycaldt', 'permno', 'dlycumfacshr', 'shrout', 'dlycap']
# crsp_adjustment = crsp_df[crsp_selected_columns]

# # Create 30 min returns
# taq_crsp_merge = taq_crsp_merge.sort_values(by=["permno", "trade_date"])
# sort intra day timestamps
# taq_crsp_merge['30_min_ret'] = taq_crsp_merge.groupby('permno')['midpoint'].pct_change()

                  time_m ex sym_root sym_suffix    bid  bidsiz    ask  asksiz  \
0        10:29:58.160000  T       AA       <NA>  15.86       8  15.87      20   
1        10:59:53.222000  T       AA       <NA>   15.8      29  15.81      39   
2        11:29:59.178000  T       AA       <NA>  15.73      22  15.74     111   
3        11:59:24.656000  T       AA       <NA>  15.79      19   15.8      37   
4        12:29:31.719000  Z       AA       <NA>  15.79      25   15.8      20   
...                  ... ..      ...        ...    ...     ...    ...     ...   
7957346  13:59:56.921967  Q       ZI       <NA>  10.44      22  10.45      90   
7957347  14:29:54.449264  Q       ZI       <NA>  10.43      17  10.44      35   
7957348  14:59:47.689206  L       ZI       <NA>  10.42      30  10.43      30   
7957349  15:29:55.454772  L       ZI       <NA>  10.45      37  10.46      28   
7957350  15:59:59.996951  Q       ZI       <NA>  10.52      34  10.53     434   

        qu_cond  qu_seqnum 

TypeError: 'Index' object is not callable

In [None]:
# Merge merge the merged TAQ data into compustat
from pathlib import Path
import pandas as pd

from bearplanes.utils.paths import *
from bearplanes.utils.adjustment import *
from bearplanes.utils.describe_dataframes import *
from bearplanes.data.wrds.crsp.crsp_compustat_link import *

# Load in compustat data 
compustat_file = get_raw_data_dir(source='wrds', dataset='compustat_fundamentals_q')
data_path = compustat_file / 'fundq_combined.parquet'
comp_df = pd.read_parquet(data_path)

# Load in the merged TAQ data with ccm link info

In [None]:
# Factors

# Pull in

# 1) Accruals 
# Calc: The annual change in noncash working capital minus depreciation and amortization expense, divided by 
# average total assets for the previous two fiscal years 
# noncash working capital = actq - chq - lctq
# D&A = dpy
# average total assets = atq.rolling(2).mean()

# 2) Asset Growth 
# Calc: The growth rate of total assets in the previous fiscal year
# = atq

# 3) Composite Equity Issues
# Calc: The 12-month growth in equity market capitalization minus the 12 month cumulative stock return
# 

# 4) Failure Probability
# Calc: Strictly following Campbell, Hilscher, and Szilagyi (2008)
# Cant access

# 5) Gross Profitability
# Calc: (Total Revenue - COGSs) / Total Assets

# 6) Investment to assets 
# Calc: The annual change in inventories scaled by lagged book value of assets

# 7) Momentum 
# Calc: Cumulative return over the past 22 to 252 days 

# 8) Net Operating Assets
# Calc: Operating Assets minus operating liabilities, divided by lagged? 

# 9) Net stock issues
# Calc: The annual log change in split adjusted shares outstanding

# 10) O Score
# Calc: Strictly following Ohlson 1980

# 11) Return on Assets
# Calc: The ratio of quarterly earnings to last quarters earnings

# 12) Beta
# Calc: 

# 13) Book to market
# Calc: The ratio of the book value of common equity to the market value of equity (ceqq / market cap)

# 14) Reversal 
# Calc: Cumulative return over the past 21 days

# 15) Size
# Calc: ln of market cap
