In [None]:
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Pull in any variables defined in a local .env file before touching data paths.
load_dotenv()

# Which yearly extract to inspect.
table = 'crsp_dsf' # is actually crspq.dsf_v2
year = '2024'

# Paths are resolved from the kernel's working directory. The notebook lives in
# Strategies/dividend_cuts/, so two levels up is the project root, and the yearly
# CRSP parquet files sit under <root>/Data/crsp.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent.joinpath("Data", "crsp")
output_file = OUTPUT_DIR.joinpath(f"{table}_{year}.parquet")

df = pd.read_parquet(output_file)

# Quick look at the frame: first rows, then column labels, then dtypes.
for preview in (df.head(), df.columns, df.dtypes):
    print(preview)

In [1]:
import pandas as pd

from bearplanes.utils.describe_dataframes import DescribeDataframes
from bearplanes.utils.paths import (get_data_dir, get_processed_data_dir, get_project_root)

# Directory returned by the project helper for the ("wrds", "crsp") data set; the
# combined daily stock file is read from there.
crsp_dir = get_processed_data_dir("wrds","crsp")
# Plain string: the name has no placeholders, so an f-string prefix is not needed.
file_path = crsp_dir / "crsp_dsf_combined.parquet"

df = pd.read_parquet(file_path)

# Column whose missing values are profiled below. Named once so that all five
# checks are guaranteed to look at the same attribute.
MISSING_ATTRIBUTE = 'dlycumfacshr'

# df.cust.describe()
# NOTE(review): `cust` is not a built-in pandas attribute — presumably a DataFrame
# accessor registered by importing DescribeDataframes; confirm in bearplanes.utils.
# Each call prints its own report section (see the cell output).
df.cust.missing_check_permno(attribute=MISSING_ATTRIBUTE)
df.cust.missing_check_mktcap(attribute=MISSING_ATTRIBUTE)
df.cust.missing_check_timeseries(attribute=MISSING_ATTRIBUTE)
df.cust.missing_check_tradingstatus(attribute=MISSING_ATTRIBUTE)
df.cust.missing_check_sharetype(attribute=MISSING_ATTRIBUTE)


MISSING DATA ANALYSIS: dlycumfacshr
Total missing values: 7,553 (0.03%)
Missing across 7,553 unique securities (43.65%)
Total securities: 17,303

MARKET CAP ANALYSIS FOR MISSING: dlycumfacshr
Total records with missing dlycumfacshr: 7,553
Records with missing market cap: 7,553
------------------------------------------------------------
Market Cap Distribution (for non-NA caps):
count   0.00
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: dlycap, dtype: float64

TIME SERIES MISSING DATA ANALYSIS: dlycumfacshr
Total missing records: 7,553
Date range: 2010-01-04 to 2024-12-31
Days with missing: 2,834 / 3,774
Average missing per affected day: 2.67
------------------------------------------------------------
Most recent 20 dates with missing data:
dlycaldt
2024-11-29     5
2024-12-02     3
2024-12-03     4
2024-12-04     1
2024-12-06     4
2024-12-09     3
2024-12-10     1
2024-12-11     3
2024-12-12     2
2024-12-13     4
2024-12-16     9


In [None]:

# ============================================================
# BASIC INFORMATION
# ============================================================
# Dimensions: 29,132,307 rows × 24 columns
# ------------------------------------------------------------
# Column Data Types:
# permno                       int32
# permco                       int32
# hdrcusip            string[python]
# cusip               string[python]
# ticker              string[python]
# shrout                       Int32
# siccd                        int16
# dlycaldt            string[python]
# sharetype           string[python]
# securitytype              category
# securitysubtype           category
# usincflg                  category
# primaryexch         string[python]
# conditionaltype     string[python]
# tradingstatusflg    string[python]
# dlycap                     float64
# dlycapflg                 category
# dlydistretflg       string[python]
# dlyvol                     float32
# dlyopen                    float64
# dlyhigh                    float64
# dlylow                     float64
# dlyclose                   float64
# dlycumfacshr               float64
# dtype: object
# ============================================================

# ============================================================
# MEMORY USAGE
# ============================================================
# Total memory: 15268.52 MB
# ============================================================

# ============================================================
# MISSING VALUES ANALYSIS
# ============================================================
#      column_name  missing_count  percent_missing
#           permno              0         0.000000
#           permco              0         0.000000
#         hdrcusip              0         0.000000
#            cusip           7553         0.025927
#           ticker         219728         0.754242
#           shrout           7553         0.025927
#            siccd              0         0.000000
#         dlycaldt              0         0.000000
#        sharetype              8         0.000027
#     securitytype              8         0.000027
#  securitysubtype              8         0.000027
#         usincflg              8         0.000027
#      primaryexch              8         0.000027
#  conditionaltype              8         0.000027
# tradingstatusflg              8         0.000027
#           dlycap         226531         0.777594
#        dlycapflg              0         0.000000
#    dlydistretflg              0         0.000000
#           dlyvol         226545         0.777642
#          dlyopen        1017593         3.493005
#          dlyhigh        1017468         3.492576
#           dlylow        1017468         3.492576
#         dlyclose        1017468         3.492576
#     dlycumfacshr           7553         0.025927
# ============================================================

# ============================================================
# EXACT DUPLICATES
# ============================================================
# Found 0 duplicate rows (0.00%)
# ------------------------------------------------------------
# No duplicates found
# ============================================================

# ============================================================
# NUMERIC COLUMNS ANALYSIS
# ============================================================
#                      min           max   na_count  na_percent
# permno        10001.0000  9.343600e+04        0.0    0.000000
# permco            5.0000  6.012300e+04        0.0    0.000000
# shrout            2.0000  2.920640e+07     7553.0    0.025927
# siccd             0.0000  9.999000e+03        0.0    0.000000
# dlycap            8.0700  3.915300e+09   226531.0    0.777594
# dlyvol            0.0000  3.363241e+09   226545.0    0.777642
# dlyopen           0.0001  7.300908e+05  1017593.0    3.493005
# dlyhigh           0.0014  7.419714e+05  1017468.0    3.492576
# dlylow            0.0001  7.230500e+05  1017468.0    3.492576
# dlyclose          0.0006  7.240400e+05  1017468.0    3.492576
# dlycumfacshr      0.0000  1.920000e+02     7553.0    0.025927
# ============================================================

# ============================================================
# CATEGORICAL/STRING COLUMNS ANALYSIS
# ============================================================

# [hdrcusip]
#   Missing: 0 (0.00%)
#   Unique values: 17,303
#   Top 10 values:
#     77467X10: 3,774
#     72913210: 3,774
#     92204A60: 3,774
#     92204A88: 3,774
#     92290855: 3,774
#     85916J40: 3,774
#     64420610: 3,774
#     96041310: 3,774
#     G2125H10: 3,774
#     20260810: 3,774
# ------------------------------------------------------------

# [cusip]
#   Missing: 7,553 (0.03%)
#   Unique values: 22,648
#   Top 10 values:
#     03050610: 3,774
#     91307C10: 3,774
#     82620P10: 3,774
#     52610710: 3,774
#     28225C80: 3,774
#     M8187310: 3,774
#     48203R10: 3,774
#     M1534210: 3,774
#     89011010: 3,774
#     05766520: 3,774
# ------------------------------------------------------------

# [ticker]
#   Missing: 219,728 (0.75%)
#   Unique values: 18,145
#   Top 10 values:
#     GTN: 7,548
#     HEI: 7,548
#     TAP: 7,548
#     LEN: 7,548
#     MKC: 7,548
#     BIO: 7,548
#     BF: 7,548
#     CRD: 7,548
#     BRK: 7,548
#     HVT: 7,548
# ------------------------------------------------------------

# [dlycaldt]
#   Missing: 0 (0.00%)
#   Unique values: 3,774
#   Top 10 values:
#     2024-12-23: 9,762
#     2024-12-20: 9,760
#     2024-12-30: 9,755
#     2024-12-24: 9,753
#     2024-12-27: 9,752
#     2024-12-31: 9,751
#     2024-12-26: 9,748
#     2024-12-19: 9,748
#     2024-12-18: 9,746
#     2024-12-16: 9,744
# ------------------------------------------------------------

# [sharetype]
#   Missing: 8 (0.00%)
#   Unique values: 6
#   Top 10 values:
#     NS: 25,923,233
#     AD: 1,378,645
#     SB: 1,269,139
#     UG: 549,641
#     N/A: 7,545
#     CE: 4,096
# ------------------------------------------------------------

# [securitytype]
#   Missing: 8 (0.00%)
#   Unique values: 3
#   Top 10 values:
#     EQTY: 19,585,316
#     FUND: 9,539,438
#     N/A: 7,545
# ------------------------------------------------------------

# [securitysubtype]
#   Missing: 8 (0.00%)
#   Unique values: 5
#   Top 10 values:
#     COM: 19,585,316
#     ETF: 7,276,149
#     CEF: 2,158,750
#     ETV: 104,539
#     UNK: 7,545
# ------------------------------------------------------------

# [usincflg]
#   Missing: 8 (0.00%)
#   Unique values: 2
#   Top 10 values:
#     Y: 25,325,487
#     N: 3,806,812
# ------------------------------------------------------------

# [primaryexch]
#   Missing: 8 (0.00%)
#   Unique values: 7
#   Top 10 values:
#     Q: 11,948,203
#     N: 9,419,875
#     R: 5,333,676
#     A: 1,275,075
#     B: 935,499
#     X: 219,720
#     I: 251
# ------------------------------------------------------------

# [conditionaltype]
#   Missing: 8 (0.00%)
#   Unique values: 4
#   Top 10 values:
#     RW: 28,912,503
#     NT: 212,175
#     N/A: 7,545
#     NW: 76
# ------------------------------------------------------------

# [tradingstatusflg]
#   Missing: 8 (0.00%)
#   Unique values: 5
#   Top 10 values:
#     A: 28,908,020
#     X: 212,175
#     D: 7,545
#     S: 4,222
#     H: 337
# ------------------------------------------------------------

# [dlycapflg]
#   Missing: 0 (0.00%)
#   Unique values: 5
#   Top 10 values:
#     BP: 27,532,156
#     AD: 1,378,645
#     NT: 207,682
#     DE: 7,553
#     MP: 6,271
# ------------------------------------------------------------

# [dlydistretflg]
#   Missing: 0 (0.00%)
#   Unique values: 13
#   Top 10 values:
#     NO: 28,792,476
#     C1: 321,081
#     D1: 5,168
#     S1: 4,262
#     M2: 2,872
#     C2: 2,012
#     O1: 1,607
#     N1: 1,200
#     P1: 599
#     D2: 592
# ------------------------------------------------------------
# ============================================================

In [None]:
# Drop the exchangetier and hdrcusip columns

import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Pull in any variables defined in a local .env file before touching data paths.
load_dotenv()

# Which yearly extract to load.
table = 'crsp_dsf' # is actually crspq.dsf_v2
year = '2024'

# The notebook lives in Strategies/dividend_cuts/, so the project root is two
# levels above the working directory; yearly CRSP files are under <root>/Data/crsp.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent.joinpath("Data", "crsp")
output_file = OUTPUT_DIR.joinpath(f"{table}_{year}.parquet")

# Read the yearly file and discard the two columns in one step; only the
# in-memory frame is affected, nothing is written back to disk here.
df = pd.read_parquet(output_file).drop(columns=['exchangetier', 'hdrcusip'])



In [None]:
# Merge all distribution info into one dataframe, sort, and save

import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Pull in any variables defined in a local .env file before touching data paths.
load_dotenv()

table = 'stkdistributions'
start_year = 2010
end_year = 2024

# For notebooks, construct path relative to known project structure.
# The notebook is in Strategies/dividend_cuts/, so go up 2 levels to project root.
# The directory is identical for every year, so it is resolved once rather than
# on each loop iteration.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent / "Data" / "crsp_distribution_events"

# Read one parquet file per year, start_year through end_year inclusive, in year
# order. A comprehension is used so the per-year frame does not overwrite the
# notebook-level `df` that other cells use for the CRSP daily stock file.
dataframes = [
    pd.read_parquet(OUTPUT_DIR / f"{table}_{year}.parquet")
    for year in range(start_year, end_year + 1)
]

# Stack the yearly frames; ignore_index=True replaces the per-file indexes with a
# single fresh 0..n-1 index.
distributions_data = pd.concat(dataframes, ignore_index=True)

# NOTE(review): this cell's header comment mentions sorting, but no sort is applied —
# rows stay in per-file order, concatenated year by year. Confirm the intended sort
# key before relying on row order.

# print some quick info
print(distributions_data.head())
print(len(distributions_data))

# Save concatenated dataframe next to the yearly inputs
output_file = OUTPUT_DIR / f"{table}_combined.parquet"
distributions_data.to_parquet(output_file)

In [None]:
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Pull in any variables defined in a local .env file before touching data paths.
load_dotenv()

table = 'crsp_dsf'
end_year = 2024

# Project root is two levels above the working directory; the yearly CRSP
# parquet files are stored under <root>/Data/crsp.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent.joinpath("Data", "crsp")
output_file = OUTPUT_DIR.joinpath(f"{table}_{end_year}.parquet")

df = pd.read_parquet(output_file)

# Columns of interest, in the order their NA counts are reported below.
COLUMNS_TO_KEEP = [
    # --- identifiers ---
    'permno',            # unique security ID
    'permco',            # unique company ID
    'hdrcusip',          # header CUSIP (8 chars)
    'cusip',             # full CUSIP
    'ticker',            # ticker symbol
    # --- security descriptors ---
    'shrout',            # shares outstanding
    'siccd',             # SIC code
    'dlycaldt',          # daily calendar date
    'sharetype',         # share type (filter: NS and '')
    'securitytype',      # security type (filter: EQTY)
    'securitysubtype',   # sub type (filter: COM)
    'usincflg',          # US incorporation flag
    'primaryexch',       # primary exchange
    'conditionaltype',   # conditional type (filter: RW)
    'tradingstatusflg',  # trading status
    # --- daily market data ---
    'dlycap',            # daily market cap
    'dlycapflg',         # daily cap flag
    'dlydistretflg',     # distribution return flag
    'dlyvol',            # daily volume
    'dlyopen',           # daily open
    'dlyhigh',           # daily high
    'dlylow',            # daily low
    'dlyclose',          # daily close
    'dlycumfacshr',      # cumulative share adjustment factor (splits/stock dividends only)
]

# One line per column with the number of missing (NA) entries in it.
for col in COLUMNS_TO_KEEP:
    print(f"Column {col} has {df[col].isna().sum()} na values")

# print(df.columns)
# print(df.dtypes)

In [None]:
# permno                       Int64 keep
# hdrcusip            string[python] header cusip 8 chars
# cusip               string[python] cusip
# ticker              string[python] ticker
# shrout                       Int64 shares outstanding
# permco                       Int64 keep 
# siccd                        Int64 sic code
# dlycaldt            string[python] daily calendar date (convert to pandas datetime)
# sharetype           string[python] share type keep (only keep NS and '')
# securitytype        string[python] security type - keep (use to filter out all types but EQTY)
# securitysubtype     string[python] sub type - keep (use to filter for only COM (common))
# usincflg            string[python] US incorporation flag
# primaryexch         string[python] primary exchange
# conditionaltype     string[python] conditional type - keep (only want RW (regular way))
# tradingstatusflg    string[python] trading status - keep (need to use for more specific security by security filtering)
# dlycap                     Float64 daily capitalization
# dlycapflg           string[python] daily capitalization flag
# dlydistretflg       string[python] daily distribution return impact flag (probably NEED)
# dlyvol                     Float64 daily volume
# dlyclose                   Float64 daily close
# dlylow                     Float64 daily low
# dlyhigh                    Float64 daily high 
# dlyopen                    Float64 daily open
# dlycumfacshr               Float64 daily cumulative factor to adjust shares/volume