In [None]:
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables through python-dotenv before anything else runs.
load_dotenv()

# CRSP table name and the single yearly extract inspected in this cell.
table = 'stkdistributions'
year = '2024'

# Paths are resolved from the working directory. The notebook is expected to
# run from Strategies/dividend_cuts/, which puts the project root two levels up.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent.joinpath("Data", "crsp_distribution_events")
output_file = OUTPUT_DIR.joinpath(table + "_" + year + ".parquet")

df = pd.read_parquet(output_file)

# Quick look at the data: first rows, then the total row count.
print(df.head(), len(df), sep="\n")

In [None]:
# Trim the raw distribution table down to the fields used in this analysis and
# give them readable names.
#
# The mapping is {raw CRSP column: readable column}. Its insertion order is
# also the column order of the cleaned frame.
columns_to_keep = {
    # --- Identifiers ---
    # CRSP security identifier; the key for matching against other CRSP data.
    'permno': 'permno',
    # Ex-distribution date: the key date for lining up with CRSP stock prices.
    'disexdt': 'ex_date',
    # Sequence number; several distributions can share one date.
    'disseqnbr': 'seq_num',

    # --- Distribution characteristics ---
    # Regular-dividend flag (Y/N).
    'disordinaryflg': 'is_ordinary_div',
    # Distribution type (CD = cash dividend, SD = stock dividend, ...).
    'distype': 'dist_type',
    # Payment frequency (M = monthly, Q = quarterly, ...).
    'disfreqtype': 'freq_type',
    # Detail type.
    'disdetailtype': 'detail_type',

    # --- Payment info ---
    # Currency the payment is made in.
    'dispaymenttype': 'payment_currency',
    # Original currency.
    'disorigcurtype': 'orig_currency',
    # Dividend amount per share.
    'disdivamt': 'dividend_amt',

    # --- Price / share adjustment factors ---
    # Factor used to adjust price.
    'disfacpr': 'price_adj_factor',
    # Factor used to adjust shares.
    'disfacshr': 'shares_adj_factor',

    # --- Additional dates (secondary, occasionally useful) ---
    # Announcement date.
    'disdeclaredt': 'declare_date',
    # Record date: holders as of this date receive the dividend.
    'disrecorddt': 'record_date',
    # Date the dividend is actually paid.
    'dispaydt': 'payment_date',

    # --- Related securities (for stock distributions) ---
    # PERMNO of the security received (stock dividends).
    'dispermno': 'received_permno',
    # PERMCO of the issuer providing the payment.
    'dispermco': 'issuer_permco',
}

# Select the raw columns first, then rename, so df_clean is a new frame.
# distaxtype and disamountsourcetype are not in the mapping, so the selection
# leaves them out without an explicit drop.
df_clean = df[list(columns_to_keep)].rename(columns=columns_to_keep)

# Before/after summary of the cleanup.
print(
    f"Original shape: {df.shape}",
    f"Cleaned shape: {df_clean.shape}",
    f"\nCleaned columns: {df_clean.columns.tolist()}",
    sep="\n",
)
print("\nFirst few rows:", df_clean.head(), sep="\n")
print("\nData types:", df_clean.dtypes, sep="\n")



In [None]:
# Tighten up dtypes: proper datetimes for the date fields, and categoricals
# for the string fields that take only a handful of distinct values.

# 1. Date columns -> datetime
date_columns = ['ex_date', 'declare_date', 'record_date', 'payment_date']
df_clean = df_clean.assign(
    **{name: pd.to_datetime(df_clean[name]) for name in date_columns}
)

# 2. Low-cardinality string columns -> categorical (smaller memory footprint)
categorical_columns = [
    'is_ordinary_div',    # Y/N flag
    'dist_type',          # CD, SD, ... distribution types
    'freq_type',          # M, Q, A, ... frequencies
    'detail_type',        # small set of detail types
    'payment_currency',   # USD, EUR, ... currencies
    'orig_currency',      # same value set as payment_currency
]
df_clean = df_clean.astype({name: 'category' for name in categorical_columns})

# Show the converted frame followed by its dtypes.
print(df_clean.head(), df_clean.dtypes, sep="\n")

In [None]:
# Merge all dividend info into one dataframe, sort, and save

import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables through python-dotenv.
load_dotenv()

table = 'stkdistributions'
start_year = 2010
end_year = 2024

# For notebooks, paths are built relative to the known project structure:
# the notebook lives in Strategies/dividend_cuts/, so the project root is two
# levels above the working directory. This does not depend on the year, so it
# is resolved once rather than on every loop iteration.
current_dir = Path.cwd()
OUTPUT_DIR = current_dir.parent.parent / "Data" / "crsp_distribution_events"

# One parquet extract per calendar year, start_year..end_year inclusive.
yearly_files = [
    OUTPUT_DIR / f"{table}_{yr}.parquet"
    for yr in range(start_year, end_year + 1)
]

# Fail fast and report every missing extract at once, instead of finding the
# first gap only after the earlier years have already been read.
missing_files = [str(path) for path in yearly_files if not path.exists()]
if missing_files:
    raise FileNotFoundError(
        "Missing yearly distribution files:\n" + "\n".join(missing_files)
    )

# Read each year with comprehension-local names so this cell does not rebind
# the notebook-level `df`, `year`, or `output_file` used by the
# single-year cells above.
dataframes = [pd.read_parquet(path) for path in yearly_files]

# Stack all years into one frame with a fresh 0..n-1 index.
distributions_data = pd.concat(dataframes, ignore_index=True)

# Quick look: first rows and the total row count.
print(distributions_data.head())
print(len(distributions_data))