### 0: Retrieve Data

In [1]:
import pandas as pd
import os
import sys

# Add the project root to sys.path (two levels up from app/notebooks)
project_root = os.path.abspath("../..")
sys.path.append(project_root)

print("Current working directory:", os.getcwd())
print("Project root added to sys.path:", project_root)

# Define the data directory at the project root
data_dir = os.path.join(project_root, "data")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Define paths for CSV files
ndt_csv_path = os.path.join(data_dir, "ndt.csv")
returns_csv_path = os.path.join(data_dir, "returns_market_cap.csv")

# Flag to indicate if both CSV files exist
data_loaded = False

if os.path.exists(ndt_csv_path) and os.path.exists(returns_csv_path):
    print("Both CSV files exist. Loading data from CSV...")
    ndt = pd.read_csv(ndt_csv_path, low_memory=False)
    returns_and_market_cap = pd.read_csv(returns_csv_path)
    data_loaded = True
else:
    print("At least one CSV file is missing.")

if not data_loaded:
    from app.services.snowflake_query_executor import SnowflakeQueryExecutor

    # Create a Snowflake query executor
    print("Authorize Snowflake connection with Two-Factor Authentication...")
    executor = SnowflakeQueryExecutor(
        user=os.getenv('USER'),
        password=os.getenv('PASSWORD'),
        account=os.getenv('ACCOUNT'),
        warehouse=os.getenv('WAREHOUSE'),
        database=os.getenv('DATABASE'),
        schema=os.getenv('SCHEMA')
    )

    # Build SQL file paths (located inside app/sql/)
    ndt_sql_query_file_path = os.path.join(project_root, "app", "sql", "fetch_ndt.sql")
    market_cap_returns_sql_query_file_path = os.path.join(project_root, "app", "sql", "fetch_returns_market_cap.sql")

    # Retrieve the non-derivative transactions data if missing
    if not os.path.exists(ndt_csv_path):
        print("Fetching non-derivative transactions data from Snowflake...")
        ndt = executor.execute_query_from_file(ndt_sql_query_file_path)
        ndt.to_csv(ndt_csv_path, index=False)
    else:
        print("Non-derivative transactions CSV already exists.")

    # Retrieve market cap and returns data if missing
    if not os.path.exists(returns_csv_path):
        print("Fetching market cap and returns data from Snowflake...")
        returns_and_market_cap = executor.execute_query_from_file(market_cap_returns_sql_query_file_path)
        returns_and_market_cap.to_csv(returns_csv_path, index=False)
    else:
        print("Market cap and returns CSV already exists.")

Current working directory: /Users/atang/Documents/verdad/insider-transaction-research/app/notebooks
Project root added to sys.path: /Users/atang/Documents/verdad/insider-transaction-research
Both CSV files exist. Loading data from CSV...


In [2]:
print("Non-derivative transaction data:")
print(ndt.head().to_markdown(index=False))
print()
print("Returns and market cap data:")
print(returns_and_market_cap.head().to_markdown(index=False))

Non-derivative transaction data:
| ACCESSION_NUMBER     | TRANS_DATE   | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 |
|:---------------------|:-------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:-------------------------------|
| 0000001750-06-000002 | 2006-01-04   | S            |                      0 |          50000 |                 24.64 | D                        |                 18810    | D                           |       168154 | STORCH DAVID P | Director,Officer        | Chairman, Pres., CEO, Director |
| 0000001750-06-000002 | 2006-01-04   | S            |                      0 |

### 1: Data Preparation and Cleaning

In [3]:
from pandas.tseries.offsets import MonthEnd

# Convert date columns to datetime for consistent handling
# MONTH_END in returns_and_market_cap is the end-of-month date for returns/market cap
returns_and_market_cap['MONTH_END'] = pd.to_datetime(returns_and_market_cap['MONTH_END'])

# TRANS_DATE in ndt is the transaction date; use errors='coerce' to handle invalid dates
ndt['TRANS_DATE'] = pd.to_datetime(ndt['TRANS_DATE'], errors='coerce')

# Create a month-end column in ndt by rounding transaction dates to the nearest month end
ndt['month_end'] = ndt['TRANS_DATE'] + MonthEnd(0)

# Check for and drop rows with invalid transaction dates (NaT)
invalid_dates = ndt['TRANS_DATE'].isna().sum()
if invalid_dates > 0:
    print(f"Warning: Dropped {invalid_dates} rows with invalid TRANS_DATE.")
    ndt = ndt.dropna(subset=['TRANS_DATE'])

# Filter ndt to include only buy ('P') and sell ('S') transactions
trades = ndt[ndt['TRANS_CODE'].isin(['P', 'S'])].copy()

# Collapse duplicate rows so each ACCESSION_NUMBER is one trade
trades = trades.groupby('ACCESSION_NUMBER', as_index=False).agg({
    'TRANS_DATE': 'first',
    'TRANS_CODE': 'first',
    'EQUITY_SWAP_INVOLVED': 'first',
    'TRANS_SHARES': 'sum',
    'TRANS_PRICEPERSHARE': 'first',
    'TRANS_ACQUIRED_DISP_CD': 'first',
    'SHRS_OWND_FOLWNG_TRANS': 'first',
    'DIRECT_INDIRECT_OWNERSHIP': 'first',
    'COMPANY_ID': 'first',
    'RPTOWNERNAME': 'first',
    'RPTOWNER_RELATIONSHIP': 'first',
    'RPTOWNER_TITLE': 'first',
    'month_end': 'first'
})

# Calculate the dollar value of each trade (shares * price per share)
trades['trade_value'] = trades['TRANS_SHARES'] * trades['TRANS_PRICEPERSHARE']

print("Sample of trades data after preparation:")
print(trades.head().to_markdown(index=False))

Sample of trades data after preparation:
| ACCESSION_NUMBER     | TRANS_DATE          | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 | month_end           |      trade_value |
|:---------------------|:--------------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:-------------------------------|:--------------------|-----------------:|
| 0000001750-06-000002 | 2006-01-04 00:00:00 | S            |                      0 |         144360 |                 24.64 | D                        |                 18810    | D                           |       168154 | STORCH DAVID P | Director,Officer        | C

### 1.5: Residualize Trade Value on Market Cap 

In [24]:
import numpy as np
import statsmodels.api as sm

# Merge trades with market cap data from returns_and_market_cap on both COMPANY_ID and month_end date
trades_merged = trades.merge(
    returns_and_market_cap[['COMPANY_ID', 'MONTH_END', 'MARKET_CAP_USD']],
    left_on=['COMPANY_ID', 'month_end'],
    right_on=['COMPANY_ID', 'MONTH_END'],
    how='left'
)

# Drop rows where market cap or trade value is missing
trades_merged = trades_merged.dropna(subset=['MARKET_CAP_USD', 'trade_value'])

# Ensure the date columns are datetime
trades_merged['month_end'] = pd.to_datetime(trades_merged['month_end'])
trades_merged['MONTH_END'] = pd.to_datetime(trades_merged['MONTH_END'])

# Filter out rows with non-positive values to avoid log issues
trades_merged = trades_merged[(trades_merged['trade_value'] > 0) & (trades_merged['MARKET_CAP_USD'] > 0)]

# Create log-transformed variables
trades_merged['log_trade_value'] = np.log(trades_merged['trade_value'])
trades_merged['log_market_cap'] = np.log(trades_merged['MARKET_CAP_USD'])

# Prepare regression variables with a constant for the intercept
X = sm.add_constant(trades_merged['log_market_cap'])
y = trades_merged['log_trade_value']

# Fit the regression model
model = sm.OLS(y, X).fit()

# Calculate the residualized trade value (on the log scale)
trades_merged['trade_value_res'] = model.resid

# Display a sample of the results
print("Sample of trades data with residualized trade value (trade_value_res):")
print(trades_merged.head().to_markdown(index=False))

Sample of trades data with residualized trade value (trade_value_res):
| ACCESSION_NUMBER     | TRANS_DATE          | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 | month_end           |      trade_value | MONTH_END           |   MARKET_CAP_USD |   log_trade_value |   log_market_cap |   trade_value_res |
|:---------------------|:--------------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:-------------------------------|:--------------------|-----------------:|:--------------------|-----------------:|------------------:|-----------------:|------------------:|
| 0000001750-06-000002 | 2006-01-04 00:00

## 2: Market Returns Calculation

In [34]:
import numpy as np
import pandas as pd

# Assume returns_and_market_cap is your DataFrame with columns:
# 'MONTH_END', 'MARKET_CAP_USD', 'RETURN_LEAD_1_MONTHS', 
# 'RETURN_LEAD_3_MONTHS', 'RETURN_LEAD_12_MONTHS'

# Drop rows with missing MARKET_CAP_USD for a clean weighting process
df = returns_and_market_cap.dropna(subset=['MARKET_CAP_USD']).copy()

# Compute market-cap weighted returns per MONTH_END using np.average for each horizon
weighted_returns = df.groupby('MONTH_END').apply(
    lambda group: pd.Series({
        'mkt_1': np.average(group['RETURN_LEAD_1_MONTHS'], weights=group['MARKET_CAP_USD']),
        'mkt_3': np.average(group['RETURN_LEAD_3_MONTHS'], weights=group['MARKET_CAP_USD']),
        'mkt_12': np.average(group['RETURN_LEAD_12_MONTHS'], weights=group['MARKET_CAP_USD']),
    }),
    include_groups=False
).reset_index()

# Build a dictionary of market returns, renaming columns to match your original naming
market_returns = {}
for horizon in ['1', '3', '12']:
    col = 'mkt_' + horizon
    market_returns[f'market_return_{horizon}m'] = (
        weighted_returns[['MONTH_END', col]]
        .rename(columns={col: f'market_return_{horizon}m'})
    )

# Print sample market returns for each horizon
for horizon in ['1', '3', '12']:
    key = f'market_return_{horizon}m'
    print(f"Sample of {key} market returns:")
    print(market_returns[key].head().to_markdown(index=False))

# Merge the weighted market returns back into the main DataFrame
df = df.merge(weighted_returns, on='MONTH_END', how='left')

# Calculate excess returns: individual returns minus the market weighted return
df['excess_return_1m']  = df['RETURN_LEAD_1_MONTHS']  - df['mkt_1']
df['excess_return_3m']  = df['RETURN_LEAD_3_MONTHS']  - df['mkt_3']
df['excess_return_12m'] = df['RETURN_LEAD_12_MONTHS'] - df['mkt_12']

# Create size quartiles based on MARKET_CAP_USD
df['SIZE_QUARTILE'] = pd.qcut(df['MARKET_CAP_USD'], q=4, labels=[1, 2, 3, 4])

# Convert MONTH_END to datetime (if needed)
df['MONTH_END'] = pd.to_datetime(df['MONTH_END'], errors='coerce')

return_df_final = df

# Print a sample of the final DataFrame with excess returns
print("Sample of final DataFrame with excess returns:")
print(return_df_final.head().to_markdown(index=False))

Sample of market_return_1m market returns:
| MONTH_END           |   market_return_1m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |          0.0886072 |
| 1962-02-28 00:00:00 |         -0.0697672 |
| 1962-03-31 00:00:00 |          0.0249994 |
| 1962-04-30 00:00:00 |         -0.414634  |
| 1962-05-31 00:00:00 |          0.0520832 |
Sample of market_return_3m market returns:
| MONTH_END           |   market_return_3m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |           0.037974 |
| 1962-02-28 00:00:00 |          -0.44186  |
| 1962-03-31 00:00:00 |          -0.36875  |
| 1962-04-30 00:00:00 |          -0.256097 |
| 1962-05-31 00:00:00 |           0.375    |
Sample of market_return_12m market returns:
| MONTH_END           |   market_return_12m |
|:--------------------|--------------------:|
| 1962-01-31 00:00:00 |           -0.319621 |
| 1962-02-28 00:00:00 |           -0.331395 |
| 1962-03-31 00:00:00 |           -0.234376 |
| 1962-04-

## 3: Merging Trades with Returns Data

In [35]:
# Merge trades with returns_and_market_cap to align trade data with returns and market cap
# Use left join to keep all trades, even if no matching returns data exists
merged = trades_merged.merge(return_df_final, 
                      left_on=['COMPANY_ID', 'MONTH_END'], 
                      right_on=['COMPANY_ID', 'MONTH_END'], 
                      how='left')#.drop(columns='MONTH_END')  # Drop redundant MONTH_END column

print("Sample of merged data with excess returns:")
print(merged.head().to_markdown(index=False))

Sample of merged data with excess returns:
| ACCESSION_NUMBER     | TRANS_DATE          | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 | month_end           |      trade_value | MONTH_END           |   MARKET_CAP_USD_x |   log_trade_value |   log_market_cap |   trade_value_res |   RETURN_LEAD_1_MONTHS |   RETURN_LEAD_3_MONTHS |   RETURN_LEAD_12_MONTHS |   MARKET_CAP_USD_y |       mkt_1 |      mkt_3 |   mkt_12 |   excess_return_1m |   excess_return_3m |   excess_return_12m |   SIZE_QUARTILE |
|:---------------------|:--------------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:------------------------

## 4: Net Trade Value and Quintile Assignment

In [36]:
import time 

# Start timing this section for performance tracking
start = time.perf_counter()

# Compute total trade value by company, month, and transaction type (P or S) in one pass
# Use unstack to pivot P and S into columns, filling missing values with 0
trade_values = trades_merged.groupby(['COMPANY_ID', 'month_end', 'TRANS_CODE'])['trade_value_res'].sum().unstack(fill_value=0)

# Calculate net trade value as buys (P) minus sells (S)
# Use .get() to safely handle cases where 'P' or 'S' might be missing
net_value = (trade_values.get('P', 0) - trade_values.get('S', 0)).reset_index(name='net_trade_value')

# Merge net trade value back into the merged DataFrame
merged = merged.merge(net_value, on=['COMPANY_ID', 'month_end'], how='left')

# Define a helper function to assign quintiles robustly
# Forces 5 bins using qcut, falls back to equal-range bins if necessary
def safe_qcut(x, q=5):
    try:
        # Attempt to create 5 quantile-based bins
        return pd.qcut(x, q, labels=False, duplicates='drop') + 1
    except ValueError:
        # If qcut fails (e.g., too few unique values), use equal-range bins
        if x.notna().nunique() > 1:
            return pd.cut(x, bins=min(q, x.notna().nunique()), labels=False, include_lowest=True) + 1
        # If only one unique value or all NaN, assign to quintile 1
        return pd.Series(1, index=x.index)

# Assign quintiles based on net_trade_value within each month_end
merged['net_value_quintile'] = merged.groupby('month_end')['net_trade_value'].transform(safe_qcut)

print(f"Net trade value and quintile assignment elapsed: {time.perf_counter() - start:.4f} seconds")
print("Sample of merged data with net trade value and quintiles:")
print(merged[['COMPANY_ID', 'month_end', 'net_trade_value', 'net_value_quintile']].head().to_markdown(index=False))
print(
    "net_value_quintile legend:\n"
    "1 = lowest net trade value (biggest net sellers)\n"
    "2 = lower-middle net trade value\n"
    "3 = middle net trade value\n"
    "4 = upper-middle net trade value\n"
    "5 = highest net trade value (biggest net buyers)"
)

Net trade value and quintile assignment elapsed: 0.1833 seconds
Sample of merged data with net trade value and quintiles:
|   COMPANY_ID | month_end           |   net_trade_value |   net_value_quintile |
|-------------:|:--------------------|------------------:|---------------------:|
|       168154 | 2006-01-31 00:00:00 |          -3.02156 |                    3 |
|       168154 | 2006-03-31 00:00:00 |          -7.81201 |                    1 |
|       168154 | 2006-03-31 00:00:00 |          -7.81201 |                    1 |
|       168154 | 2006-03-31 00:00:00 |          -7.81201 |                    1 |
|       168154 | 2006-04-30 00:00:00 |          -1.27386 |                    3 |
net_value_quintile legend:
1 = lowest net trade value (biggest net sellers)
2 = lower-middle net trade value
3 = middle net trade value
4 = upper-middle net trade value
5 = highest net trade value (biggest net buyers)


## 5: Performance Analysis of Net Trade Value

In [37]:
# Filter to rows with at least one non-null excess return (more permissive than all three)
valid = merged.dropna(subset=[f'excess_return_{h}m' for h in ['1', '3', '12']], how='all').copy()
print(f"Dropped {len(merged) - len(valid)} rows due to missing excess returns.")

# Compute monthly average excess returns by quintile
monthly = valid.groupby(['month_end', 'net_value_quintile'])[[f'excess_return_{h}m' for h in ['1', '3', '12']]].mean().reset_index()

# Define a function to annualize returns
# Uses geometric compounding; falls back to arithmetic mean if cumulative return is non-positive
def annualize(series):
    series = series.dropna()  # Drop NaNs before calculation
    cum_return = (1 + series).prod()  # Cumulative return
    n = len(series)  # Number of periods
    # If positive cumulative return and data exists, annualize geometrically
    if cum_return > 0 and n > 0:
        return cum_return**(12/n) - 1
    # Otherwise, use arithmetic mean annualized (or NaN if no data)
    return series.mean() * 12 if n > 0 else np.nan

# Calculate annualized excess returns per quintile
agg = monthly.groupby('net_value_quintile').agg(**{
    f'ann_excess_{h}m': (f'excess_return_{h}m', annualize) for h in ['1', '3', '12']
}).reset_index()

# Count non-null monthly observations per quintile for each horizon
counts = monthly.groupby('net_value_quintile').agg(**{
    f'count_{h}m': (f'excess_return_{h}m', 'count') for h in ['1', '3', '12']
}).reset_index()

# Calculate hit rate: proportion of months where excess return > 0 (positive performance)
hit = valid.groupby('month_end').apply(
    lambda df: df.assign(**{f'beat_mean_{h}m': df[f'excess_return_{h}m'] > 0 for h in ['1', '3', '12']}),
    include_groups=False  # Exclude group keys from apply
).groupby('net_value_quintile')[[f'beat_mean_{h}m' for h in ['1', '3', '12']]].mean().reset_index().rename(
    columns=lambda c: f'hit_rate_{c[-2:]}' if 'beat' in c else c
)

# Combine all performance metrics into one DataFrame
performance = agg.merge(hit, on='net_value_quintile').merge(counts, on='net_value_quintile').sort_values('net_value_quintile')

print("Performance by net trade value quintile:")
print(performance.to_markdown())

print("\nNotes:")
print("- Annualized returns use geometric compounding; non-positive returns fall back to arithmetic mean * 12.")
print("- Hit rate measures the fraction of months where excess return > 0.")
print("- Counts show the avg number of monthly observations per quintile for each horizon.")

# when you have fat tailed returns the mean is shifted higher 
# when you have returns that aren't logged you will overestimate the return because you mean is dragged to the right
# compound over time go down 20 and back 20 to get the return 

Dropped 0 rows due to missing excess returns.
Performance by net trade value quintile:
|    |   net_value_quintile |   ann_excess_1m |   ann_excess_3m |   ann_excess_12m |   hit_rate_1m |   hit_rate_3m |   hit_rate_2m |   count_1m |   count_3m |   count_12m |
|---:|---------------------:|----------------:|----------------:|-----------------:|--------------:|--------------:|--------------:|-----------:|-----------:|------------:|
|  0 |                    1 |      0.0315202  |       0.163099  |         0.100343 |      0.516923 |      0.474913 |      0.406988 |        264 |        264 |         261 |
|  1 |                    2 |      0.00909527 |       0.0315588 |         0.117719 |      0.491279 |      0.477047 |      0.46343  |        241 |        241 |         238 |
|  2 |                    3 |      0.0427328  |       0.0180076 |         0.224038 |      0.489858 |      0.481519 |      0.466862 |        249 |        249 |         246 |
|  3 |                    4 |     -0.0215705  | 

### Analyisis:
- Best short‑term performance (1–3m) comes from the largest net buyers (Quintile 1).
- Worst performance is clearly in Quintile 5 (biggest net sellers).
- Over 12 months, middle quintiles (2–3) still outperform extreme sellers.

In [None]:
from scipy.stats import ttest_1samp

# Filter valid observations
valid = valid.copy()

results = []
for quintile, df in valid.groupby('net_value_quintile'):
    for horizon in ['1', '3', '12']:
        series = df[f'excess_return_{horizon}m'].dropna()
        if len(series) < 5:
            # Skip tiny samples
            continue
        t_stat, p_value = ttest_1samp(series, 0)
        results.append({
            'net_value_quintile': quintile,
            'horizon_months': int(horizon),
            'mean_excess_return': series.mean(),
            't_stat': t_stat,
            'p_value': p_value,
            'n_obs': len(series)
        })

ttest_df = pd.DataFrame(results).sort_values(['net_value_quintile','horizon_months'])

print("One-sample t-test of excess returns vs zero by quintile:")
print(ttest_df.to_markdown(index=False))

## Assign monthly size quintiles to market cap

In [None]:
# Ensure that the merged dataframe has a size_quintile assignment based on MARKET_CAP_USD for each month
merged['size_quintile'] = (
    merged
    .groupby('month_end')['MARKET_CAP_USD']
    .transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop') + 1)
)

# (Optional) Recreate the valid dataset if needed so that it carries over the new column.
valid = merged.dropna(subset=[f'excess_return_{h}m' for h in ['1', '3', '12']], how='all').copy()

print("Sample of valid data with size_quintile assigned:")
print(valid[['COMPANY_ID', 'month_end', 'MARKET_CAP_USD', 'size_quintile']].head().to_markdown(index=False))
print(
    "size_quintile legend:\n"
    "1 = smallest 20% of companies by market cap\n"
    "2 = next 20%\n"
    "3 = middle 20%\n"
    "4 = next 20%\n"
    "5 = largest 20% of companies by market cap"
)

## Compute performance by (size × net‑value) buckets

In [None]:
# 1️. Ensure size quintiles are assigned in the merged dataframe
merged['size_quintile'] = (
    merged
    .groupby('month_end')['MARKET_CAP_USD']
    .transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop') + 1)
)

# Recreate the valid dataset so it carries the new 'size_quintile'
valid = merged.dropna(subset=[f'excess_return_{h}m' for h in ['1', '3', '12']], how='all').copy()

# 2️. Aggregate monthly excess returns by month, size_quintile, and net_value_quintile.
monthly_double = (
    valid.groupby(['month_end', 'size_quintile', 'net_value_quintile'])[
        [f'excess_return_{h}m' for h in ['1', '3', '12']]
    ]
    .mean()
    .reset_index()
)

# Function to annualize returns (geometric compounding or arithmetic fallback)
def annualize(series):
    series = series.dropna()  # Remove missing values
    cum_return = (1 + series).prod()  # Cumulative return
    n = len(series)  # Number of periods
    if cum_return > 0 and n > 0:
        return cum_return**(12/n) - 1  # Geometric annualization
    return series.mean() * 12 if n > 0 else np.nan  # Fallback: arithmetic

# Aggregate performance metrics for each double-sorted bucket
def agg_perf(df):
    return pd.Series({
        **{f'ann_excess_{h}m': annualize(df[f'excess_return_{h}m']) for h in ['1', '3', '12']},
        **{f'hit_rate_{h}m': (df[f'excess_return_{h}m'] > 0).mean() for h in ['1', '3', '12']},
        'n_obs': len(df)  # Number of monthly observations in this bucket
    })

# Group by size and net trade quintile, applying agg_perf while excluding grouping columns (fixes deprecation warning)
double_perf = (
    monthly_double
    .groupby(['size_quintile', 'net_value_quintile'])
    .apply(agg_perf, include_groups=False)
    .reset_index()
    .sort_values(['size_quintile', 'net_value_quintile'])
)

print("Double-sorted performance (by size quintile and net trade quintile):")
print(double_perf.to_markdown(index=False))

- Hit rate ~50% across almost every bucket means none consistently beat the market far more often than they lose. 
- Small stocks that sold the most (size_quintile 1 & net_value_quintile 1) had the strongest short‑term (1‑month) and especially long‑term (12‑month) excess returns.
- Big buyers (net_value_quintile 5) tended to underperform the market, especially in small firms.
- Some medium‑large buckets (size 5, net_value_quintile 2–3) show modest positive returns and hit rates above 55% over 12 months.

##### Overall, selling activity in small companies appears linked to higher future returns, but the pattern weakens for larger firms and isn’t ironclad (hit rates hover around 50%).

## pre‑2015 vs post‑2015 Does the “buyer premium” hold throughout?

In [None]:
# Add a new column "period" to indicate pre‑2015 vs post‑2015
valid['period'] = np.where(valid['month_end'] < pd.Timestamp('2015-01-01'), 'Pre-2015', 'Post-2015')

# Check how many observations are in each period
print("Observations by period:")
print(valid['period'].value_counts())

## Compute Performance Metrics by Period and Net Trade Quintile

In [None]:
# Aggregate monthly average excess returns by month, period, and net trade quintile
monthly_period = valid.groupby(['month_end', 'period', 'net_value_quintile'])[
    [f'excess_return_{h}m' for h in ['1', '3', '12']]
].mean().reset_index()

# Use the same annualization function as before
def annualize(series):
    series = series.dropna()  # Remove missing values
    cum_return = (1 + series).prod()  # Cumulative return
    n = len(series)
    if cum_return > 0 and n > 0:
        return cum_return**(12/n) - 1
    return series.mean() * 12 if n > 0 else np.nan

# Define an aggregation function for performance metrics
def agg_perf(df):
    return pd.Series({
        **{f'ann_excess_{h}m': annualize(df[f'excess_return_{h}m']) for h in ['1', '3', '12']},
        **{f'hit_rate_{h}m': (df[f'excess_return_{h}m'] > 0).mean() for h in ['1', '3', '12']},
        'n_obs': len(df)
    })

# Compute performance metrics for each period and net trade quintile
performance_period = (
    monthly_period
    .groupby(['period', 'net_value_quintile'])
    .apply(agg_perf, include_groups=False)
    .reset_index()
    .sort_values(['period', 'net_value_quintile'])
)

print("Performance by net trade quintile and period:")
print(performance_period.to_markdown(index=False))

## Run T‑Tests for Excess Returns by Period and Quintile

In [None]:
from scipy.stats import ttest_1samp

results_period = []
# Group first by period, then by net trade quintile
for period, period_df in valid.groupby('period'):
    for quintile, df in period_df.groupby('net_value_quintile'):
        for horizon in ['1', '3', '12']:
            series = df[f'excess_return_{horizon}m'].dropna()
            if len(series) < 5:  # Skip tiny samples
                continue
            t_stat, p_value = ttest_1samp(series, 0)
            results_period.append({
                'period': period,
                'net_value_quintile': quintile,
                'horizon_months': int(horizon),
                'mean_excess_return': series.mean(),
                't_stat': t_stat,
                'p_value': p_value,
                'n_obs': len(series)
            })

ttest_df_period = pd.DataFrame(results_period).sort_values(['period', 'net_value_quintile', 'horizon_months'])
print("One-sample t-test of excess returns vs zero by period and quintile:")
print(ttest_df_period.to_markdown(index=False))