In [15]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import MonthEnd
import time

DATA_DIR = "../data"

# returns_and_market_cap contains monthly returns and market cap data per company
returns_and_market_cap = pd.read_csv(f"{DATA_DIR}/returns_market_cap.csv")

# ndt contains non-derivative transaction data (e.g., insider trades)
ndt = pd.read_csv(f"{DATA_DIR}/ndt.csv")

In [16]:
print("Non-derivative transaction data:")
print(ndt.head().to_markdown(index=False))
print()
print("Returns and market cap data:")
print(returns_and_market_cap.head().to_markdown(index=False))

Non-derivative transaction data:
| ACCESSION_NUMBER     | TRANS_DATE   | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME     | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE         |
|:---------------------|:-------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:-----------------|:------------------------|:-----------------------|
| 0000076605-17-000121 | 2017-09-27   | S            |                      0 |          10000 |                 83.16 | D                        |          15000           | D                           |       295249 | Cleveland Todd M | Director,Officer        | CEO                    |
| 0001140361-17-037031 | 2017-09-27   | F            |                      0 |          28833 | 

## 1: Data Preparation and Cleaning

In [None]:
# Convert date columns to datetime for consistent handling
# MONTH_END in returns_and_market_cap is the end-of-month date for returns/market cap
returns_and_market_cap['MONTH_END'] = pd.to_datetime(returns_and_market_cap['MONTH_END'])

# TRANS_DATE in ndt is the transaction date; use errors='coerce' to handle invalid dates
ndt['TRANS_DATE'] = pd.to_datetime(ndt['TRANS_DATE'], errors='coerce')

# Create a month-end column in ndt by rounding transaction dates to the nearest month end
ndt['month_end'] = ndt['TRANS_DATE'] + MonthEnd(0)

# Check for and drop rows with invalid transaction dates (NaT)
invalid_dates = ndt['TRANS_DATE'].isna().sum()
if invalid_dates > 0:
    print(f"Warning: Dropped {invalid_dates} rows with invalid TRANS_DATE.")
    ndt = ndt.dropna(subset=['TRANS_DATE'])

# Filter ndt to include only buy ('P') and sell ('S') transactions
trades = ndt[ndt['TRANS_CODE'].isin(['P', 'S'])].copy()

# Collapse duplicate rows so each ACCESSION_NUMBER is one trade
trades = trades.groupby('ACCESSION_NUMBER', as_index=False).agg({
    'TRANS_DATE': 'first',
    'TRANS_CODE': 'first',
    'EQUITY_SWAP_INVOLVED': 'first',
    'TRANS_SHARES': 'sum',
    'TRANS_PRICEPERSHARE': 'first',
    'TRANS_ACQUIRED_DISP_CD': 'first',
    'SHRS_OWND_FOLWNG_TRANS': 'first',
    'DIRECT_INDIRECT_OWNERSHIP': 'first',
    'COMPANY_ID': 'first',
    'RPTOWNERNAME': 'first',
    'RPTOWNER_RELATIONSHIP': 'first',
    'RPTOWNER_TITLE': 'first',
    'month_end': 'first'
})

# Calculate the dollar value of each trade (shares * price per share)
trades['trade_value'] = trades['TRANS_SHARES'] * trades['TRANS_PRICEPERSHARE']

print("Sample of trades data after preparation:")
print(trades.head().to_markdown(index=False))

Sample of trades data after preparation:
| ACCESSION_NUMBER     | TRANS_DATE          | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 | month_end           |      trade_value |
|:---------------------|:--------------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:-------------------------------|:--------------------|-----------------:|
| 0000001750-06-000002 | 2006-01-04 00:00:00 | S            |                      0 |         144360 |                 24.64 | D                        |                  6876.17 | D                           |       168154 | STORCH DAVID P | Director,Officer        | C

## 2: Market Returns Calculation

In [18]:
# Compute total market capitalization per month for weighting returns
# Sum MARKET_CAP_USD across all companies for each MONTH_END
total_cap = returns_and_market_cap.groupby('MONTH_END')['MARKET_CAP_USD'].sum().rename('total_cap')

# Join total_cap back to returns_and_market_cap for weight calculation
returns_and_market_cap = returns_and_market_cap.join(total_cap, on='MONTH_END')

# Calculate each company's weight as its market cap divided by total market cap
returns_and_market_cap['weight'] = returns_and_market_cap['MARKET_CAP_USD'] / returns_and_market_cap['total_cap']

# Initialize a dictionary to store market returns for different horizons (1, 3, 12 months)
market_returns = {}

# Compute weighted market returns for each horizon
# For each month, multiply individual company returns by their weights and sum
for horizon in ['1', '3', '12']:
    market_returns[f'market_return_{horizon}m'] = returns_and_market_cap.groupby('MONTH_END').apply(
        lambda df: np.sum(df[f'RETURN_LEAD_{horizon}_MONTHS'] * df['weight']),
        include_groups=False  # Avoid including group keys in the apply function
    ).reset_index(name=f'market_return_{horizon}m')

for horizon in ['1', '3', '12']:
    print(f"Sample of market_return_{horizon}m market returns:")
    print(market_returns[f'market_return_{horizon}m'].head().to_markdown(index=False))

Sample of market_return_1m market returns:
| MONTH_END           |   market_return_1m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |          0.0886072 |
| 1962-02-28 00:00:00 |         -0.0697672 |
| 1962-03-31 00:00:00 |          0.0249994 |
| 1962-04-30 00:00:00 |         -0.414634  |
| 1962-05-31 00:00:00 |          0.0520832 |
Sample of market_return_3m market returns:
| MONTH_END           |   market_return_3m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |           0.037974 |
| 1962-02-28 00:00:00 |          -0.44186  |
| 1962-03-31 00:00:00 |          -0.36875  |
| 1962-04-30 00:00:00 |          -0.256097 |
| 1962-05-31 00:00:00 |           0.375    |
Sample of market_return_12m market returns:
| MONTH_END           |   market_return_12m |
|:--------------------|--------------------:|
| 1962-01-31 00:00:00 |           -0.319621 |
| 1962-02-28 00:00:00 |           -0.331395 |
| 1962-03-31 00:00:00 |           -0.234376 |
| 1962-04-

## 3: Merging Trades with Returns Data

In [19]:
# Merge trades with returns_and_market_cap to align trade data with returns and market cap
# Use left join to keep all trades, even if no matching returns data exists
merged = trades.merge(returns_and_market_cap, 
                      left_on=['COMPANY_ID', 'month_end'], 
                      right_on=['COMPANY_ID', 'MONTH_END'], 
                      how='left').drop(columns='MONTH_END')  # Drop redundant MONTH_END column

# Merge in the precomputed market returns for each horizon
for horizon, df in market_returns.items():
    merged = merged.merge(df, 
                          left_on='month_end', 
                          right_on='MONTH_END', 
                          how='left').drop(columns='MONTH_END')

# Calculate excess returns for each horizon (company return - market return)
for horizon in ['1', '3', '12']:
    merged[f'excess_return_{horizon}m'] = merged[f'RETURN_LEAD_{horizon}_MONTHS'] - merged[f'market_return_{horizon}m']

print("Sample of merged data with excess returns:")
print(merged[['COMPANY_ID', 'month_end', 'TRANS_CODE', 'excess_return_1m', 'excess_return_3m', 'excess_return_12m']].head().to_markdown(index=False))

Sample of merged data with excess returns:
|   COMPANY_ID | month_end           | TRANS_CODE   |   excess_return_1m |   excess_return_3m |   excess_return_12m |
|-------------:|:--------------------|:-------------|-------------------:|-------------------:|--------------------:|
|       168154 | 2006-01-31 00:00:00 | S            |          0.0584863 |          0.0745878 |           0.0870041 |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-04-30 00:00:00 | S            |         -0.0544755 |         -0.0779137 |          -0.0112499 |


## 4.1: Cluster Buying and Quintile Assignment

In [20]:
# Start timing this section for performance tracking
start = time.perf_counter()

# Identify only buy transactions (TRANS_CODE 'P')
buy_trades = trades[trades['TRANS_CODE'] == 'P'].copy()

# Count the number of unique insiders (using RPTOWNERNAME) buying in each company/month
cluster_counts = buy_trades.groupby(['COMPANY_ID', 'month_end'])['RPTOWNERNAME'] \
    .nunique() \
    .reset_index(name='num_buyers')

# Merge the cluster counts back into the merged DataFrame (fill missing values with 0)
merged = merged.merge(cluster_counts, on=['COMPANY_ID', 'month_end'], how='left')
merged['num_buyers'] = merged['num_buyers'].fillna(0)

# Flag cluster buying: a company-month is considered to have cluster buying if there are 2 or more unique buyers
merged['cluster_buy'] = merged['num_buyers'] > 1

# For a more granular measure, assign quintiles based on num_buyers within each month_end.
# For months with at least one buyer, this ranks the intensity of cluster buying.
def assign_quintiles(x):
    # Only quintile months with at least one buyer
    positives = x[x > 0]
    quintiles = pd.qcut(positives, 5, labels=False, duplicates='drop') + 1
    result = pd.Series(0, index=x.index)            # default 0 for no-buy
    result.loc[positives.index] = quintiles
    return result

merged['cluster_quintile'] = merged.groupby('month_end')['num_buyers'] \
    .transform(assign_quintiles)

# For months with no buyers, cluster_quintile might be missing; fill those with 0 (or another indicator)
merged['cluster_quintile'] = merged['cluster_quintile'].fillna(0)

print(f"Cluster buying measures computed in {time.perf_counter() - start:.4f} seconds")
print("Sample of merged data with cluster buying measures:")
print(merged[['COMPANY_ID', 'month_end', 'num_buyers', 'cluster_buy', 'cluster_quintile']].head().to_markdown(index=False))

print(
    "cluster_buy flag: True indicates 2 or more unique insider buyers in that company/month.\n"
    "cluster_quintile: 0 = no buys; 1-5 indicate increasing intensity of cluster buying among months with buyers."
)


Cluster buying measures computed in 0.2828 seconds
Sample of merged data with cluster buying measures:
|   COMPANY_ID | month_end           |   num_buyers | cluster_buy   |   cluster_quintile |
|-------------:|:--------------------|-------------:|:--------------|-------------------:|
|       168154 | 2006-01-31 00:00:00 |            0 | False         |                  0 |
|       168154 | 2006-03-31 00:00:00 |            0 | False         |                  0 |
|       168154 | 2006-03-31 00:00:00 |            0 | False         |                  0 |
|       168154 | 2006-03-31 00:00:00 |            0 | False         |                  0 |
|       168154 | 2006-04-30 00:00:00 |            0 | False         |                  0 |
cluster_buy flag: True indicates 2 or more unique insider buyers in that company/month.
cluster_quintile: 0 = no buys; 1-5 indicate increasing intensity of cluster buying among months with buyers.


## 4.2: Cluster Selling and Quintile Assignment

In [21]:
# Start timing this section for performance tracking
start = time.perf_counter()

# Identify only sell transactions (TRANS_CODE 'S')
sell_trades = trades[trades['TRANS_CODE'] == 'S'].copy()

# Count the number of unique insiders (using RPTOWNERNAME) selling in each company/month
cluster_sell_counts = sell_trades.groupby(['COMPANY_ID', 'month_end'])['RPTOWNERNAME'] \
    .nunique() \
    .reset_index(name='num_sellers')

# Merge the cluster sell counts back into the merged DataFrame (fill missing values with 0)
merged = merged.merge(cluster_sell_counts, on=['COMPANY_ID', 'month_end'], how='left')
merged['num_sellers'] = merged['num_sellers'].fillna(0)

# Flag cluster selling: a company-month is considered to have cluster selling if there are 2 or more unique sellers
merged['cluster_sell'] = merged['num_sellers'] > 1

# Assign quintiles based on num_sellers within each month_end
def assign_sell_quintiles(x):
    positives = x[x > 0]
    quintiles = pd.qcut(positives, 5, labels=False, duplicates='drop') + 1
    result = pd.Series(0, index=x.index)
    result.loc[positives.index] = quintiles
    return result

merged['cluster_sell_quintile'] = merged.groupby('month_end')['num_sellers'] \
    .transform(assign_sell_quintiles)

# Fill missing cluster_sell_quintile (months with zero sellers) with 0
merged['cluster_sell_quintile'] = merged['cluster_sell_quintile'].fillna(0)

print(f"Cluster selling measures computed in {time.perf_counter() - start:.4f} seconds")
print("Sample of merged data with cluster selling measures:")
print(merged[['COMPANY_ID', 'month_end', 'num_sellers', 'cluster_sell', 'cluster_sell_quintile']].head().to_markdown(index=False))

print(
    "cluster_sell flag: True indicates ≥2 unique insider sellers in that company/month.\n"
    "cluster_sell_quintile: 0 = no sells; 1–5 indicate increasing intensity of cluster selling among months with sellers."
)

Cluster selling measures computed in 0.3857 seconds
Sample of merged data with cluster selling measures:
|   COMPANY_ID | month_end           |   num_sellers | cluster_sell   |   cluster_sell_quintile |
|-------------:|:--------------------|--------------:|:---------------|------------------------:|
|       168154 | 2006-01-31 00:00:00 |             1 | False          |                       1 |
|       168154 | 2006-03-31 00:00:00 |             1 | False          |                       1 |
|       168154 | 2006-03-31 00:00:00 |             1 | False          |                       1 |
|       168154 | 2006-03-31 00:00:00 |             1 | False          |                       1 |
|       168154 | 2006-04-30 00:00:00 |             1 | False          |                       1 |
cluster_sell flag: True indicates ≥2 unique insider sellers in that company/month.
cluster_sell_quintile: 0 = no sells; 1–5 indicate increasing intensity of cluster selling among months with sellers.


In [22]:
# Collapse to one row per firm-month so each observation counts once
unique_merged = merged.drop_duplicates(subset=['COMPANY_ID','month_end'])[
    ['COMPANY_ID','month_end','cluster_quintile',
     'excess_return_1m','excess_return_3m','excess_return_12m']
].copy()

unique_sell = merged.drop_duplicates(subset=['COMPANY_ID','month_end'])[
    ['COMPANY_ID','month_end','cluster_sell_quintile',
     'excess_return_1m','excess_return_3m','excess_return_12m']
].copy()


## 5.1: Performance Analysis of Cluster Buying

In [23]:
# Filter to rows with at least one non-null excess return (more permissive than all three)
valid_cluster = unique_merged.dropna(subset=[f'excess_return_{h}m' for h in ['1','3','12']], how='all').copy()
print(f"Dropped {len(merged) - len(valid_cluster)} rows due to missing excess returns.")

# Compute monthly average excess returns by cluster buying quintile
monthly_cluster = valid_cluster.groupby(['month_end', 'cluster_quintile'])[
    [f'excess_return_{h}m' for h in ['1', '3', '12']]
].mean().reset_index()

# Define a function to annualize returns
# Uses geometric compounding; falls back to arithmetic mean if cumulative return is non-positive
def annualize(series):
    series = series.dropna()  # Drop NaNs before calculation
    cum_return = (1 + series).prod()  # Cumulative return
    n = len(series)  # Number of periods
    # If positive cumulative return and data exists, annualize geometrically
    if cum_return > 0 and n > 0:
        return cum_return**(12/n) - 1
    # Otherwise, use arithmetic mean annualized (or NaN if no data)
    return series.mean() * 12 if n > 0 else np.nan

# Calculate annualized excess returns per cluster buying quintile
agg_cluster = monthly_cluster.groupby('cluster_quintile').agg(**{
    f'ann_excess_{h}m': (f'excess_return_{h}m', annualize) for h in ['1', '3', '12']
}).reset_index()

# Count non-null monthly observations per cluster buying quintile for each horizon
counts_cluster = monthly_cluster.groupby('cluster_quintile').agg(**{
    f'count_{h}m': (f'excess_return_{h}m', 'count') for h in ['1', '3', '12']
}).reset_index()

# Calculate hit rate: proportion of months where excess return > 0 (positive performance)
hit_cluster = valid_cluster.groupby('month_end').apply(
    lambda df: df.assign(**{f'beat_mean_{h}m': df[f'excess_return_{h}m'] > 0 for h in ['1', '3', '12']}),
    include_groups=False  # Exclude group keys from apply
).groupby('cluster_quintile')[
    [f'beat_mean_{h}m' for h in ['1', '3', '12']]
].mean().reset_index().rename(
    columns=lambda c: f'hit_rate_{c[-2:]}' if 'beat' in c else c
)

# Combine all performance metrics into one DataFrame
performance_cluster = agg_cluster.merge(hit_cluster, on='cluster_quintile').merge(counts_cluster, on='cluster_quintile').sort_values('cluster_quintile')

print("Performance by cluster buying quintile:")
print(performance_cluster.to_markdown())

print("\nNotes:")
print("- Annualized returns use geometric compounding; non-positive returns fall back to arithmetic mean * 12.")
print("- Hit rate measures the fraction of months where excess return > 0.")
print("- Counts show the number of monthly observations per cluster buying quintile for each horizon.")

Dropped 196492 rows due to missing excess returns.
Performance by cluster buying quintile:
|    |   cluster_quintile |   ann_excess_1m |   ann_excess_3m |   ann_excess_12m |   hit_rate_1m |   hit_rate_3m |   hit_rate_2m |   count_1m |   count_3m |   count_12m |
|---:|-------------------:|----------------:|----------------:|-----------------:|--------------:|--------------:|--------------:|-----------:|-----------:|------------:|
|  0 |                  0 |       0.0279873 |       0.0839434 |        0.30845   |      0.491308 |      0.483676 |      0.463382 |        301 |        301 |         298 |
|  1 |                  1 |       0.0572494 |       0.090943  |        0.395064  |      0.483788 |      0.475536 |      0.457609 |        230 |        230 |         227 |
|  2 |                  2 |      -0.0265636 |       0.0236758 |       -0.0854162 |      0.472915 |      0.488867 |      0.473912 |        226 |        226 |         223 |
|  3 |                  3 |       0.0259867 |      -0.

## 5.2: Performance Analysis of Cluster Selling

In [24]:
# Filter to rows with at least one non-null excess return (more permissive than all three)
valid_cluster_sell = unique_sell.dropna(subset=[f'excess_return_{h}m' for h in ['1','3','12']], how='all').copy()

print(f"Dropped {len(merged) - len(valid_cluster_sell)} rows due to missing excess returns.")

# Compute monthly average excess returns by cluster selling quintile
monthly_cluster_sell = valid_cluster_sell.groupby(['month_end', 'cluster_sell_quintile'])[
    [f'excess_return_{h}m' for h in ['1', '3', '12']]
].mean().reset_index()

# Calculate annualized excess returns per cluster selling quintile
agg_cluster_sell = monthly_cluster_sell.groupby('cluster_sell_quintile').agg(**{
    f'ann_excess_{h}m': (f'excess_return_{h}m', annualize) for h in ['1', '3', '12']
}).reset_index()

# Count non-null monthly observations per cluster selling quintile for each horizon
counts_cluster_sell = monthly_cluster_sell.groupby('cluster_sell_quintile').agg(**{
    f'count_{h}m': (f'excess_return_{h}m', 'count') for h in ['1', '3', '12']
}).reset_index()

# Calculate hit rate: proportion of months where excess return > 0 (positive performance)
hit_cluster_sell = valid_cluster_sell.groupby('month_end').apply(
    lambda df: df.assign(**{f'beat_mean_{h}m': df[f'excess_return_{h}m'] > 0 for h in ['1', '3', '12']}),
    include_groups=False
).groupby('cluster_sell_quintile')[
    [f'beat_mean_{h}m' for h in ['1', '3', '12']]
].mean().reset_index().rename(
    columns=lambda c: f'hit_rate_{c[-2:]}' if 'beat' in c else c
)

# Combine all performance metrics into one DataFrame
performance_cluster_sell = (
    agg_cluster_sell
    .merge(hit_cluster_sell, on='cluster_sell_quintile')
    .merge(counts_cluster_sell, on='cluster_sell_quintile')
    .sort_values('cluster_sell_quintile')
)

print("Performance by cluster selling quintile:")
print(performance_cluster_sell.to_markdown(index=False))

print("\nNotes:")
print("- Annualized returns use geometric compounding; non-positive returns fall back to arithmetic mean * 12.")
print("- Hit rate measures the fraction of months where excess return > 0.")
print("- Counts show the number of monthly observations per cluster selling quintile for each horizon.")


Dropped 196492 rows due to missing excess returns.
Performance by cluster selling quintile:
|   cluster_sell_quintile |   ann_excess_1m |   ann_excess_3m |   ann_excess_12m |   hit_rate_1m |   hit_rate_3m |   hit_rate_2m |   count_1m |   count_3m |   count_12m |
|------------------------:|----------------:|----------------:|-----------------:|--------------:|--------------:|--------------:|-----------:|-----------:|------------:|
|                       0 |      0.0685431  |      0.155863   |        0.566196  |      0.481856 |      0.476842 |      0.461046 |        301 |        301 |         298 |
|                       1 |      0.00892506 |      0.00893337 |        0.0923987 |      0.489808 |      0.482416 |      0.45983  |        227 |        227 |         224 |
|                       2 |      0.00370978 |      0.0272323  |        0.207138  |      0.492205 |      0.489394 |      0.483772 |        222 |        222 |         219 |
|                       3 |     -0.0297065  |     -0.