In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import MonthEnd
import time

DATA_DIR = "../data"

# returns_and_market_cap contains monthly returns and market cap data per company
returns_and_market_cap = pd.read_csv(f"{DATA_DIR}/returns_market_cap.csv")

# ndt contains non-derivative transaction data (e.g., insider trades)
ndt = pd.read_csv(f"{DATA_DIR}/ndt.csv")

print("Non-derivative transaction data:")
print(ndt.head().to_markdown(index=False))
print()
print("Returns and market cap data:")
print(returns_and_market_cap.head().to_markdown(index=False))

Non-derivative transaction data:
| ACCESSION_NUMBER     | TRANS_DATE   | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME     | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE         |
|:---------------------|:-------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:-----------------|:------------------------|:-----------------------|
| 0000076605-17-000121 | 2017-09-27   | S            |                      0 |          10000 |                 83.16 | D                        |          15000           | D                           |       295249 | Cleveland Todd M | Director,Officer        | CEO                    |
| 0001140361-17-037031 | 2017-09-27   | F            |                      0 |          28833 | 

## 1: Data Preparation and Cleaning

In [2]:
# Convert date columns to datetime for consistent handling
# MONTH_END in returns_and_market_cap is the end-of-month date for returns/market cap
returns_and_market_cap['MONTH_END'] = pd.to_datetime(returns_and_market_cap['MONTH_END'])

# TRANS_DATE in ndt is the transaction date; use errors='coerce' to handle invalid dates
ndt['TRANS_DATE'] = pd.to_datetime(ndt['TRANS_DATE'], errors='coerce')

# Create a month-end column in ndt by rounding transaction dates to the nearest month end
ndt['month_end'] = ndt['TRANS_DATE'] + MonthEnd(0)

# Check for and drop rows with invalid transaction dates (NaT)
invalid_dates = ndt['TRANS_DATE'].isna().sum()
if invalid_dates > 0:
    print(f"Warning: Dropped {invalid_dates} rows with invalid TRANS_DATE.")
    ndt = ndt.dropna(subset=['TRANS_DATE'])

# Filter ndt to include only buy ('P') and sell ('S') transactions
trades = ndt[ndt['TRANS_CODE'].isin(['P', 'S'])].copy()

# Collapse duplicate rows so each ACCESSION_NUMBER is one trade
trades = trades.groupby('ACCESSION_NUMBER', as_index=False).agg({
    'TRANS_DATE': 'first',
    'TRANS_CODE': 'first',
    'EQUITY_SWAP_INVOLVED': 'first',
    'TRANS_SHARES': 'sum',
    'TRANS_PRICEPERSHARE': 'first',
    'TRANS_ACQUIRED_DISP_CD': 'first',
    'SHRS_OWND_FOLWNG_TRANS': 'first',
    'DIRECT_INDIRECT_OWNERSHIP': 'first',
    'COMPANY_ID': 'first',
    'RPTOWNERNAME': 'first',
    'RPTOWNER_RELATIONSHIP': 'first',
    'RPTOWNER_TITLE': 'first',
    'month_end': 'first'
})


# Calculate the dollar value of each trade (shares * price per share)
trades['trade_value'] = trades['TRANS_SHARES'] * trades['TRANS_PRICEPERSHARE']

print("Sample of trades data after preparation:")
print(trades.head().to_markdown(index=False))
print()
print("Shape of trades data after preparation:", trades.shape)

Sample of trades data after preparation:
| ACCESSION_NUMBER     | TRANS_DATE          | TRANS_CODE   |   EQUITY_SWAP_INVOLVED |   TRANS_SHARES |   TRANS_PRICEPERSHARE | TRANS_ACQUIRED_DISP_CD   |   SHRS_OWND_FOLWNG_TRANS | DIRECT_INDIRECT_OWNERSHIP   |   COMPANY_ID | RPTOWNERNAME   | RPTOWNER_RELATIONSHIP   | RPTOWNER_TITLE                 | month_end           |      trade_value |
|:---------------------|:--------------------|:-------------|-----------------------:|---------------:|----------------------:|:-------------------------|-------------------------:|:----------------------------|-------------:|:---------------|:------------------------|:-------------------------------|:--------------------|-----------------:|
| 0000001750-06-000002 | 2006-01-04 00:00:00 | S            |                      0 |         144360 |                 24.64 | D                        |                  6876.17 | D                           |       168154 | STORCH DAVID P | Director,Officer        | C

## 2: Market Returns Calculation

In [3]:
# Compute total market capitalization per month for weighting returns
# Sum MARKET_CAP_USD across all companies for each MONTH_END
total_cap = returns_and_market_cap.groupby('MONTH_END')['MARKET_CAP_USD'].sum().rename('total_cap')

# Join total_cap back to returns_and_market_cap for weight calculation
returns_and_market_cap = returns_and_market_cap.join(total_cap, on='MONTH_END')

# Calculate each company's weight as its market cap divided by total market cap
returns_and_market_cap['weight'] = returns_and_market_cap['MARKET_CAP_USD'] / returns_and_market_cap['total_cap']

# Initialize a dictionary to store market returns for different horizons (1, 3, 12 months)
market_returns = {}

# Compute weighted market returns for each horizon
# For each month, multiply individual company returns by their weights and sum
for horizon in ['1', '3', '12']:
    market_returns[f'market_return_{horizon}m'] = returns_and_market_cap.groupby('MONTH_END').apply(
        lambda df: np.sum(df[f'RETURN_LEAD_{horizon}_MONTHS'] * df['weight']),
        include_groups=False  # Avoid including group keys in the apply function
    ).reset_index(name=f'market_return_{horizon}m')

for horizon in ['1', '3', '12']:
    print(f"Sample of market_return_{horizon}m market returns:")
    print(market_returns[f'market_return_{horizon}m'].head().to_markdown(index=False))

Sample of market_return_1m market returns:
| MONTH_END           |   market_return_1m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |          0.0886072 |
| 1962-02-28 00:00:00 |         -0.0697672 |
| 1962-03-31 00:00:00 |          0.0249994 |
| 1962-04-30 00:00:00 |         -0.414634  |
| 1962-05-31 00:00:00 |          0.0520832 |
Sample of market_return_3m market returns:
| MONTH_END           |   market_return_3m |
|:--------------------|-------------------:|
| 1962-01-31 00:00:00 |           0.037974 |
| 1962-02-28 00:00:00 |          -0.44186  |
| 1962-03-31 00:00:00 |          -0.36875  |
| 1962-04-30 00:00:00 |          -0.256097 |
| 1962-05-31 00:00:00 |           0.375    |
Sample of market_return_12m market returns:
| MONTH_END           |   market_return_12m |
|:--------------------|--------------------:|
| 1962-01-31 00:00:00 |           -0.319621 |
| 1962-02-28 00:00:00 |           -0.331395 |
| 1962-03-31 00:00:00 |           -0.234376 |
| 1962-04-

## 3: Merging Trades with Returns Data

In [4]:
# Merge trades with returns_and_market_cap to align trade data with returns and market cap
# Use left join to keep all trades, even if no matching returns data exists
merged = trades.merge(returns_and_market_cap, 
                      left_on=['COMPANY_ID', 'month_end'], 
                      right_on=['COMPANY_ID', 'MONTH_END'], 
                      how='left').drop(columns='MONTH_END')  # Drop redundant MONTH_END column

# Merge in the precomputed market returns for each horizon
for horizon, df in market_returns.items():
    merged = merged.merge(df, 
                          left_on='month_end', 
                          right_on='MONTH_END', 
                          how='left').drop(columns='MONTH_END')

# Calculate excess returns for each horizon (company return - market return)
for horizon in ['1', '3', '12']:
    merged[f'excess_return_{horizon}m'] = merged[f'RETURN_LEAD_{horizon}_MONTHS'] - merged[f'market_return_{horizon}m']

print("Sample of merged data with excess returns:")
print(merged[['COMPANY_ID', 'month_end', 'TRANS_CODE', 'excess_return_1m', 'excess_return_3m', 'excess_return_12m']].head().to_markdown(index=False))

Sample of merged data with excess returns:
|   COMPANY_ID | month_end           | TRANS_CODE   |   excess_return_1m |   excess_return_3m |   excess_return_12m |
|-------------:|:--------------------|:-------------|-------------------:|-------------------:|--------------------:|
|       168154 | 2006-01-31 00:00:00 | S            |          0.0584863 |          0.0745878 |           0.0870041 |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-03-31 00:00:00 | S            |         -0.0752978 |         -0.193422  |          -0.165043  |
|       168154 | 2006-04-30 00:00:00 | S            |         -0.0544755 |         -0.0779137 |          -0.0112499 |


## 4: Non-routine Transacitons and Quantile Assignemnt

In [12]:
import numpy as np
import pandas as pd

# Compute original (pre‑trade) shares
merged['original_shares'] = np.where(
    merged['TRANS_CODE'] == 'S',
    merged['SHRS_OWND_FOLWNG_TRANS'] + merged['TRANS_SHARES'],
    np.where(
        merged['TRANS_CODE'] == 'P',
        merged['SHRS_OWND_FOLWNG_TRANS'] - merged['TRANS_SHARES'],
        np.nan
    )
)

# Keep only trades where you actually held shares before the transaction
merged = merged.query("original_shares > 0").copy()

# Calculate pct bought/sold safely
merged['pct_holdings_sold'] = np.where(
    merged['TRANS_CODE']=='S',
    merged['TRANS_SHARES']/merged['original_shares'],
    np.nan
)

merged['pct_holdings_bought'] = np.where(
    merged['TRANS_CODE']=='P',
    merged['TRANS_SHARES']/merged['original_shares'],
    np.nan
)

# Function to assign quartile (or percentile if too few unique values)
# def quantile_or_rank(series):
#     nonnull = series.dropna()
#     if nonnull.nunique() >= 4:
#         return pd.qcut(series, 4, labels=False)
#     return series.rank(pct=True)

merged['quantile_holdings_sale'] = pd.qcut(merged['pct_holdings_sold'], 4, labels=False, duplicates='drop')
merged['quantile_holdings_buy']  = pd.qcut(merged['pct_holdings_bought'], 4, labels=False, duplicates='drop')

print("Original shares ≤0:", (merged['original_shares'] <= 0).sum())
print("Pct sold >100%:", (merged['pct_holdings_sold'] > 1).sum())
print("Pct bought >100%:", (merged['pct_holdings_bought'] > 1).sum())
print("\nQuantile distribution (sales):")
print(merged['quantile_holdings_sale'].value_counts(dropna=False))
print("\nQuantile distribution (buys):")
print(merged['quantile_holdings_buy'].value_counts(dropna=False))

Original shares ≤0: 0
Pct sold >100%: 0
Pct bought >100%: 8297

Quantile distribution (sales):
quantile_holdings_sale
NaN    74340
3.0    55622
1.0    55622
0.0    55622
2.0    55621
Name: count, dtype: int64

Quantile distribution (buys):
quantile_holdings_buy
NaN    222487
0.0     18590
3.0     18585
2.0     18585
1.0     18580
Name: count, dtype: int64


In [13]:
# ---------------------------
# Performance Analysis for Individual Factors
# ---------------------------
# Filter rows with at least one non-null excess return (for 1, 3, or 12 months)
excess_cols = [f'excess_return_{h}m' for h in ['1', '3', '12']]
valid_nr = merged.dropna(subset=excess_cols, how='all').copy()
print(f"Dropped {len(merged) - len(valid_nr)} rows due to missing excess returns.")

def annualize(series):
    series = series.dropna()
    if series.empty:
        return np.nan
    n = len(series)
    if n == 1:
        return series.mean() * 12

    log_cum = np.log1p(series).sum()
    with np.errstate(over='ignore'):
        result = np.exp(log_cum * 12/n) - 1
    return series.mean() * 12 if np.isinf(result) else result

# List both factor columns to evaluate (sales and buys)
factors = ['quantile_holdings_sale', 'quantile_holdings_buy']
horizons = ['1', '3', '12']

for factor in factors:
    grouped = valid_nr.groupby(factor)
    metrics_df = pd.DataFrame({factor: grouped.size().index})
    
    for h in horizons:
        col = f'excess_return_{h}m'
        avg_excess = grouped[col].mean().rename(f'avg_excess_{h}m')
        ann_excess = grouped[col].agg(annualize).rename(f'ann_excess_{h}m')
        count_obs  = grouped[col].count().rename(f'count_{h}m')
        hit_rate   = grouped[col].apply(lambda x: (x > 0).mean()).rename(f'hit_rate_{h}m')
        metrics_df = pd.concat([metrics_df, avg_excess, ann_excess, count_obs, hit_rate], axis=1)
    
    print(f"\nPerformance metrics for {factor}:")
    print(metrics_df.reset_index(drop=True).to_markdown(index=False))

print("\nNotes:")
print("- Annualized returns use geometric compounding when possible; otherwise, arithmetic mean * 12 is used.")
print("- Hit rate measures the proportion of observations with excess return > 0.")
print("- Counts show the number of observations for each return horizon.")

Dropped 3364 rows due to missing excess returns.

Performance metrics for quantile_holdings_sale:
|   quantile_holdings_sale |   avg_excess_1m |   ann_excess_1m |   count_1m |   hit_rate_1m |   avg_excess_3m |   ann_excess_3m |   count_3m |   hit_rate_3m |   avg_excess_12m |   ann_excess_12m |   count_12m |   hit_rate_12m |
|-------------------------:|----------------:|----------------:|-----------:|--------------:|----------------:|----------------:|-----------:|--------------:|-----------------:|-----------------:|------------:|---------------:|
|                        0 |     0.00227179  |      -0.0532656 |      55186 |      0.49585  |     0.00594013  |       -0.177564 |      55186 |      0.486138 |       0.0227138  |        -0.594463 |       54438 |       0.462164 |
|                        1 |     0.00123608  |      -0.0525074 |      55162 |      0.493274 |     0.00115361  |       -0.181217 |      55162 |      0.482869 |       0.0153028  |        -0.531624 |       54532 |       0

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
