# Pairs Trading through Unsupervised Learning

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Register tqdm with pandas so the `progress_*` variants (e.g. `progress_transform`,
# used in the preprocessing cell) are available on pandas objects.
tqdm.pandas()

# --- Data locations ---------------------------------------------------------
# Raw CSV export plus the parquet files referenced by the notebook.
FILEPATH = "./data/historic_characteristics.csv"
FILEPATH_PARQ = "./data/historic_characteristics.parquet"
FILEPATH_MOM_PARQ = "./data/data_mom.parquet"
FILEPATH_CLEAN_PARQ = "./data/data_cleaning.parquet"
FILEPATH_PRE_PARQ = "./data/data_preprocessed.parquet"

# --- Sample selection -------------------------------------------------------
# Inclusive calendar-year bounds applied when converting the raw CSV.
MIN_YEAR = 1980
MAX_YEAR = 2021
# NOTE(review): CHUNKS is not referenced in the visible cells — confirm whether it is still needed.
CHUNKS = 10000

# --- Columns kept from the raw file -----------------------------------------
# Date, firm identifier (permno), industry code (sic2) and the firm characteristics.
FEATURES = [
    "DATE", "absacc", "acc", "aeavol", "age", "agr", "baspread", "beta",
    "betasq", "bm", "bm_ia", "cash", "cashdebt", "cashpr", "cfp", "cfp_ia",
    "chatoia", "chcsho", "chempia", "chinv", "chmom", "chpmia", "chtx", "cinvest",
    "convind", "currat", "depr", "divi", "divo", "dolvol", "dy", "ear",
    "egr", "ep", "gma", "herf", "hire", "idiovol", "ill", "indmom",
    "invest", "lev", "lgr", "maxret", "mom1m", "ms", "mve_ia", "mvel1",
    "nincr", "operprof", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale",
    "pchquick", "pchsale_pchrect",
    "pctacc", "permno", "pricedelay", "ps", "quick", "rd", "retvol", "roaq",
    "roeq", "roic", "rsup", "salecash", "salerec", "securedind", "sgr", "sic2",
    "sin", "sp", "std_dolvol", "std_turn", "tang", "tb", "turn", "zerotrade",
]

# --- Momentum features ------------------------------------------------------
# WINDOW is both the rolling-window length (in rows) used during preprocessing
# and the longest momentum horizon; MOM_FEATURES names mom1m .. mom{WINDOW}m.
WINDOW = 48
MOM_FEATURES = [f"mom{month}m" for month in range(1, WINDOW + 1)]

# Data Wrangling

## Parquet Dataset Creation

The dataset is large: around 3GB of company characteristics from 1985 to 2021. It was curated for the papers ["Empirical Asset Pricing via Machine Learning"](https://dachxiu.chicagobooth.edu/download/ML.pdf) (2018) and ["Autoencoder Asset Pricing Models"](https://www.sciencedirect.com/science/article/abs/pii/S0304407620301998) (2019) by Shihao Gu, Bryan Kelly and Dacheng Xiu. The raw data is available for download from the authors' [personal website](https://sites.google.com/view/agoyal145) (or reach out to me for a curated dataset). The dataset has 94 one-month-lagged firm characteristics (CRSP releases these with a one-month delay, according to the notes in their papers). Note that these CRSP datasets don't have tickers or company names; they use a permanent identifier (PERMNO) instead, which you can easily convert to the company ticker if you have a Bloomberg terminal or access to a research site that brokers this data.

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd

# Set CONVERT to True to (re)build the parquet file from the raw CSV;
# otherwise an existing parquet file is loaded.
CONVERT = False
if CONVERT:
    # In case its in CSV, convert it to parquet for easy processing.
    # `usecols` makes the parser skip every column outside FEATURES, so the
    # unused columns of the large raw CSV are never materialised in memory.
    # The trailing `[FEATURES]` keeps the column order of FEATURES, since
    # `usecols` alone returns columns in file order.
    chars_df = pd.read_csv(FILEPATH, usecols=FEATURES)[FEATURES]
    chars_df['DATE'] = pd.to_datetime(chars_df['DATE'], format='%Y%m%d')
    # Keep rows whose year lies in [MIN_YEAR, MAX_YEAR] (`between` is inclusive).
    in_sample = chars_df['DATE'].dt.year.between(MIN_YEAR, MAX_YEAR)
    chars_df = chars_df[in_sample].sort_values("DATE")
    chars_df.to_parquet(FILEPATH_PARQ, index=False, compression="snappy")
else:
    chars_df = pd.read_parquet(FILEPATH_PRE_PARQ)  # use FILEPATH_PARQ without preproc or FILEPATH_PRE_PARQ

chars_df.head(5)

## Sanitization and Feature Engineering

To sanitize the data, we drop any company with insufficient data to fill a window, and we fill any missing characteristic with the median of that window. We then use rolling windows to compute the momentum (MOM) factor for horizons of 2 to `WINDOW` (48) months; the data already contains the rolling 1-month momentum (`mom1m`). To read about momentum strategies, check out the article [Momentum and Reversion Trading Signals Analysis](https://medium.com/call-for-atlas/momentum-and-reversion-the-poor-mans-trading-strategies-9b8e1e6d3496).


In [None]:
def interpolate_with_median(group):
    """Fill missing values in ``group`` using a trailing rolling median.

    Each missing entry is replaced by the median of the non-missing values in
    the trailing window of up to ``WINDOW`` rows ending at that entry
    (``min_periods=1``, so a single observation is enough). Entries that are
    still missing afterwards are back-filled from the next available value.

    Args:
        group: pandas Series or DataFrame holding one group's values.

    Returns:
        A filled object of the same kind as ``group``; the input is not
        modified.
    """
    trailing_median = group.rolling(window=WINDOW, min_periods=1).median()
    filled = group.fillna(trailing_median)
    return filled.bfill()


# Set PRE_PROC to True to rebuild the preprocessed parquet file from `chars_df`;
# when False, `chars_df` is used exactly as loaded above.
PRE_PROC = False
if PRE_PROC:
    def _has_enough_history(firm):
        # A firm is kept when it has at least WINDOW rows and at most two
        # missing 1-month momentum observations.
        return len(firm) >= WINDOW and firm['mom1m'].isna().sum() <= 2

    valid_groups = chars_df.groupby('permno').filter(_has_enough_history)

    # mom{i}m compounds the 1-month momentum over a trailing window of i rows
    # per firm: prod(1 + mom1m) - 1.
    for i in tqdm(range(2, WINDOW + 1), desc="moms"):
        def rolling_func(x, months=i):
            compounded = (x + 1).rolling(window=months).apply(np.prod, raw=True)
            return compounded - 1

        per_firm_mom1m = valid_groups.groupby('permno')['mom1m']
        valid_groups[f'mom{i}m'] = per_firm_mom1m.transform(rolling_func)

    # Fill the gaps in every float64/int64 column, firm by firm.
    numerical_columns = valid_groups.select_dtypes(include=['float64', 'int64']).columns

    tqdm.pandas(desc="interpolate_with_median")
    per_firm_numeric = valid_groups.groupby('permno')[numerical_columns]
    valid_groups[numerical_columns] = per_firm_numeric.progress_transform(interpolate_with_median)

    # Persist the result so later runs can load it directly.
    valid_groups.to_parquet(FILEPATH_PRE_PARQ, index=False, compression="snappy")
    chars_df = valid_groups

chars_df.tail(5)

## Firm Characteristics

| Acronym  | Firm characteristic                                           | Acronym    | Firm characteristic                                       |
|----------|--------------------------------------------------------------|------------|----------------------------------------------------------|
| absacc   | Absolute accruals                                            | invest     | Capital expenditures and inventory                        |
| acc      | Working capital accruals                                     | IPO        | New equity issue                                          |
| aeavol   | Abnormal earnings announcement volume                        | lev        | Leverage                                                  |
| age      | # years since first Compustat coverage                       | lgr        | Growth in long-term debt                                  |
| agr      | Asset growth                                                  | maxret     | Maximum daily return                                      |
| baspread | Bid-ask spread                                               | ms         | Financial statement score                                 |
| beta     | Beta                                                         | mve        | Size                                                      |
| betasq   | Beta squared                                                 | mve ia     | Industry-adjusted size                                    |
| bm       | Book-to-market                                               | nincr      | Number of earnings increases                              |
| bm ia    | Industry-adjusted book to market                             | operprof   | Operating profitability                                   |
| cash     | Cash holdings                                                | pchcapx ia | Industry adjusted % change in capital expenditures        |
| cashdebt | Cash flow to debt                                            | pchcurrat  | % change in current ratio                                 |
| cashpr   | Cash productivity                                            | pchdepr    | % change in depreciation                                  |
| cfp      | Cash flow to price ratio                                     | pchgm      | % change in gross margin                                  |
| cfp ia   | Industry-adjusted cash flow to price ratio                   | pchsale    | % change in sales                                         |
| chatoia  | Industry-adjusted change in asset turnover                   | pchquick   | % change in quick ratio                                   |
| chcsho   | Change in shares outstanding                                 | pctacc     | Percent accruals                                          |
| chempia  | Industry-adjusted change in employees                        | pricedelay | Price delay                                               |
| chinv    | Change in inventory                                          | ps         | Financial statements score                                |
| chmom    | Change in 6-month momentum                                   | quick      | Quick ratio                                               |
| chpmia   | Industry-adjusted change in profit margin                    | rd         | R&D increase                                              |
| chtx     | Change in tax expense                                        | retvol     | Return volatility                                         |
| cinvest  | Corporate investment                                         | roaq       | Return on assets                                          |
| convind  | Convertible debt indicator                                   | roeq       | Return on equity                                          |
| currat   | Current ratio                                                | roic       | Return on invested capital                                |
| depr     | Depreciation / PP&E                                          | rsup       | Revenue surprise                                          |
| divi     | Dividend initiation                                          | sgr        | Sales growth                                              |
| divo     | Dividend omission                                            | sin        | Sin stocks                                                |
| dolvol   | Dollar trading volume                                        | SP         | Sales to price                                            |
| dy       | Dividend to price                                            | std dolvol | Volatility of liquidity (dollar trading volume)          |
| ear      | Earnings announcement return                                 | std turn   | Volatility of liquidity (share turnover)                  |
| egr      | Growth in common shareholder equity                          | sue        | Unexpected quarterly earnings                             |
| ep       | Earnings to price                                            | tang       | Debt capacity/firm tangibility                            |
| gma      | Gross profitability                                          | tb         | Tax income to book income                                 |
| herf     | Industry sales concentration                                 | turn       | Share turnover                                            |
| hire     | Employee growth rate                                         | zerotrade  | Zero trading days                                         |
| idiovol  | Idiosyncratic return volatility                              |            |                                                           |
| ill      | Illiquidity                                                  |            |                                                           |
| indmom   | Industry momentum                                            |            |                                                           |


Additionally, there is a PERMNO column to identify the company, and a **SIC code** to identify the industry (see [NAICS](https://www.naics.com/sic-codes-industry-drilldown/)), which complements the industry momentum characteristic **INDMOM**.


## Dimensionality Reduction with PCA

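We standardize the features (zero mean, unit variance) and then apply PCA, keeping enough components to explain 95% of the variance. This puts the characteristics on a common scale for the clustering algorithms and reduces the data's dimensionality.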

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Work on a copy so `chars_df` itself is left untouched by this cell.
chars_pca_df = chars_df.copy()

scaler = StandardScaler()
# A float in (0, 1) asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 95%).
pca = PCA(0.95)

# Everything except the date and the firm identifier is treated as a feature.
# Remaining gaps are back-filled in row order; the extra forward-fill covers
# trailing NaNs that a back-fill cannot reach and that PCA would reject with
# a ValueError. When nothing is missing after the back-fill it is a no-op.
# NOTE(review): both fills run over the whole frame, so values can be borrowed
# from neighbouring rows of other firms — confirm this is acceptable.
features_df = chars_pca_df.drop(['DATE', 'permno'], axis=1).bfill().ffill()

pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
pca_df = pipeline.fit_transform(features_df)

pca_result_df = pd.DataFrame(data=pca_df, index=chars_pca_df.index)
# Capture the component column labels before the identifier column is added,
# so later cells can select only the PCA components.
pca_components_cols = pca_result_df.columns
pca_result_df['permno'] = chars_pca_df['permno']
# Index by date so the frame can be grouped by calendar period later on.
pca_result_df.index = chars_pca_df['DATE']
pca_result_df.tail(1)

# Agglomerative Clustering

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import AgglomerativeClustering

# Cluster the PCA-reduced observations separately for each calendar quarter.
# `models_dfs` collects one summary row per quarter, `cluster_membership`
# one (permno, Cluster) frame per quarter, indexed by date.
models_dfs = []
cluster_membership = []
# NOTE(review): pandas >= 2.2 deprecates the 'Q' alias in favour of 'QE';
# left unchanged here to stay compatible with older pandas versions.
for quarter, data in tqdm(pca_result_df.groupby(pd.Grouper(freq='Q')), desc="Processing quarters"):
    # Use only the PCA components for nearest neighbors and clustering
    pca_data = data[pca_components_cols]
    if len(pca_data) < 2:
        print(f"Skipping {quarter} due to insufficient data.")
        continue

    # Two neighbours are requested because the points are queried against the
    # set they were fitted on: column 0 is normally the point itself, column 1
    # the closest other observation.
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs = neigh.fit(pca_data)
    distances, indices = nbrs.kneighbors(pca_data)

    # The 30th percentile of the nearest-neighbour distances is used as the
    # linkage distance above which clusters are no longer merged.
    # (No sorting needed: np.percentile does not depend on input order.)
    distances = distances[:, 1]
    distance_alpha_thresh = np.percentile(distances, 30)
    agg_model = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_alpha_thresh,  linkage='average')
    agg_model.fit(pca_data)

    # Build the membership frame with `assign` instead of writing a new column
    # into `data`, which is a slice handed out by groupby (chained assignment).
    cluster_membership.append(data[['permno']].assign(Cluster=agg_model.labels_))

    models_dfs.append({'Quarter': f"{quarter.year}-Q{quarter.quarter}", 'NumClusters': agg_model.n_clusters_})

# Explicit columns keep the summary well-formed even if every quarter was skipped.
models_df = pd.DataFrame(models_dfs, columns=['Quarter', 'NumClusters'])
if cluster_membership:
    cluster_membership_df = pd.concat(cluster_membership)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly
    # shaped frame instead of crashing.
    cluster_membership_df = pd.DataFrame(columns=['permno', 'Cluster'])

cluster_membership_df

In [None]:
# Display the per-quarter summary (quarter label and number of clusters found).
models_df