In [1]:
# === SETUP === #

#!pip install db-sqlite3
import sqlite3
import pandas as pd
import numpy as np
import polars as pl
import os
import zipfile
from google.colab import drive
import requests
from sqlalchemy import create_engine

In [2]:
drive.mount('/content/drive')

# All project paths hang off one folder on the mounted Drive.
folder = '/content/drive/MyDrive/Quantitative Investment Portfolio/'
data_dir = folder + 'Data/'
andrew_dir = 'Data Release 2024.10/'
# Zipped wide-format predictor file inside the data release folder.
zip_file = os.path.join(
    folder,
    andrew_dir,
    'Firm Level Characteristics/Full Sets/signed_predictors_dl_wide.zip',
)

# Scratch location on the Colab VM that receives the extracted CSV.
temp_csv = 'signed_predictors_dl_wide.csv'
temp_dir = '/content/temp_data/'
os.makedirs(temp_dir, exist_ok=True)

Mounted at /content/drive


In [3]:
# === CHOOSE FIRM FACTORS === #

# Promote the first row of the sheet to column labels, then drop that row.
select1 = pd.read_excel(data_dir + 'SignalDoc.xlsx')
select1.columns = select1.iloc[0]
select1 = select1.iloc[1:]

# Keep the acronyms of every signal categorised as a 'Predictor'.
is_predictor = select1['Cat.Signal'] == 'Predictor'
columns_to_keep = select1.loc[is_predictor, 'Acronym']

In [4]:
# Hand-picked signals: a headerless one-column CSV of acronyms.
select2 = pd.read_csv(data_dir + 'selectedsigs.csv', header=None, names=['Acronym'])

# Union of both acronym sources, minus 'Price' and 'Size', de-duplicated and
# sorted, with the two key columns placed first.
all_acronyms = pd.concat([columns_to_keep, select2['Acronym']])
is_excluded = all_acronyms.isin(['Price', 'Size'])
signal_names = sorted(all_acronyms[~is_excluded].unique())
columns_to_keep = ['permno', 'yyyymm', *signal_names]

In [9]:
# Final selection: replaces the list assembled above with the two key columns
# followed by only the acronyms read from selectedsigs.csv (in file order).
columns_to_keep = ['permno', 'yyyymm', *select2['Acronym'].tolist()]

In [10]:
# === Extract the file from Andrew Chen's Drive ===#
# Pull only the single CSV member out of the archive into the scratch folder.
with zipfile.ZipFile(zip_file, mode='r') as archive:
    archive.extract(member=temp_csv, path=temp_dir)

# Location of the extracted CSV, used by the scan below and by the final cleanup.
csv_path = os.path.join(temp_dir, temp_csv)

In [11]:
# Build a lazy frame over the extracted CSV; the schema is inferred from the
# first 1000 rows.
df_lazy = pl.scan_csv(
    csv_path,
    has_header=True,
    infer_schema_length=1000,
)

In [12]:
# Project the lazy frame down to the chosen columns; an empty selection leaves
# the frame untouched.
df_lazy = df_lazy.select(columns_to_keep) if columns_to_keep else df_lazy

In [13]:
# Calculate the share of missing values in each column, as a percentage of rows.
## Schema fix first: cast every column (including the permno/yyyymm keys) to
## Float64 so that all columns share one dtype; the keys are cast back to Int64
## after the frame is collected.
df_lazy = df_lazy.cast(pl.Float64)
null_counts = df_lazy.null_count().collect()
row_count = df_lazy.select(pl.len()).collect().item()
# NOTE: despite its name, `null_counts` holds percentages (0-100) from here on.
# The name is kept because the threshold cells below read it and compare the
# values against 90 and 60.
null_counts = (null_counts / row_count) * 100

# The label used to say "Null counts", which mis-described the percentages shown.
print("\nNull percentage for each column:")
print(null_counts)


Null counts for each column:
shape: (1, 39)
┌────────┬────────┬────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐
│ permno ┆ yyyymm ┆ AbnormalAc ┆ Accruals  ┆ … ┆ ShareIss1Y ┆ SmileSlope ┆ SurpriseRD ┆ DebtIssuan │
│ ---    ┆ ---    ┆ cruals     ┆ ---       ┆   ┆ ---        ┆ ---        ┆ ---        ┆ ce         │
│ f64    ┆ f64    ┆ ---        ┆ f64       ┆   ┆ f64        ┆ f64        ┆ f64        ┆ ---        │
│        ┆        ┆ f64        ┆           ┆   ┆            ┆            ┆            ┆ f64        │
╞════════╪════════╪════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡
│ 0.0    ┆ 0.0    ┆ 52.181356  ┆ 39.286237 ┆ … ┆ 34.307949  ┆ 84.637838  ┆ 71.337777  ┆ 49.073764  │
└────────┴────────┴────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘


In [14]:
# Columns where more than 90% of the rows are missing (null_counts is a
# one-row frame of percentages).
[name for name, pct in zip(null_counts.columns, null_counts.row(0)) if pct > 90]

['ChNAnalyst']

In [15]:
# Columns where more than 60% of the rows are missing (null_counts is a
# one-row frame of percentages).
[name for name, pct in zip(null_counts.columns, null_counts.row(0)) if pct > 60]

['ChNAnalyst',
 'CPVolSpread',
 'EarningsStreak',
 'FEPS',
 'fgr5yrLag',
 'NetDebtPrice',
 'OperProfRD',
 'OScore',
 'RD',
 'REV6',
 'RIVolSpread',
 'SmileSlope',
 'SurpriseRD']

In [16]:
# === NOTES ABOUT THE MISSING VARIABLES === #

##### This is a critical point that will strongly affect our analysis.
##### If the data is not missing at random we need to drop it, because imputation
##### will bias our results; but dropping will also eat up a lot of our data.
##### AgeIPO = the age of the firm when it recently had an IPO. Missing value if the IPO wasn't recent.
###### Could be imputed somehow, but that would require sound economic theory and I don't have the time to do it right now. DROP
##### AnalystValue -- Only firms with data for the June fiscal year-end are kept. Again, fixing this might take a while! But it is scaled, so maybe the missing values can be filled with 1? Data should be missing at random and the mean should be 1.xx due to analyst biases.
##### CPVolSpread -- If missing then missing - drop - No Impute!
##### ChForecastAccrual - First the data is sorted on accruals, then it is 1 if forecasted earnings are higher this month. I don't know why so much data is missing.
###### Perhaps due to its monthly nature? Try keeping it when training on annual data.
##### ChNAnalyst - I really like this one. By the way, the data is set to missing for 1987; I have no idea why.
###### Also it's only for small firms, but even then it should have been available for 20% of the data (it isn't). No Impute!
##### ConsRecomm - Poor data. Drop
##### DownRecomm - Binary (0,1). Will keep this; filling the missing values with the mean seems reasonable.
##### EarningsConsistency - IDK WHY MISSING! Drop
##### EarningsStreak -- It is the surprise in earning but only if there is a streak.
##### Supposedly better than Earning Surprise. Could create alternative version on this. Drop
##### ForecastDispersion - Scaled std of EPS forecasts. I am just filling it with either inf or 0 or something similar. Keep for now.
##### MomVol - Drop
##### NetDebtPrice - First of all, it's yearly. Second, it is only available for a few quintiles.
##### NetPayoutYield - Seems to be dropped when it's 0. Very annoying, since it is also missing for other reasons, but impute 0 and keep.
##### OScore - Painful. Binary variable: 1 if in the 10th Q, 0 if between 1 and 7. Can't be used as is; needs to be reconstructed.
##### RD - Very strange that this is missing. I think it's supposed to be 0. Perhaps it's an annual variable? Still needs to be investigated, but DROP.
##### REV6 - Shouldn't be missing that much. Can be filled with the mean, I guess.
##### RIVolSpread - Again shouldn't be missing as much. Maybe missing for small stocks ? drop
##### SmileSlope  -- Can't do anything.
##### SurpriseRD --- When RD is missing, SurpriseRD can't be non-missing.
##### UpRecomm -- Binary (0,1). Will keep this; filling the missing values with the mean seems reasonable.
##### dVolCall - EH drop
##### dVolPut - EH drop
##### fgr5yrLag - Sad, because it should have been good. Keep and fill with mean.

In [17]:
# Use this to filter columns with large missing values and make the code faster.

# cols = [col for col in null_counts.columns if null_counts[col][0]<60] + ['FEPS','DownRecomm','UpRecomm','ForecastDispersion','NetPayoutYield','fgr5yrLag']
# df_lazy = df_lazy.select(cols)

In [18]:
# Materialise the lazy query into an in-memory DataFrame and report its footprint.
df = df_lazy.collect()
n_rows, n_cols = df.shape
size_mb = df.estimated_size() / 1024**2

print("\nReduced dataset info:")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols:,}")
print(f"Memory usage: {size_mb:.2f} MB")


Reduced dataset info:
Rows: 5,273,600
Columns: 39
Memory usage: 1592.40 MB


In [23]:

# Restore the two key columns to Int64 (they were cast to Float64 along with
# every other column before collecting).
df = df.with_columns(pl.col(["permno", "yyyymm"]).cast(pl.Int64))

In [20]:
# === Merging with Returns Availibility === #

# Load only the key columns plus the return-side fields needed downstream.
key_cols = ['permno', 'yyyymm']
returns_path = os.path.join(data_dir, 'returns.parquet')
df_r = pl.read_parquet(returns_path).select([*key_cols, 'hsiccd', 'ret', 'lnP', 'lnsize'])

# One reproducible sample row from each table, to eyeball columns and dtypes.
print(df_r.sample(1, seed=10))
print(df.sample(1, seed=10))

# Number of rows in each table whose (permno, yyyymm) pair occurs more than once.
print('returns duplicates:', df_r.select(key_cols).is_duplicated().sum())
print('factor duplicates:', df.select(key_cols).is_duplicated().sum())

shape: (1, 6)
┌────────┬────────┬────────┬─────────┬──────────┬───────────┐
│ permno ┆ yyyymm ┆ hsiccd ┆ ret     ┆ lnP      ┆ lnsize    │
│ ---    ┆ ---    ┆ ---    ┆ ---     ┆ ---      ┆ ---       │
│ i64    ┆ i64    ┆ f64    ┆ f64     ┆ f64      ┆ f64       │
╞════════╪════════╪════════╪═════════╪══════════╪═══════════╡
│ 20841  ┆ 202107 ┆ 9999.0 ┆ -0.8073 ┆ 2.285439 ┆ 11.540179 │
└────────┴────────┴────────┴─────────┴──────────┴───────────┘
shape: (1, 39)
┌─────────┬──────────┬────────────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ permno  ┆ yyyymm   ┆ AbnormalAc ┆ Accruals ┆ … ┆ ShareIss1Y ┆ SmileSlope ┆ SurpriseR ┆ DebtIssua │
│ ---     ┆ ---      ┆ cruals     ┆ ---      ┆   ┆ ---        ┆ ---        ┆ D         ┆ nce       │
│ f64     ┆ f64      ┆ ---        ┆ f64      ┆   ┆ f64        ┆ f64        ┆ ---       ┆ ---       │
│         ┆          ┆ f64        ┆          ┆   ┆            ┆            ┆ f64       ┆ f64       │
╞═════════╪══════════╪══════════

In [24]:
# Left-join the factor columns onto the returns table by (permno, yyyymm):
# every row of df_r is kept, and factor columns are null where df has no match.
merged_df = df_r.join(df, how='left', on=['permno', 'yyyymm'])
print(merged_df.head(1))

# Row counts before and after the join.
print(f"Return Rows: {df_r.height:,}")
print(f"Factor Rows: {df.height:,}")
print(f"Merged Rows: {merged_df.height:,}")

shape: (1, 43)
┌────────┬────────┬────────┬─────────┬───┬────────────┬────────────┬────────────┬──────────────┐
│ permno ┆ yyyymm ┆ hsiccd ┆ ret     ┆ … ┆ ShareIss1Y ┆ SmileSlope ┆ SurpriseRD ┆ DebtIssuance │
│ ---    ┆ ---    ┆ ---    ┆ ---     ┆   ┆ ---        ┆ ---        ┆ ---        ┆ ---          │
│ i64    ┆ i64    ┆ f64    ┆ f64     ┆   ┆ f64        ┆ f64        ┆ f64        ┆ f64          │
╞════════╪════════╪════════╪═════════╪═══╪════════════╪════════════╪════════════╪══════════════╡
│ 10001  ┆ 199001 ┆ 4925.0 ┆ -1.8519 ┆ … ┆ -0.009073  ┆ null       ┆ null       ┆ -1.0         │
└────────┴────────┴────────┴─────────┴───┴────────────┴────────────┴────────────┴──────────────┘
Return Rows: 2,584,408
Factor Rows: 5,273,600
Merged Rows: 2,584,408


In [25]:
import gc

# Factor-only table, written as-is under data_dir.
df.write_parquet(os.path.join(data_dir, 'factors_raw.parquet'))

# Returns table with the factor columns joined on.
output_path = os.path.join(data_dir, 'factors.parquet')
merged_df.write_parquet(output_path)

# Remove the extracted CSV and its scratch directory (os.rmdir requires the
# directory to be empty), then drop the large frames and force a collection.
os.unlink(csv_path)
os.rmdir(temp_dir)
del df_lazy
del df
del df_r
gc.collect()

45

# Getting Firm Level Characteristics

This notebook prepares the data for the analysis. It pre-processes firm-level financial characteristics, measures how much of each characteristic is missing, and merges the characteristics with the stock-returns data.

**Data Sources:**

The analysis utilizes two primary data sources:

1. **Firm-Level Characteristics:**  A comprehensive dataset of firm-level financial characteristics, taken from Andrew Chen's files (`signed_predictors_dl_wide.zip`, Data Release 2024.10).
2. **Returns Data:**  A dataset containing historical stock returns, market capitalization, and other relevant information (`returns.parquet`), prepared from WRDS by the previous code.

**Data Cleaning and Preprocessing:**

The notebook performs the following steps:

1. **Data Extraction:** It extracts the zipped firm-level CSV file and scans it lazily with Polars, so that only the selected columns are loaded into memory. The columns to keep come from a pre-defined list (`selectedsigs.csv`) compiled on the basis of theoretical understanding.

2. **Handling Missing Data:** The notebook computes the percentage of missing values in each selected column and lists the columns that exceed the 90% and 60% thresholds. A notes cell records, variable by variable, why each heavily-missing characteristic should be dropped or imputed. The filter that would apply these decisions is currently commented out, so no columns are removed in this notebook.

3. **Data Merging:** The firm characteristics are left-joined onto the returns data on the firm identifier and month (`permno`, `yyyymm`), so the merged dataset contains exactly the firm-months present in the returns data.

4. **Data Output:** Both the raw factors (`factors_raw.parquet`) and the merged dataset (`factors.parquet`) are saved as Parquet files for future analysis, and the temporary extracted CSV is deleted.
