In [1]:
# My imports

import numpy as np
import pandas as pd
import warnings

# Silence every warning raised after this point (action 'ignore' with no category,
# module or message restriction).
# NOTE(review): the filter is process-wide, so warnings emitted by pandas in later
# cells are hidden as well; consider narrowing it -- TODO confirm it is still needed.
warnings.filterwarnings('ignore')

### Data Preparation

In [4]:
# Load 94 firm characteristics dataset
#
# Reads the characteristics CSV, moves every DATE to the last day of its month,
# keeps the 1957-01 .. 2016-12 sample and removes the columns that are not used
# later on.

data_ch = pd.read_csv('../GKX/GKX_20201231.csv')

# DATE is stored as yyyymmdd; MonthEnd(0) rolls a date forward to its month end and
# leaves dates that already are month ends unchanged.
parsed_dates = pd.to_datetime(data_ch['DATE'], format='%Y%m%d')
data_ch['DATE'] = parsed_dates + pd.offsets.MonthEnd(0)

# Both bounds are inclusive.
data_ch = data_ch[data_ch['DATE'].between('1957-01-31', '2016-12-31')]

# `cols` is the full column list, `cols_new` the list without the excluded columns.
unused_columns = ['permno', 'prc', 'SHROUT', 'mve0']
cols = data_ch.columns.tolist()
cols_new = [column for column in cols if column not in unused_columns]
data_ch = data_ch[cols_new]
data_ch

In [None]:
# Construct dummy variables
#
# Rows without a 'sic2' code are discarded, then 'sic2' is replaced by one indicator
# column per distinct code (columns are prefixed with 'dum_').

# Number of rows that are about to be discarded.
print(data_ch['sic2'].isnull().sum())

has_sic2 = data_ch['sic2'].notnull()
data_ch = data_ch[has_sic2].reset_index(drop=True)

dummies = pd.get_dummies(data_ch['sic2'], prefix='dum_')
data_ch = pd.concat([data_ch.drop(columns='sic2'), dummies], axis=1)
print(data_ch.shape)

In [None]:
# Fill in missing characteristics
#
# Two passes over the characteristic columns `chas`:
#   1. a missing value is replaced by the median of that characteristic across all
#      rows sharing the same DATE;
#   2. whatever is still missing (the characteristic is missing for every row of
#      that DATE) is replaced by the median of the characteristic over all rows.
# The number of missing cells is printed before and after each pass.

chas = [x for x in cols_new if x not in ['DATE', 'RET', 'sic2']]
print('Total number of missing characteristics: %d' % (data_ch[chas].isnull().sum().sum()))

# One grouped median for all characteristics at once instead of one groupby per
# column; transform() returns a frame aligned row by row with data_ch, so fillna()
# only touches the missing cells.
monthly_medians = data_ch.groupby('DATE')[chas].transform('median')
data_ch[chas] = data_ch[chas].fillna(monthly_medians)
del monthly_medians  # same size as data_ch[chas]; release it right away
print('Total number of missing characteristics: %d' % (data_ch[chas].isnull().sum().sum()))

# NOTE(review): this pooled median is computed over the whole loaded sample, i.e.
# before the train/validation/test split further down, so it uses values from the
# later periods -- confirm this is intended.
data_ch[chas] = data_ch[chas].fillna(data_ch[chas].median())
print('Total number of missing characteristics: %d' % (data_ch[chas].isnull().sum().sum()))

In [None]:
# Load 8 macroeconomic predictors
#
# Reads the monthly predictor file, converts 'yyyymm' to a month-end timestamp and
# keeps the 1957-01 .. 2016-12 rows with a fresh 0..n-1 index.

data_ma = pd.read_csv('PredictorData2023.csv')

# 'yyyymm' parses to the first day of the month; MonthEnd(0) rolls it to the last day.
month_start = pd.to_datetime(data_ma['yyyymm'], format='%Y%m')
data_ma['yyyymm'] = month_start + pd.offsets.MonthEnd(0)

# Both bounds are inclusive.
in_sample = data_ma['yyyymm'].between('1957-01-31', '2016-12-31')
data_ma = data_ma[in_sample].reset_index(drop=True)
data_ma

In [None]:
# Construct 8 macroeconomic predictors
#
#   dp   = log(D12 / Index)      ep   = log(E12 / Index)
#   bm   = column 'b/m'          tms  = lty - tbl
#   dfy  = BAA - AAA             ntis, tbl, svar are taken as they are
# data_ma is reduced to 'yyyymm' plus these eight columns, so this cell has to run
# right after the cell that loads data_ma (the raw columns are gone afterwards).

ma_predictors = ['dp', 'ep', 'bm', 'ntis', 'tbl', 'tms', 'dfy', 'svar']

# 'Index' may contain thousands separators (e.g. '1,234.56'). Casting to str first
# keeps this line working when pandas has already parsed the column as numeric, in
# which case the .str accessor would otherwise raise AttributeError.
index_level = data_ma['Index'].astype(str).str.replace(',', '', regex=False).astype('float64')

# Build the result without mutating the loaded frame in place.
data_ma = data_ma.rename(columns={'b/m': 'bm'}).assign(
    dp=np.log(data_ma['D12'] / index_level),
    ep=np.log(data_ma['E12'] / index_level),
    tms=data_ma['lty'] - data_ma['tbl'],
    dfy=data_ma['BAA'] - data_ma['AAA'],
)[['yyyymm'] + ma_predictors]
data_ma

In [None]:
# Construct the dataset including all covariates
#
# Every characteristic in `chas` is multiplied by every macro predictor of the same
# month, adding len(chas) * len(ma_predictors) columns named
# '<characteristic>_<predictor>' (characteristic-major order). `data` is the
# resulting frame; `data_ch` refers to the same object.
# NOTE(review): macro values are matched on the same month-end as DATE -- confirm
# whether the predictors are meant to be lagged relative to RET.

# Broadcast the monthly macro table onto the firm-month rows. `validate` makes the
# merge fail loudly if a month occurs more than once in data_ma (that would silently
# duplicate rows). A left many-to-one merge keeps the left row order, so re-using
# data_ch's index aligns the products below row by row even when data_ch does not
# carry a 0..n-1 index.
data_ma_long = pd.merge(data_ch[['DATE']], data_ma, left_on='DATE', right_on='yyyymm',
                        how='left', validate='many_to_one').drop('yyyymm', axis=1)
data_ma_long.index = data_ch.index

# Build the interactions in blocks (one block per characteristic) and attach them
# with a single concat: inserting the columns one at a time leaves a heavily
# fragmented frame. The blocks and the result coexist for a moment, so peak memory
# is roughly twice the size of the interaction columns.
macro = data_ma_long[ma_predictors]
interaction_blocks = []
for cha in chas:
    block = macro.mul(data_ch[cha], axis=0)
    block.columns = [cha + '_' + predictor for predictor in ma_predictors]
    interaction_blocks.append(block)
interaction_names = [name for block in interaction_blocks for name in block.columns]

# Dropping interaction columns left over from a previous run keeps the cell
# re-runnable without creating duplicate column names.
data_ch = pd.concat(
    [data_ch.drop(columns=interaction_names, errors='ignore')] + interaction_blocks,
    axis=1,
)
del interaction_blocks, macro

data = data_ch
data

In [None]:
# Split the dataset
# Training set
#
# `covariates` is every column of `data` except 'RET' (so 'DATE' stays in the
# feature frame); the target frame holds 'DATE' and 'RET'.

covariates = [column for column in data.columns if column != 'RET']

# Inclusive date window of the training sample.
train_str, train_end = '1957-01-31', '1974-12-31'
data_train = data[data['DATE'].between(train_str, train_end)]
X_train = data_train[covariates]
y_train = data_train[['DATE', 'RET']]
print(X_train.shape)
print(y_train.shape)

In [None]:
# Validation set
#
# Same layout as the training frames: features are the `covariates` columns, the
# target frame holds 'DATE' and 'RET'.

# Inclusive date window of the validation sample.
val_str, val_end = '1975-01-31', '1986-12-31'
data_val = data[data['DATE'].between(val_str, val_end)]
X_val = data_val[covariates]
y_val = data_val[['DATE', 'RET']]
print(X_val.shape)
print(y_val.shape)

In [None]:
# Test set
#
# Same layout as the training frames: features are the `covariates` columns, the
# target frame holds 'DATE' and 'RET'.

# Inclusive date window of the test sample.
test_str, test_end = '1987-01-31', '2016-12-31'
data_test = data[data['DATE'].between(test_str, test_end)]
X_test = data_test[covariates]
y_test = data_test[['DATE', 'RET']]
print(X_test.shape)
print(y_test.shape)