In [None]:
# Core data-analysis and plotting stack used throughout the notebook.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()  # switch figures to seaborn's default theme
# Ask pandas to treat +/-inf as missing values in its NA-aware operations.
# NOTE(review): this option is deprecated in recent pandas releases — confirm the pinned pandas version.
pd.set_option('use_inf_as_na', True)
import warnings
# Silences every warning for the rest of the session, deprecation warnings included.
warnings.filterwarnings('ignore')
# Render and save figures at 300 dpi.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [None]:
# Load the raw crash dataset; the path is relative to the working directory.
crash_data = pd.read_csv('datasets/crash_data.csv')

In [None]:
# Preview the first rows of the raw frame.
crash_data.head()

In [None]:
# Parse the YYYYMMDD `date` column and make it the index so the frame can be
# resampled by time in the following cells.
crash_data.date = pd.to_datetime(crash_data.date, format='%Y%m%d')
crash_data = crash_data.set_index('date')

In [None]:
# Collapse each ticker's rows into weekly ('W') averages of the listed columns.
# The result is indexed by (TICKER, date).
crash_dataw = crash_data.groupby('TICKER').resample('W').\
              agg({'RET':'mean', 'vwretx':'mean', 'VOL':'mean',
                   'BIDLO':'mean', 'ASKHI':'mean', 'PRC':'mean'})

In [None]:
# Turn the (TICKER, date) index back into columns, discard weeks with any
# missing value, and collect the distinct tickers that the per-stock loops
# below iterate over.
crash_dataw = crash_dataw.reset_index()
crash_dataw.dropna(inplace=True)
stocks = crash_dataw.TICKER.unique()

In [None]:
# 2x2 grid of weekly-return histograms, one panel per ticker, for the first
# four tickers in `stocks`.
plt.figure(figsize=(12, 8))

for panel, ticker in enumerate(stocks[:4], start=1):
    plt.subplot(2, 2, panel)
    weekly_returns = crash_dataw.loc[crash_dataw.TICKER == ticker, 'RET']
    plt.hist(weekly_returns)
    plt.title('Histogram of ' + ticker)
plt.show()

## Firm-specific return

In [None]:
import statsmodels.api as sm

# Firm-specific weekly returns: for every ticker, regress its weekly return on
# the market return at t-2, t-1, t, t+1 and t+2 and keep the OLS residuals.
# Each lead/lag enters as its own regressor. Summing the five shifted arrays
# element-wise (as was done before) collapses them into a single regressor,
# which forces one shared beta and turns the constant column into 5.
residuals = []

for i in stocks:
    ticker_rows = crash_dataw.loc[crash_dataw['TICKER'] == i]
    Y = ticker_rows['RET'].values
    market = ticker_rows['vwretx'].values
    # Columns: market return at t-2, t-1, t, t+1, t+2, row-aligned with Y[2:-2].
    X = np.column_stack([market[0:-4], market[1:-3], market[2:-2],
                         market[3:-1], market[4:]])
    # Always add an intercept, even if a regressor happens to be constant.
    X = sm.add_constant(X, has_constant='add')
    ols = sm.OLS(Y[2:-2], X).fit()
    # One residual array per ticker, in `stocks` order; later cells rely on
    # that order when concatenating.
    residuals.append(ols.resid)

In [None]:
# Map every residual r to log(1 + r), keeping one array per ticker.
residuals = [np.log(1 + resid) for resid in residuals]

In [None]:
# Drop the first two and last two weekly rows of every ticker so the frame
# lines up with the regression residuals, which lose the same rows to the
# lead/lag terms.
# Built with a single pd.concat: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop re-copies the growing frame on every pass.
sliced_frames = [crash_dataw.loc[crash_dataw.TICKER == i][2:-2]
                 for i in stocks]
# pd.concat rejects an empty list, so fall back to an empty frame.
crash_data_sliced = pd.concat(sliced_frames) if sliced_frames \
                    else pd.DataFrame([])
crash_data_sliced.head()

## Elliptic Envelope 

In [None]:
from sklearn.covariance import EllipticEnvelope

# One shared estimator, refit on each ticker's residuals in turn. The
# predictions are stored per ticker; residuals[k] belongs to stocks[k].
envelope = EllipticEnvelope(contamination=0.02, support_fraction=1)
ee_predictions = {}

for position, ticker in enumerate(stocks):
    resid_column = np.array(residuals[position]).reshape(-1, 1)
    envelope.fit(resid_column)
    ee_predictions[ticker] = envelope.predict(resid_column)

In [None]:
# Flatten the per-ticker predictions into one list in `stocks` order,
# recoding 1 -> 0 and every other label -> -1.
transform = [np.where(label == 1, 0, -1)
             for ticker in stocks
             for label in ee_predictions[ticker]]

In [None]:
# Attach residuals and outlier flags to the sliced frame. Both arrays are
# matched to rows purely by position (tickers in `stocks` order).
crash_data_sliced = crash_data_sliced.reset_index()
crash_data_sliced['residuals'] = np.concatenate(residuals)
# 1 where the recoded envelope label is -1, otherwise 0.
crash_data_sliced['neg_outliers'] = np.where((np.array(transform)) == -1, 1, 0)
# Keep only downside anomalies: a flagged row with a positive residual is reset to 0.
crash_data_sliced.loc[(crash_data_sliced.neg_outliers == 1) &
                      (crash_data_sliced.residuals > 0),
                      'neg_outliers'] = 0

In [None]:
# Number of flagged (1) versus unflagged (0) rows.
crash_data_sliced['neg_outliers'].value_counts()

In [None]:
# Residual histograms for stocks[8:12], one panel per ticker; rows flagged as
# negative outliers are overlaid in black on top of the gray distribution.
plt.figure(figsize=(12, 8))

for panel, ticker in enumerate(stocks[8:12], start=1):
    plt.subplot(2, 2, panel)
    is_ticker = crash_data_sliced.TICKER == ticker
    crash_data_sliced['residuals'][is_ticker]\
        .hist(label='normal', bins=30, color='gray')
    is_flagged = crash_data_sliced.neg_outliers > 0
    outliers = crash_data_sliced['residuals'][is_ticker & is_flagged]
    outliers.hist(color='black', label='anomaly')
    plt.title(ticker)
    plt.legend()


In [None]:
# Index by date again (as a DatetimeIndex) so the frame can be resampled by year.
crash_data_sliced = crash_data_sliced.set_index('date')
crash_data_sliced.index = pd.to_datetime(crash_data_sliced.index)

In [None]:
# Weekly standard deviation of RET per ticker, computed from the
# date-indexed raw frame.
std = crash_data.groupby('TICKER')['RET'].resample('W').std()\
      .reset_index()
# The assignment aligns on the integer row labels of the two frames.
# NOTE(review): the `std` column is not read again in the visible part of the
# notebook — confirm it is still needed.
crash_dataw['std'] = pd.DataFrame(std['RET'])

In [None]:
# Mean and standard deviation of the weekly residuals per ticker and
# calendar year.
# The statistics are requested as an ordered list: a set literal has no
# guaranteed iteration order, so the positional rename below could silently
# swap the 'mean' and 'std' columns.
yearly_data = crash_data_sliced.groupby('TICKER')['residuals']\
              .resample('Y').agg(['mean', 'std'])\
              .reset_index()
yearly_data.columns = ['TICKER', 'date', 'mean', 'std']
yearly_data.head()

In [None]:
# Outer-join the weekly rows with the yearly statistics on (TICKER, date).
# Keys present in only one frame are kept, with NaN in the other frame's columns.
merge_crash = pd.merge(crash_data_sliced.reset_index(), yearly_data,
                       how='outer', on=['TICKER', 'date'])

In [None]:
# Spread the yearly statistics onto the weekly rows and fill the residual on
# the rows the outer merge added.
# Uses DataFrame.bfill / Series.ffill instead of the deprecated
# fillna(method=...) and sorts once instead of twice; results are unchanged.
merge_sorted = merge_crash.sort_values(by=['TICKER', 'date'])
# The last two columns are the merged-in yearly mean and std. Back-fill them
# in (TICKER, date) order; the assignment re-aligns on the row labels.
# NOTE(review): the fill is not grouped by TICKER, so a ticker whose trailing
# values are NaN would pick up the next ticker's numbers — confirm acceptable.
merge_crash[['annual_mean', 'annual_std']] = merge_sorted.iloc[:, -2:].bfill()
merge_crash['residuals'] = merge_sorted['residuals'].ffill()
# Drop the two original yearly columns, which now sit just before the
# annual_* copies.
merge_crash = merge_crash.drop(columns=merge_crash.columns[-4:-2])

In [None]:
# Flag a row as a crash week (1) when its residual lies more than 3.09 annual
# standard deviations below the annual mean, otherwise 0. Rows with a NaN on
# either side compare False and get 0.
# Computed in one vectorized pass in merge_crash's own row order. The nested
# loop it replaces re-filtered the frame several times per row and emitted
# flags in `stocks` order, which matched the frame only when its rows were
# already grouped that way.
crash_threshold = merge_crash['annual_mean'].values - \
                  3.09 * merge_crash['annual_std'].values
crash_risk_out = np.where(merge_crash['residuals'].values < crash_threshold,
                          1, 0).tolist()

In [None]:
# Store the flags positionally (one list entry per row) and show how many
# rows were flagged.
merge_crash['crash_risk'] = crash_risk_out
merge_crash['crash_risk'].value_counts()

In [None]:
# Number of flagged crash rows per ticker and year.
merge_crash = merge_crash.set_index('date')
merge_crash_annual = merge_crash.groupby('TICKER')\
                     .resample('1Y')['crash_risk'].sum().reset_index()

In [None]:
# 1 marks a row whose residual is below its annual mean, 0 any other row
# (including rows where the comparison involves NaN).
below_mean = merge_crash['residuals'].values < \
             merge_crash['annual_mean'].values
down = [1 if is_below else 0 for is_below in below_mean]

In [None]:
# Restore `date` as a column, store the down flag and its complement, and
# split the residuals into the down rows and all remaining ("up") rows.
merge_crash = merge_crash.reset_index()
merge_crash['down'] = pd.DataFrame(down)
merge_crash['up'] = 1 - merge_crash['down']
down_residuals = merge_crash[merge_crash.down == 1]\
                 [['residuals', 'TICKER', 'date']]
up_residuals = merge_crash[merge_crash.up == 1]\
               [['residuals', 'TICKER', 'date']]

In [None]:
# Squared and cubed residuals for each subset, then move the raw residual
# into a subset-specific column name so the later merges do not collide on
# `residuals`.
down_residuals['residuals_down_sq'] = down_residuals['residuals'] ** 2
down_residuals['residuals_down_cubic'] = down_residuals['residuals'] **3
up_residuals['residuals_up_sq'] = up_residuals['residuals'] ** 2
up_residuals['residuals_up_cubic'] = up_residuals['residuals'] ** 3
down_residuals['down_residuals'] = down_residuals['residuals']
up_residuals['up_residuals'] = up_residuals['residuals']
del down_residuals['residuals']
del up_residuals['residuals']

In [None]:
# Squared and cubed residuals over all rows.
merge_crash['residuals_sq'] = merge_crash['residuals'] ** 2
merge_crash['residuals_cubic'] = merge_crash['residuals'] ** 3

In [None]:
# Join the down-only and up-only columns back onto the full frame by
# (TICKER, date); rows without a match in a subset get NaN in its columns.
merge_crash_all = merge_crash.merge(down_residuals,
                                    on=['TICKER', 'date'],
                                    how='outer')
merge_crash_all = merge_crash_all.merge(up_residuals,
                                        on=['TICKER', 'date'],
                                        how='outer')

In [None]:
# Columns that are summed per ticker and year.
cols = ['BIDLO', 'ASKHI', 'residuals', 
        'annual_std', 'residuals_sq', 'residuals_cubic',
        'down', 'up', 'residuals_up_sq', 'residuals_down_sq',
        'neg_outliers']
merge_crash_all = merge_crash_all.set_index('date')
merge_grouped = merge_crash_all.groupby('TICKER')[cols]\
                .resample('1Y').sum().reset_index()
# Reduce the yearly outlier count to a 0/1 indicator (1 = at least one flagged row).
merge_grouped['neg_outliers'] = np.where(merge_grouped.neg_outliers >=
                                         1, 1, 0)

In [None]:
# Sum and count of the down/up flags per ticker and year.
# NOTE(review): merge_grouped is already yearly at this point, so this looks
# like a pass-through re-aggregation — confirm.
merge_grouped = merge_grouped.set_index('date')
merge_all = merge_grouped.groupby('TICKER')\
            .resample('1Y').agg({'down':['sum', 'count'],
                                 'up':['sum', 'count']})\
            .reset_index()
merge_all.head()

In [None]:
# Copy the summed flags back by position (.values bypasses index alignment)
# and derive the number of rows per ticker-year as down + up.
merge_grouped['down'] = merge_all['down']['sum'].values
merge_grouped['up'] = merge_all['up']['sum'].values
merge_grouped['count'] = merge_grouped['down'] + merge_grouped['up']

In [None]:
# Bring `date` back as a regular column.
merge_grouped = merge_grouped.reset_index()

In [None]:
# Down-to-up volatility per ticker-year:
# log of [(n_up - 1) * sum of squared down residuals]
#     over [(n_down - 1) * sum of squared up residuals].
duvol_numerator = (merge_grouped['up'] - 1) * \
                  merge_grouped['residuals_down_sq']
duvol_denominator = (merge_grouped['down'] - 1) * \
                    merge_grouped['residuals_up_sq']
merge_grouped['duvol'] = np.log(duvol_numerator / duvol_denominator)

In [None]:
# Average duvol over all ticker-years.
merge_grouped['duvol'].mean()

In [None]:
# Negative coefficient of skewness per ticker-year:
# -[n * (n - 1)^(3/2) * sum(r^3)] / [(n - 1) * (n - 2) * (sum(r^2))^(3/2)]
n_obs = merge_grouped['count']
skew_numerator = (n_obs * (n_obs - 1) ** (3 / 2)) * \
                 merge_grouped['residuals_cubic']
skew_denominator = ((n_obs - 1) * (n_obs - 2)) * \
                   merge_grouped['residuals_sq'] ** (3 / 2)
merge_grouped['ncskew'] = -(skew_numerator / skew_denominator)

In [None]:
# Average ncskew over all ticker-years.
merge_grouped['ncskew'].mean()

In [None]:
# Attach the yearly crash counts (aligned on the integer row labels of the
# two frames) and reduce them to a 0/1 indicator.
merge_grouped['crash_risk'] = merge_crash_annual['crash_risk']
merge_grouped['crash_risk'] = np.where(merge_grouped.crash_risk >= 
                                       1, 1, 0)

In [None]:
# Yearly per-ticker averages of VOL and PRC, attached to merge_grouped on the
# integer row labels.
merge_crash_all_grouped2 = merge_crash_all.groupby('TICKER')\
                            [['VOL', 'PRC']]\
                           .resample('1Y').mean().reset_index()
merge_grouped[['VOL', 'PRC']] = merge_crash_all_grouped2[['VOL', 'PRC']]

In [None]:
# Correlation between the two crash-risk measures.
merge_grouped[['ncskew','duvol']].corr()

## Balance Sheet Data

In [None]:
# Load the balance-sheet file, parse `datadate` (YYYYMMDD) and derive a
# calendar-year key from it.
bs = pd.read_csv('datasets/bs_v.3.csv')
bs['Date'] = pd.to_datetime(bs.datadate, format='%Y%m%d')
bs['annual_date'] = bs['Date'].dt.year

In [None]:
# Ratios of `ni` and `lt` to `at`.
# NOTE(review): presumably net income, total liabilities and total assets —
# confirm against the data dictionary.
bs['RoA'] = bs['ni'] / bs['at']
bs['leverage'] = bs['lt'] / bs['at']

In [None]:
# Give both frames the same join keys: a year column and a TICKER column
# (the balance-sheet frame names it `tic`).
merge_grouped['annual_date'] = merge_grouped['date'].dt.year
bs['TICKER'] = bs.tic
del bs['tic']

In [None]:
# Join balance-sheet rows with the yearly crash measures on ticker and year
# (pd.merge defaults to an inner join).
merge_ret_bs = pd.merge(bs, merge_grouped,
                        on=['TICKER', 'annual_date'])

In [None]:
# Average per ticker and calendar year of the balance-sheet date, then turn
# the (TICKER, Date) index back into columns.
# NOTE(review): how .mean() treats any non-numeric columns left in the frame
# depends on the pandas version — confirm against the pinned version.
merge_ret_bs2 = merge_ret_bs.set_index('Date')
merge_ret_bs2 = merge_ret_bs2.groupby('TICKER').resample('Y').mean()
merge_ret_bs2.reset_index(inplace=True)

In [None]:
# dturn = the following row's VOL / csho within the same ticker, minus the
# current row's VOL / csho. shift(-1) pulls the next row, so the difference
# looks one row ahead.
merge_ret_bs2['vol_csho_diff'] = (merge_ret_bs2.groupby('TICKER')
                                  ['VOL'].shift(-1) / 
                                  merge_ret_bs2.groupby('TICKER')
                                  ['csho'].shift(-1))
merge_ret_bs2['dturn1'] = merge_ret_bs2['VOL'] / merge_ret_bs2['csho']
merge_ret_bs2['dturn'] = merge_ret_bs2['vol_csho_diff'] - \
                         merge_ret_bs2['dturn1']

In [None]:
# Inputs for the firm-level sentiment index.
# NOTE(review): 'p/e' divides PRC by `ni` as-is (not by a per-share figure) —
# confirm this is the intended definition.
merge_ret_bs2['p/e'] = merge_ret_bs2['PRC'] / merge_ret_bs2['ni']
merge_ret_bs2['turnover_rate'] = merge_ret_bs2['VOL'] / \
                                 merge_ret_bs2['csho']
merge_ret_bs2['equity_share'] = merge_ret_bs2['ceq'] / \
                                (merge_ret_bs2['ceq'] +
                                 merge_ret_bs2['dt'])
merge_ret_bs2['firm_size'] = np.log(merge_ret_bs2['at'])
# Discount of price to net asset value per share: (NAV - PRC) / NAV, with
# NAV = (at - lt) / csho. The denominator needs its own parentheses: written
# as `... / (at - lt) / csho`, the expression divides by (at - lt) * csho
# rather than by NAV.
nav_per_share = (merge_ret_bs2['at'] - merge_ret_bs2['lt']) / \
                merge_ret_bs2['csho']
merge_ret_bs2['cefd'] = (nav_per_share - merge_ret_bs2['PRC']) / \
                        nav_per_share

In [None]:
# Split VOL into a buying and a selling part according to where PRC sits
# between BIDLO and ASKHI, then total each part per ticker and year.
merge_ret_bs2 = merge_ret_bs2.set_index('Date')
merge_ret_bs2['buying_volume'] = merge_ret_bs2['VOL'] * \
                                 (merge_ret_bs2['PRC'] - 
                                  merge_ret_bs2['BIDLO']) / \
                                 (merge_ret_bs2['ASKHI'] - 
                                  merge_ret_bs2['BIDLO'])
merge_ret_bs2['selling_volume'] = merge_ret_bs2['VOL'] * \
                                  (merge_ret_bs2['ASKHI'] - 
                                   merge_ret_bs2['PRC']) / \
                                  (merge_ret_bs2['ASKHI'] - 
                                   merge_ret_bs2['BIDLO'])
buying_volume = merge_ret_bs2.groupby('TICKER')['buying_volume'] \
                .resample('Y').sum().reset_index()
selling_volume = merge_ret_bs2.groupby('TICKER')['selling_volume'] \
                .resample('Y').sum().reset_index()
# Remove the key columns from one side so the column-wise concat that follows
# does not carry TICKER and Date twice.
del buying_volume['TICKER']
del buying_volume['Date']

In [None]:
# Place the two volume frames side by side (aligned on their integer row
# labels) and compute (buy - sell) / (buy + sell).
buy_sel_vol = pd.concat([buying_volume,selling_volume], axis=1)
buy_sel_vol['bsi'] = (buy_sel_vol.buying_volume - 
                      buy_sel_vol.selling_volume) / \
                     (buy_sel_vol.buying_volume + 
                      buy_sel_vol.selling_volume)

In [None]:
# Restore `Date` as a column and join the yearly volume/bsi columns onto the
# main frame by (TICKER, Date).
merge_ret_bs2 = merge_ret_bs2.reset_index()
merge_ret_bs2 = pd.merge(buy_sel_vol ,merge_ret_bs2,
                         on=['TICKER', 'Date'])

## Firm Sentiment via PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# The six inputs of the sentiment index; missing values are replaced by the
# mean of their own column.
firm_sentiment = merge_ret_bs2[['p/e', 'turnover_rate',
                                'equity_share', 'cefd',
                                'leverage', 'bsi']]
firm_sentiment = firm_sentiment.apply(lambda x: x.fillna(x.mean()),
                                      axis=0)

In [None]:
# Standardize the inputs and fit a PCA that keeps all six components, then
# report the share of variance each component explains.
firm_sentiment_std = StandardScaler().fit_transform(firm_sentiment)
pca = PCA(n_components=6)
pca_market_sentiment = pca.fit_transform(firm_sentiment_std)
print('Explained Variance Ratios per Component are:\n {}'\
      .format(pca.explained_variance_ratio_))

In [None]:
# Loadings table: component weights scaled by the square root of each
# component's explained variance (rows = input variables, columns = components).
loadings_1 = pd.DataFrame(pca.components_.T * 
                          np.sqrt(pca.explained_variance_), 
                          columns=['PC1', 'PC2', 'PC3',
                                   'PC4', 'PC5', 'PC6'],
                          index=firm_sentiment.columns)
loadings_1

In [None]:
# Average loading of each input variable across the six components.
df_loading1 = pd.DataFrame(loadings_1.mean(axis=1))
df_loading1

In [None]:
# Collapse the six component scores into one series by taking their dot
# product with the averaged loadings, and store it on the main frame
# (aligned on integer row labels). `firm_sentiment` is re-bound here from the
# input table to the resulting index.
# NOTE(review): the scores are ordered by component while df_loading1 is
# ordered by input variable; the product is only conformable because both
# have six entries — confirm this weighting is intended.
firm_sentiment = pd.DataFrame(np.dot(pca_market_sentiment,
                                     np.array(df_loading1)))
merge_ret_bs2['firm_sent'] = firm_sentiment

## Panel Data Application

In [None]:
# Natural log of `at` (the same quantity as the earlier `firm_size` column).
merge_ret_bs2['log_size'] = np.log(merge_ret_bs2['at'])

In [None]:
# Two-level panel index: TICKER first, then Date.
merge_ret_bs2.set_index(['TICKER', 'Date'], inplace=True)

In [None]:
# Regressors enter with a one-row lag; the dependent variable stays
# contemporaneous.
# The lag is taken within each TICKER. A plain .shift(1) over the whole panel
# would hand the first row of every ticker the last row of the previous
# ticker; with the grouped shift those first rows are NaN instead.
lagged_cols = ['log_size', 'rect', 'ppegt', 'dturn',
               'ncskew', 'residuals', 'RoA', 'annual_std',
               'firm_sent']
X = merge_ret_bs2[lagged_cols].groupby(level='TICKER').shift(1)
X['neg_outliers'] = merge_ret_bs2['neg_outliers']

In [None]:
from pyeconometrics.panel_discrete_models import FixedEffectPanelModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Fit the fixed-effects panel model on X and print its summary.
# NOTE(review): the second argument is presumably the name of the dependent
# column inside X — confirm against the pyeconometrics API.
FE_ML = FixedEffectPanelModel()
FE_ML.fit(X, 'neg_outliers')
FE_ML.summary()

In [None]:
# Swap the dependent variable: drop neg_outliers from X and add crash_risk.
del X['neg_outliers']
X['crash_risk'] = merge_ret_bs2['crash_risk']

In [None]:
# Refit the same model specification with crash_risk in place of
# neg_outliers and print its summary.
FE_crash = FixedEffectPanelModel()
FE_crash.fit(X, 'crash_risk')
FE_crash.summary()