In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from finance_byu.summarize import summary



In [2]:
crsp_daily1 = pd.read_feather('C:/Users/benja/desktop/ACME_Senior/Fin585/Final Project/crsp_daily.ftr')

crsp_monthly1 = pd.read_feather('C:/Users/benja/desktop/ACME_Senior/Fin585/Final Project/crsp_monthly.ftr')

In [3]:
# for both datasets keep only the data post jan 1 2000 from caldt
crsp_daily2 = crsp_daily1[crsp_daily1['caldt'] >= '2010-01-01']
crsp_monthly2 = crsp_monthly1[crsp_monthly1['caldt'] >= '2010-01-01']

In [4]:
crsp_daily = crsp_daily2.copy()
crsp_monthly = crsp_monthly2.copy()

In [5]:
# Cleaning the daily dataset

crsp_daily['prc'] = abs(crsp_daily['prc'])

crsp_daily['prc_lag'] = crsp_daily.groupby('permno')['prc'].shift(1)
crsp_daily['ret_lag'] = crsp_daily.groupby('permno')['ret'].shift(1)

# keep prc_lag > 5
crsp_daily = crsp_daily[crsp_daily['prc_lag'] > 5]

crsp_daily = crsp_daily[crsp_daily['ret_lag'] > -1]

crsp_daily.sort_values(by = ['permno', 'caldt'], inplace = True)

crsp_daily.drop(columns = ['shrcd', 'excd', 'siccd', 'vol', 'shr', 'prc_lag'], inplace = True)

# Adding column for positive and negative returns

crsp_daily['ret_class'] = np.where(crsp_daily['ret_lag'].shift(1) >= 0, '1', '0')

# Calculate rolling yearly number of positive and negative days for each stock

n = 252

crsp_daily['pos_days'] = crsp_daily.groupby('permno').rolling(window = n, min_periods = n)['ret_class'].sum().reset_index(level=0, drop=True)

crsp_daily['neg_days'] = n - crsp_daily['pos_days']

crsp_daily['%pos'] = crsp_daily['pos_days'] / n

crsp_daily['%neg'] = crsp_daily['neg_days'] / n

crsp_daily['%neg - %pos'] = crsp_daily['%neg'] - crsp_daily['%pos']

crsp_daily.drop(columns = ['ret', 'ret_class', 'pos_days', 'neg_days', '%pos', '%neg'], inplace = True)

crsp_daily.dropna(inplace = True)

In [6]:
crsp_daily.head()

Unnamed: 0,permno,caldt,prc,ret_lag,%neg - %pos
6663,10001,2011-01-03,10.45,-0.00095,-0.119048
6664,10001,2011-01-04,10.44,-0.006654,-0.119048
6665,10001,2011-01-05,10.4,-0.000957,-0.119048
6666,10001,2011-01-06,10.43,-0.003831,-0.119048
6667,10001,2011-01-07,10.49,0.002885,-0.111111


In [7]:
crsp_monthly.drop(columns = ['cusip', 'ticker', 'shrcd', 'excd', 'siccd', 'vol', 'shr', 'cumfacshr'], inplace = True)

crsp_monthly['prc_lag'] = crsp_monthly.groupby('permno')['prc'].shift(1)
crsp_monthly['ret_lag'] = crsp_monthly.groupby('permno')['ret'].shift(1)

# keep prc_lag > 5
crsp_monthly = crsp_monthly[crsp_monthly['prc_lag'] > 5]

crsp_monthly = crsp_monthly[crsp_monthly['ret_lag'] > -1]

crsp_monthly.drop(columns = ['prc_lag', 'ret'], inplace = True)

crsp_monthly.head()

Unnamed: 0,permno,caldt,prc,ret_lag
306,10001,2010-02-26,10.0084,-0.018932
307,10001,2010-03-31,10.17,-0.000656
308,10001,2010-04-30,11.39,0.020643
309,10001,2010-05-28,11.4,0.124385
310,10001,2010-06-30,10.86,0.004829


In [9]:
crsp_monthly['logret'] = np.log(1 + crsp_monthly['ret_lag'])
crsp_monthly['mom'] = crsp_monthly.groupby('permno')['logret'].rolling(11,11).sum().reset_index(drop=True, level=0)
crsp_monthly['mom'] = crsp_monthly.groupby('permno')['mom'].shift(2)
crsp_monthly.drop(columns = ['ret_lag'], inplace = True)
crsp_monthly.dropna(inplace=True)
crsp_monthly.tail()

Unnamed: 0,permno,caldt,prc,logret,mom
4889699,93436,2023-08-31,258.07999,0.021392,-0.095979
4889700,93436,2023-09-29,250.22,-0.035588,-0.12677
4889701,93436,2023-10-31,200.84,-0.030929,-0.030128
4889702,93436,2023-11-30,240.08,-0.219832,-0.027402
4889703,93436,2023-12-29,248.48,0.178463,0.095016


In [11]:
# merge crsp_daily and crsp_monthly on permno and caldt, I dont want to drop any daily rows
# I want to fill daily rows that would be nans with all the data from that month
crsp = pd.merge(crsp_daily, crsp_monthly, on = ['permno', 'caldt'], how = 'left')

#fill any nans with the data that follows
crsp.fillna(method = 'bfill', inplace = True)

crsp.drop(columns = ['prc_x', 'prc_y', 'logret'], inplace = True)

crsp

  crsp.fillna(method = 'bfill', inplace = True)


Unnamed: 0,permno,caldt,ret_lag,%neg - %pos,mom
0,10001,2011-01-03,-0.000950,-0.119048,0.020414
1,10001,2011-01-04,-0.006654,-0.119048,0.020414
2,10001,2011-01-05,-0.000957,-0.119048,0.020414
3,10001,2011-01-06,-0.003831,-0.119048,0.020414
4,10001,2011-01-07,0.002885,-0.111111,0.020414
...,...,...,...,...,...
18794434,93436,2023-12-22,0.029781,-0.087302,0.095016
18794435,93436,2023-12-26,-0.007701,-0.095238,0.095016
18794436,93436,2023-12-27,0.016116,-0.095238,0.095016
18794437,93436,2023-12-28,0.018822,-0.103175,0.095016


In [12]:
crsp['id'] = np.sign(crsp['mom']) * crsp['%neg - %pos']
crsp.head()

Unnamed: 0,permno,caldt,ret_lag,%neg - %pos,mom,id
0,10001,2011-01-03,-0.00095,-0.119048,0.020414,-0.119048
1,10001,2011-01-04,-0.006654,-0.119048,0.020414,-0.119048
2,10001,2011-01-05,-0.000957,-0.119048,0.020414,-0.119048
3,10001,2011-01-06,-0.003831,-0.119048,0.020414,-0.119048
4,10001,2011-01-07,0.002885,-0.111111,0.020414,-0.111111


In [13]:
crsp['mom_bins'] = crsp.groupby("caldt")['mom'].transform(pd.cut,2, labels=False)
crsp.tail()

Unnamed: 0,permno,caldt,ret_lag,%neg - %pos,mom,id,mom_bins
18794434,93436,2023-12-22,0.029781,-0.087302,0.095016,-0.087302,1
18794435,93436,2023-12-26,-0.007701,-0.095238,0.095016,-0.095238,1
18794436,93436,2023-12-27,0.016116,-0.095238,0.095016,-0.095238,1
18794437,93436,2023-12-28,0.018822,-0.103175,0.095016,-0.103175,1
18794438,93436,2023-12-29,-0.031594,-0.111111,0.095016,-0.111111,1


In [14]:
crsp['id_bins'] = crsp.groupby(["caldt", "mom_bins"])['id'].transform(pd.cut,5, labels=False)
crsp.tail()

Unnamed: 0,permno,caldt,ret_lag,%neg - %pos,mom,id,mom_bins,id_bins
18794434,93436,2023-12-22,0.029781,-0.087302,0.095016,-0.087302,1,2
18794435,93436,2023-12-26,-0.007701,-0.095238,0.095016,-0.095238,1,2
18794436,93436,2023-12-27,0.016116,-0.095238,0.095016,-0.095238,1,2
18794437,93436,2023-12-28,0.018822,-0.103175,0.095016,-0.103175,1,2
18794438,93436,2023-12-29,-0.031594,-0.111111,0.095016,-0.111111,1,2


In [17]:
crsp['id_bins2'] = crsp.groupby(["caldt"])['id'].transform(pd.qcut,5, labels=False)

In [18]:
port = crsp.groupby(['caldt','mom_bins','id_bins'])['ret_lag'].mean()*100
port = port.unstack(level=['mom_bins','id_bins'])
port.head()

mom_bins,0,0,0,0,0,1,1,1,1,1
id_bins,0,1,2,3,4,0,1,2,3,4
caldt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2011-01-03,0.0,0.779494,-0.036467,-0.154106,-0.0218,0.079143,-0.14005,-0.387494,-0.242761,-0.619954
2011-01-04,0.0,0.098874,1.099253,1.25927,0.0109,0.376687,1.202464,1.559454,1.773841,1.248933
2011-01-05,0.0,-0.097925,-0.722088,-0.794958,-0.84855,-0.523286,-1.203249,-1.262617,-1.224199,-0.779
2011-01-06,5.3837,2.264033,0.135981,1.328489,2.168,0.04,0.041155,0.60992,0.686346,0.04425
2011-01-07,0.28195,0.587567,-0.342933,0.846711,-1.9241,-0.04,-0.221665,-0.344111,-0.164539,0.08845


In [19]:
port1 = crsp.groupby(['caldt','mom_bins','id_bins2'])['ret_lag'].mean()*100
port1 = port1.unstack(level=['mom_bins','id_bins2'])
port1.head()

mom_bins,0,0,0,0,0,1,1,1,1,1
id_bins2,0,1,2,3,4,0,1,2,3,4
caldt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2011-01-03,0.496434,0.025244,-0.144044,-0.12061,-0.20564,-0.124486,-0.464888,-0.282166,-0.244564,-0.392536
2011-01-04,0.399736,0.832082,1.224296,1.338579,1.417903,0.975774,1.53127,1.526175,1.720719,1.715554
2011-01-05,-0.296695,-0.524801,-0.709342,-0.959379,-0.914653,-1.009559,-1.392146,-1.13646,-1.229201,-1.248095
2011-01-06,5.3837,0.837733,0.6619,0.974482,1.7777,0.27178,0.550692,0.733771,0.710093,0.716895
2011-01-07,0.28195,0.587567,0.244775,-0.153609,-0.743283,-0.291211,-0.361419,-0.333244,-0.4,-0.270527


In [20]:
summary(port).loc[['mean','std','tstat']].round(3)

mom_bins,0,0,0,0,0,1,1,1,1,1
id_bins,0,1,2,3,4,0,1,2,3,4
mean,0.123,0.144,0.103,0.137,0.125,0.108,0.054,0.057,0.065,0.106
std,3.275,1.662,1.655,2.309,4.111,2.474,1.336,1.25,1.337,2.969
tstat,2.146,4.919,3.549,3.37,1.738,2.486,2.318,2.607,2.765,2.034


In [21]:
summary(port1).loc[['mean','std','tstat']].round(3)

mom_bins,0,0,0,0,0,1,1,1,1,1
id_bins2,0,1,2,3,4,0,1,2,3,4
mean,0.105,0.127,0.098,0.116,0.063,0.05,0.052,0.067,0.068,0.062
std,2.77,1.827,1.838,1.844,1.67,1.148,1.272,1.32,1.375,1.429
tstat,2.162,3.931,3.038,3.597,2.14,2.514,2.351,2.899,2.844,2.482
