In [1]:
import pandas as pd
import numpy as np
from finance_byu.summarize import summary
import statsmodels.formula.api as smf
from finance_byu.regtables import Regtable

In [2]:
df = pd.read_csv("crsp_daily.csv")
df = df[['permno','caldt','prc','ret']]
df

Unnamed: 0,permno,caldt,prc,ret
0,10000,1986-01-07,-2.5625,
1,10000,1986-01-08,-2.5000,-0.024390
2,10000,1986-01-09,-2.5000,0.000000
3,10000,1986-01-10,-2.5000,0.000000
4,10000,1986-01-13,-2.6250,0.050000
...,...,...,...,...
105258375,93436,2023-12-22,252.5400,-0.007701
105258376,93436,2023-12-26,256.6100,0.016116
105258377,93436,2023-12-27,261.4400,0.018822
105258378,93436,2023-12-28,253.1800,-0.031594


In [3]:
# Clean prc
df['prc'] = abs(df['prc'])

# Lag ret
df['ret_lag'] = df.groupby('permno')['ret'].shift(1)

# Filter out returns
df = df[df['ret'] > -1]

df

Unnamed: 0,permno,caldt,prc,ret,ret_lag
1,10000,1986-01-08,2.500,-0.024390,
2,10000,1986-01-09,2.500,0.000000,-0.024390
3,10000,1986-01-10,2.500,0.000000,0.000000
4,10000,1986-01-13,2.625,0.050000,0.000000
5,10000,1986-01-14,2.750,0.047619,0.050000
...,...,...,...,...,...
105258375,93436,2023-12-22,252.540,-0.007701,0.029781
105258376,93436,2023-12-26,256.610,0.016116,-0.007701
105258377,93436,2023-12-27,261.440,0.018822,0.016116
105258378,93436,2023-12-28,253.180,-0.031594,0.018822


In [4]:
df['up'] = df['ret_lag'] > 0
df['up'] = df['up'].astype(int)
df['down'] = df['ret_lag'] < 0
df['down'] = df['down'].astype(int)
df['total'] = 1
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['up'] = df['ret_lag'] > 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['up'] = df['up'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['down'] = df['ret_lag'] < 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

Unnamed: 0,permno,caldt,prc,ret,ret_lag,up,down,total
1,10000,1986-01-08,2.5,-0.02439,,0,0,1
2,10000,1986-01-09,2.5,0.0,-0.02439,0,1,1
3,10000,1986-01-10,2.5,0.0,0.0,0,0,1
4,10000,1986-01-13,2.625,0.05,0.0,0,0,1
5,10000,1986-01-14,2.75,0.047619,0.05,1,0,1


In [5]:
# Create mdt
df['caldt'] = pd.to_datetime(df['caldt'])
df['mdt'] = df['caldt'].dt.to_period('M')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['caldt'] = pd.to_datetime(df['caldt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mdt'] = df['caldt'].dt.to_period('M')


Unnamed: 0,permno,caldt,prc,ret,ret_lag,up,down,total,mdt
1,10000,1986-01-08,2.5,-0.02439,,0,0,1,1986-01
2,10000,1986-01-09,2.5,0.0,-0.02439,0,1,1,1986-01
3,10000,1986-01-10,2.5,0.0,0.0,0,0,1,1986-01
4,10000,1986-01-13,2.625,0.05,0.0,0,0,1,1986-01
5,10000,1986-01-14,2.75,0.047619,0.05,1,0,1,1986-01


In [6]:
df = df.groupby(['permno','mdt'])[['prc','up','down','total']].agg({'prc': 'last', 'up': 'sum','down':'sum','total':'sum'})
df.reset_index("mdt",inplace=True)
df.reset_index("permno",inplace=True)
df

Unnamed: 0,permno,mdt,prc,up,down,total
0,10000,1986-01,4.37500,8,2,18
1,10000,1986-02,3.25000,2,10,19
2,10000,1986-03,4.43750,6,2,20
3,10000,1986-04,4.00000,3,8,22
4,10000,1986-05,3.10938,4,13,21
...,...,...,...,...,...,...
4918548,93436,2023-08,258.08000,9,14,23
4918549,93436,2023-09,250.22000,8,12,20
4918550,93436,2023-10,200.84000,10,12,22
4918551,93436,2023-11,240.08000,15,6,21


In [7]:
df.to_csv("parsed_monthly.csv")