In [1]:
#import shenanigans
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt

import warnings
warnings.simplefilter("ignore")

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from scipy import stats

In [2]:
# Load the three model input files, one DataFrame per model
data1, data2, data3 = [
    pd.read_csv(csv_name)
    for csv_name in ('model1.csv', 'model2.csv', 'model3.csv')
]

In [3]:
# Normalise column types on every model frame (each frame is modified in place):
#   - Date:   parsed from 'YYYY-MM-DD' text into datetime
#   - Return: rescaled from percentage to decimal
for frame in (data1, data2, data3):
    frame['Date'] = pd.to_datetime(frame.Date, format='%Y-%m-%d')
    frame["Return"] = frame["Return"]/100

In [4]:
# Drop repeated (Date, GVKEY) records, keeping the last occurrence of each pair
data1, data2, data3 = [
    frame.drop_duplicates(subset=['Date', 'GVKEY'], keep='last')
    for frame in (data1, data2, data3)
]

### Question 1

In [5]:
#Create function to solve question 1
def Question1(dataset):
    """Run one cross-sectional OLS regression of Return per date.

    For every distinct value in ``dataset["Date"]`` the rows of that date are
    regressed (statsmodels OLS with an intercept) on all columns from
    position 3 onward, which are treated as the regressors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain "Date" and "Return" columns; every column from index 3
        onward is used as an explanatory variable.

    Returns
    -------
    (MainTable, SideTable) : tuple of pandas.DataFrame
        Both are indexed by date in descending order.
        MainTable holds the intercept ("Constant"), one slope per regressor,
        "Adjusted R2" and "No. of Firms" (the number of observations used).
        SideTable holds the matching coefficient standard errors, with
        +/-inf replaced by NaN.
        Dates with a single row are not regressed: their coefficients stay
        NaN and "No. of Firms" is set to 1.
    """
    datelist = np.sort(dataset["Date"].unique())[::-1]
    depParams = dataset.columns[3:].tolist()

    # NOTE(review): the nested list appears to make pandas build a one-level
    # MultiIndex for the columns. It is kept as-is so the returned tables look
    # the same to existing callers -- confirm whether a flat Index was intended.
    MainTable = pd.DataFrame(index = datelist,
                         columns = [["Constant"] + depParams + ["Adjusted R2"] + ["No. of Firms"]])

    SideTable = pd.DataFrame(index = datelist,
                         columns = [["Constant"] + depParams])

    for currdate in datelist:
        datatmp = dataset[dataset["Date"]==currdate].reset_index(drop = True)
        if (len(datatmp)>1):
            # has_constant="add" forces the intercept column even when one of
            # the regressors happens to be constant on this date; with the
            # default ("skip") no intercept would be added in that case and
            # the coefficients would no longer line up with the table columns.
            exog = sm.add_constant(datatmp[depParams], has_constant="add")
            reg = sm.OLS(datatmp["Return"], exog).fit()

            # Look coefficients up by label rather than by position so the
            # mapping to table columns does not depend on ordering.
            MainTable.loc[currdate, "Constant"] = reg.params["const"]
            SideTable.loc[currdate, "Constant"] = reg.bse["const"]
            for param in depParams:
                MainTable.loc[currdate, param] = reg.params[param]
                SideTable.loc[currdate, param] = reg.bse[param]
            MainTable.loc[currdate, "Adjusted R2"] = reg.rsquared_adj
            MainTable.loc[currdate, "No. of Firms"] = reg.nobs
        else:
            # Every date comes from the data itself, so this branch means
            # exactly one row: too few observations to fit a regression.
            MainTable.loc[currdate, "No. of Firms"] = 1
    # Standard errors can come out infinite for degenerate fits; report NaN.
    SideTable = SideTable.replace([np.inf, -np.inf], np.nan)
    return MainTable, SideTable

In [6]:
# Run Question 1 on each model and write both result tables to CSV.
# to_csv writes the index and the header row by default.

# Model 1
data1_result1, data1_result1b = Question1(data1)
data1_result1.to_csv('data1_result1.csv')
data1_result1b.to_csv('data1_result1b.csv')

# Model 2
data2_result1, data2_result1b = Question1(data2)
data2_result1.to_csv('data2_result1.csv')
data2_result1b.to_csv('data2_result1b.csv')

# Model 3
data3_result1, data3_result1b = Question1(data3)
data3_result1.to_csv('data3_result1.csv')
data3_result1b.to_csv('data3_result1b.csv')

### Question 2

In [7]:
def Question2(dataset_result1, dataset_result1b):
    """Summarise per-date regression estimates across all dates.

    Parameters
    ----------
    dataset_result1 : pandas.DataFrame
        Per-date coefficient table. Its last two columns are ignored; every
        other column is summarised.
    dataset_result1b : pandas.DataFrame
        Per-date standard errors, with the same column labels as the
        summarised columns of ``dataset_result1``.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column with:
        "Mean"   - mean of the per-date estimates,
        "StdErr" - mean of the per-date standard errors,
        "T-Stat" / "P-Stat" - one-sample t-test of the non-missing per-date
                   estimates against zero,
        "Sig"    - "1%", "5%", "10%" or "NotSig" according to the p-value.

    Notes
    -----
    "StdErr" is an average of the regression standard errors; it is not the
    standard error behind "T-Stat", which the t-test derives from the spread
    of the per-date estimates themselves.
    """
    data_result2 = pd.DataFrame(index = dataset_result1.columns[:-2],
                             columns = ["Mean", "StdErr", "T-Stat", "P-Stat","Sig"])
    for col in data_result2.index:
        # The input tables may be object-dtype; coerce to float so the
        # statistics are computed on numeric data.
        estimates = dataset_result1[col].astype(float)
        data_result2.loc[col,"Mean"] = estimates.mean()
        data_result2.loc[col,"StdErr"] = dataset_result1b[col].astype(float).mean()

        # Run the t-test once and read both statistics from the same result.
        ttest = stats.ttest_1samp(estimates.dropna(), 0)
        tvalue, pvalue = ttest[0], ttest[1]
        data_result2.loc[col,"T-Stat"] = tvalue
        data_result2.loc[col,"P-Stat"] = pvalue

        # Tightest significance level the p-value clears. A NaN p-value fails
        # every comparison and is therefore labelled "NotSig".
        if pvalue < 0.01:
            data_result2.loc[col,"Sig"] = "1%"
        elif pvalue < 0.05:
            data_result2.loc[col,"Sig"] = "5%"
        elif pvalue < 0.1:
            data_result2.loc[col,"Sig"] = "10%"
        else:
            data_result2.loc[col,"Sig"] = "NotSig"
    return data_result2

In [8]:
# Build the Question 2 summary for each model and write it to CSV.
# to_csv writes the index and the header row by default.

# Model 1
data1_result2 = Question2(data1_result1, data1_result1b)
data1_result2.to_csv('data1_result2.csv')

# Model 2
data2_result2 = Question2(data2_result1, data2_result1b)
data2_result2.to_csv('data2_result2.csv')

# Model 3
data3_result2 = Question2(data3_result1, data3_result1b)
data3_result2.to_csv('data3_result2.csv')

In [9]:
# Display the Question 2 summary table for the model1.csv data
data1_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.016094,0.01377,4.62062,5e-06,1%
LogSize_-1,-0.000803,0.002584,-1.69462,0.090603,10%
LogB/M_-1,0.004783,0.005609,5.277325,0.0,1%
"Return_-2,-12",0.011512,0.009874,5.030341,1e-06,1%


In [10]:
# Display the Question 2 summary table for the model2.csv data
data2_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.015235,0.020492,2.557301,0.010775,5%
LogSize_-1,-0.000445,0.003986,-0.413936,0.679058,NotSig
LogB/M_-1,0.004609,0.007495,4.832851,2e-06,1%
"Return_-2,-12",0.012227,0.015672,2.759604,0.00595,1%
"LogIssues_-1,-36",-0.005636,0.013802,-1.597059,0.110739,NotSig
Accruals_Yr-1,-4e-06,6.7e-05,-0.293948,0.768892,NotSig
ROA_Yr-1,-0.00024,0.085726,-0.012983,0.989645,NotSig
LogAG_Yr-1,-0.003767,0.027171,-0.911634,0.3623,NotSig


In [11]:
# Display the Question 2 summary table for the model3.csv data
data3_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.015484,0.015709,4.479822,9e-06,1%
LogSize_-1,-0.00101,0.002594,-1.775176,0.076354,10%
LogB/M_-1,0.002265,0.006823,2.403627,0.016523,5%
"Return_-2,-12",0.006645,0.010928,3.730254,0.000209,1%
"LogIssues_-1,-36",-0.000919,0.013874,-0.207548,0.83565,NotSig
Accruals_Yr-1,-1.3e-05,4.2e-05,-1.016147,0.309952,NotSig
ROA_Yr-1,0.015216,0.064006,2.258956,0.024229,5%
LogAG_Yr-1,-0.007115,0.020817,-3.827034,0.000143,1%
"DY_-1,-12",0.004186,0.195149,0.209533,0.8341,NotSig
"LogReturn_-13,-36",1e-06,0.006396,0.001551,0.998763,NotSig


### Question 3

1. For each rolling 10-year window, calculate:
    a. The 10-year average of all params
    b. A regression to calculate the slopes
2. Predict the regression result using the average params
3. Compare the predicted values to the real ones and calculate the figures for the table

In [14]:
# Inspect the column dtypes and non-null counts of data3
data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1357955 entries, 0 to 1378601
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   GVKEY              1357955 non-null  int64         
 1   Date               1357955 non-null  datetime64[ns]
 2   Return             1357955 non-null  float64       
 3   LogSize_-1         1357955 non-null  float64       
 4   LogB/M_-1          1357955 non-null  float64       
 5   Return_-2,-12      1357955 non-null  float64       
 6   LogIssues_-1,-36   1357955 non-null  float64       
 7   Accruals_Yr-1      1357955 non-null  float64       
 8   ROA_Yr-1           1357955 non-null  float64       
 9   LogAG_Yr-1         1357955 non-null  float64       
 10  DY_-1,-12          1357955 non-null  float64       
 11  LogReturn_-13,-36  1357955 non-null  float64       
 12  LogIssues_-1,-12   1357955 non-null  float64       
 13  Turnover_-1,-12    1357955 

In [12]:
# Spot-check the data3 rows whose GVKEY equals 1000
data3[data3["GVKEY"]==1000]

Unnamed: 0,GVKEY,Date,Return,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1
0,1000,1974-04-30,0.095238,1.924431,-0.081561,-0.4285718,-0.298542,-0.246,0.0894,0.089507,0.0,-0.551282,-0.153494,9739.004887,1.021711,5.509943
1,1000,1974-05-31,-0.130435,2.016424,0.048045,-0.2222232,-0.297521,-0.246,0.0894,0.089507,0.0,-0.686046,-0.150169,9758.016991,0.931915,5.025683
2,1000,1974-06-30,0.05,1.877682,0.187806,-0.08000134,-0.296501,-0.246,0.0894,0.089507,0.0,-0.637681,-0.146841,9405.625892,1.070609,5.773643
3,1000,1974-07-31,-0.142857,1.927492,0.139016,-0.1666681,-0.295198,-0.246,0.0894,0.089507,0.0,-0.636363,-0.132863,9020.064092,1.018589,5.493106
4,1000,1974-08-31,0.166667,1.766826,0.293167,-0.322582,-0.301428,-0.246,0.0894,0.089507,0.0,-0.367346,-0.126248,8754.809368,1.196121,6.45051
5,1000,1974-09-30,-0.190476,1.914419,0.139016,-0.2173922,-0.307701,-0.246,0.0894,0.089507,0.0,-0.452381,-0.119502,8662.078519,1.031992,5.565384
6,1000,1974-10-31,0.058824,1.69651,0.350325,-0.1250005,-0.263732,-0.246,0.0894,0.089507,0.0,-0.351351,-0.117826,8256.079708,1.283256,6.920415
7,1000,1974-11-30,0.0,1.697998,0.293167,-0.15,-0.266138,-0.246,0.0894,0.089507,0.0,-0.487179,-0.165151,7759.250244,1.281347,6.910123
8,1000,1974-12-31,-0.055556,1.639045,0.293167,0.2857149,-0.268829,-0.246,0.0894,0.089507,0.0,-0.575758,-0.215689,7304.437107,1.359157,7.329741
9,1000,1975-01-31,0.0,1.51924,0.350325,0.2857149,-0.329917,-0.246,0.0894,0.089507,0.0,-0.695652,-0.25097,6109.437653,1.532148,8.262654
