In [272]:
#import shenanigans
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt

import warnings
warnings.simplefilter("ignore")

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from scipy import stats

In [273]:
#Load the three model datasets from their CSV files
data1, data2, data3 = map(pd.read_csv, ('model1.csv', 'model2.csv', 'model3.csv'))

In [274]:
#Parse the Date column and rescale Return from percent to decimal, one frame at a time.
#NOTE: both steps modify the frames in place, so re-running this cell without
#reloading the CSVs divides Return by 100 a second time.
for frame in (data1, data2, data3):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    frame["Return"] = frame["Return"]/100

In [275]:
#Drop repeated (Date, GVKEY) records, retaining the last occurrence of each pair
dup_keys = ['Date', 'GVKEY']
data1 = data1.loc[~data1.duplicated(subset=dup_keys, keep='last')]
data2 = data2.loc[~data2.duplicated(subset=dup_keys, keep='last')]
data3 = data3.loc[~data3.duplicated(subset=dup_keys, keep='last')]

### Question 1

In [5]:
#Create function to solve question 1
def Question1(dataset, min_obs=2):
    """Run one cross-sectional OLS regression of "Return" per date.

    For every distinct value of ``dataset["Date"]`` the rows of that date are
    regressed (with an intercept) on the explanatory variables, and the
    estimates are collected in two date-indexed tables.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a "Date" and a "Return" column. Every column from the
        fourth one onwards is used as an explanatory variable; the first
        three columns are never used as regressors.
    min_obs : int, default 2
        Minimum number of rows a date needs for its regression to be run.
        Dates with fewer rows only get their "No. of Firms" filled in.
        NOTE(review): with the default, dates that have fewer rows than
        estimated parameters are still regressed, which gives an exact fit
        whose coefficients carry no statistical information; pass a larger
        value to exclude such dates.

    Returns
    -------
    MainTable : pandas.DataFrame
        Indexed by date (newest first) with float columns "Constant", one per
        explanatory variable, "Adjusted R2" and "No. of Firms".
    SideTable : pandas.DataFrame
        Same index, holding the standard errors of "Constant" and of every
        explanatory variable; infinite values are replaced by NaN.
    """
    datelist = np.sort(dataset["Date"].unique())[::-1]
    predictors = dataset.columns[3:].tolist()
    coef_cols = ["Constant"] + predictors

    # Flat (single-level) float columns: the tables are numeric throughout,
    # so downstream means and t-tests do not have to cope with object dtype.
    MainTable = pd.DataFrame(index = datelist,
                             columns = coef_cols + ["Adjusted R2", "No. of Firms"],
                             dtype = float)
    SideTable = pd.DataFrame(index = datelist, columns = coef_cols, dtype = float)

    # A single groupby pass replaces re-filtering the whole frame for each date.
    for currdate, datatmp in dataset.groupby("Date", sort=False):
        n_firms = len(datatmp)
        MainTable.loc[currdate, "No. of Firms"] = n_firms
        if n_firms < min_obs:
            continue

        # has_constant="add" guarantees the intercept column is inserted first
        # even when a regressor happens to be constant on this date, so the
        # positions of params/bse always line up with coef_cols.
        exog = sm.add_constant(datatmp[predictors], prepend=True, has_constant="add")
        reg = sm.OLS(datatmp["Return"], exog).fit()

        MainTable.loc[currdate, coef_cols] = reg.params.to_numpy()
        SideTable.loc[currdate, coef_cols] = reg.bse.to_numpy()
        MainTable.loc[currdate, "Adjusted R2"] = reg.rsquared_adj

    SideTable = SideTable.replace([np.inf, -np.inf], np.nan)
    return MainTable, SideTable

In [6]:
#Run Question 1 on each dataset, then write every resulting table to its CSV file
data1_result1, data1_result1b = Question1(data1)
data2_result1, data2_result1b = Question1(data2)
data3_result1, data3_result1b = Question1(data3)

for table, csv_path in (
    (data1_result1, r'data1_result1.csv'),
    (data1_result1b, r'data1_result1b.csv'),
    (data2_result1, r'data2_result1.csv'),
    (data2_result1b, r'data2_result1b.csv'),
    (data3_result1, r'data3_result1.csv'),
    (data3_result1b, r'data3_result1b.csv'),
):
    table.to_csv(csv_path, index = True, header=True)

In [7]:
# Display the per-date regression table that Question1 returned for data1
data1_result1

Unnamed: 0,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12",Adjusted R2,No. of Firms
2021-03-31,0.070153,0.001472,0.020264,-0.013328,0.046385,923.0
2021-02-28,0.194326,-0.007977,0.039267,0.019392,0.061897,1004.0
2021-01-31,0.220899,-0.016224,0.067245,0.297142,0.042235,1173.0
2020-12-31,0.163836,-0.01281,-0.010294,-0.015446,0.013615,3468.0
2020-11-30,0.288554,-0.012023,0.017516,-0.04559,0.03288,3500.0
...,...,...,...,...,...,...
1964-08-31,0.187475,-0.0167,0.141974,-0.600251,,3.0
1964-07-31,-0.055355,0.00644,-0.052775,0.14927,,3.0
1964-06-30,-0.977434,0.187247,0.258189,0.179663,,3.0
1964-05-31,-0.060533,0.009142,0.073093,0.368732,,3.0


In [202]:
# Display the per-date regression table that Question1 returned for data2
data2_result1

Unnamed: 0,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,Adjusted R2,No. of Firms
2021-03-31,0.005465,0.007034,0.022183,0.00564,-0.01147,-0.000002,0.136759,0.042628,0.046927,436.0
2021-02-28,0.365953,-0.031605,0.027741,-0.00318,-0.027405,-0.000041,0.110756,0.059094,0.084421,504.0
2021-01-31,0.196858,0.004568,0.112887,0.348645,-0.466657,0.000081,-0.644014,-0.180558,0.070688,644.0
2020-12-31,0.18357,-0.015177,-0.009077,-0.015458,-0.011577,-0.000005,0.007395,-0.027067,0.021629,2068.0
2020-11-30,0.254734,-0.005203,0.038726,-0.062321,-0.003232,-0.000008,-0.167597,0.014831,0.071075,2071.0
...,...,...,...,...,...,...,...,...,...,...
1966-08-31,,,,,,,,,,1
1966-07-31,,,,,,,,,,1
1966-06-30,,,,,,,,,,1
1966-05-31,,,,,,,,,,1


In [17]:
# Display the per-date regression table that Question1 returned for data3
data3_result1

Unnamed: 0,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1,Adjusted R2,No. of Firms
2021-03-31,-0.054286,0.008575,0.008727,0.018048,-0.010645,0.000007,0.113564,0.035128,0.768936,0.019835,-0.050903,-0.0,0.050305,0.003557,0.155379,412.0
2021-02-28,0.13817,-0.00488,0.023783,-0.024811,0.002797,0.000002,0.060061,-0.018918,-0.700483,-0.014884,-0.057176,0.0,0.030091,0.004603,0.138348,471.0
2021-01-31,0.174601,-0.027149,-0.001908,0.063074,0.02356,-0.00002,-0.010223,-0.030902,0.187995,-0.015165,0.024132,0.0,-0.013502,0.001663,0.218096,589.0
2020-12-31,0.188331,-0.014367,-0.003838,-0.002568,-0.034911,-0.000009,-0.021544,-0.039454,-0.639728,0.013014,0.081276,0.0,0.002417,-0.001618,0.028965,1833.0
2020-11-30,0.208749,-0.010434,0.01692,-0.06548,0.044577,-0.000003,-0.04107,-0.024447,-0.194055,0.006846,-0.09933,0.0,0.028487,0.005928,0.096661,1816.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968-07-31,,,,,,,,,,,,,,,,1
1968-06-30,,,,,,,,,,,,,,,,1
1968-05-31,,,,,,,,,,,,,,,,1
1968-04-30,,,,,,,,,,,,,,,,1


### Question 2

In [9]:
def Question2(dataset_result1, dataset_result1b):
    """Summarise the per-date regression coefficients over time.

    Parameters
    ----------
    dataset_result1 : pandas.DataFrame
        Per-date coefficient table. Every column except the last two is
        treated as a coefficient series.
    dataset_result1b : pandas.DataFrame
        Per-date standard errors with (at least) the same coefficient columns.

    Returns
    -------
    pandas.DataFrame
        One row per coefficient column with:
        "Mean"   - mean of the coefficient series (NaN skipped),
        "StdErr" - mean of the per-date standard errors (NaN skipped),
        "T-Stat" / "P-Stat" - one-sample t-test of the non-NaN coefficient
                   series against zero,
        "Sig"    - "1%", "5%", "10%" or "NotSig", from the p-value.

    Notes
    -----
    "StdErr" is an average of the per-date standard errors, not the standard
    error of "Mean"; "T-Stat" is computed separately from the coefficient
    series, so Mean / StdErr does not reproduce it.
    """
    # Ordered from strictest to loosest: the first cutoff the p-value beats wins.
    sig_levels = ((0.01, "1%"), (0.05, "5%"), (0.1, "10%"))

    data_result2 = pd.DataFrame(index = dataset_result1.columns[:-2],
                             columns = ["Mean", "StdErr", "T-Stat", "P-Stat","Sig"])
    for col in data_result2.index:
        # Cast to float so object-dtype inputs are handled numerically.
        coefs = dataset_result1[col].astype(float)
        std_errs = dataset_result1b[col].astype(float)

        # Run the t-test once and read both statistic and p-value from it.
        test = stats.ttest_1samp(coefs.dropna(), 0)
        tvalue, pvalue = test[0], test[1]

        data_result2.loc[col,"Mean"] = coefs.mean()
        data_result2.loc[col,"StdErr"] = std_errs.mean()
        data_result2.loc[col,"T-Stat"] = tvalue
        data_result2.loc[col,"P-Stat"] = pvalue
        # A NaN p-value fails every comparison and is therefore "NotSig".
        data_result2.loc[col,"Sig"] = next(
            (label for cutoff, label in sig_levels if pvalue < cutoff), "NotSig")
    return data_result2

In [10]:
# Summarise each model's Question 1 tables, then write the summaries to CSV
data1_result2 = Question2(data1_result1, data1_result1b)
data2_result2 = Question2(data2_result1, data2_result1b)
data3_result2 = Question2(data3_result1, data3_result1b)

for summary, csv_path in (
    (data1_result2, r'data1_result2.csv'),
    (data2_result2, r'data2_result2.csv'),
    (data3_result2, r'data3_result2.csv'),
):
    summary.to_csv(csv_path, index = True, header=True)

In [11]:
# Display the coefficient summary that Question2 returned for data1
data1_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.016094,0.01377,4.62062,5e-06,1%
LogSize_-1,-0.000803,0.002584,-1.69462,0.090603,10%
LogB/M_-1,0.004783,0.005609,5.277325,0.0,1%
"Return_-2,-12",0.011512,0.009874,5.030341,1e-06,1%


In [12]:
# Display the coefficient summary that Question2 returned for data2
data2_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.015235,0.020492,2.557301,0.010775,5%
LogSize_-1,-0.000445,0.003986,-0.413936,0.679058,NotSig
LogB/M_-1,0.004609,0.007495,4.832851,2e-06,1%
"Return_-2,-12",0.012227,0.015672,2.759604,0.00595,1%
"LogIssues_-1,-36",-0.005636,0.013802,-1.597059,0.110739,NotSig
Accruals_Yr-1,-4e-06,6.7e-05,-0.293948,0.768892,NotSig
ROA_Yr-1,-0.00024,0.085726,-0.012983,0.989645,NotSig
LogAG_Yr-1,-0.003767,0.027171,-0.911634,0.3623,NotSig


In [13]:
# Display the coefficient summary that Question2 returned for data3
data3_result2

Unnamed: 0,Mean,StdErr,T-Stat,P-Stat,Sig
Constant,0.015484,0.015709,4.479822,9e-06,1%
LogSize_-1,-0.00101,0.002594,-1.775176,0.076354,10%
LogB/M_-1,0.002265,0.006823,2.403627,0.016523,5%
"Return_-2,-12",0.006645,0.010928,3.730254,0.000209,1%
"LogIssues_-1,-36",-0.000919,0.013874,-0.207548,0.83565,NotSig
Accruals_Yr-1,-1.3e-05,4.2e-05,-1.016147,0.309952,NotSig
ROA_Yr-1,0.015216,0.064006,2.258956,0.024229,5%
LogAG_Yr-1,-0.007115,0.020817,-3.827034,0.000143,1%
"DY_-1,-12",0.004186,0.195149,0.209533,0.8341,NotSig
"LogReturn_-13,-36",1e-06,0.006396,0.001551,0.998763,NotSig


### Question 3

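1. Over a rolling 10-year (120-month) window, calculate:
    a. the 10-year average of every regression parameter (intercept and slopes)
    b. the monthly cross-sectional regressions that produce those slopes
2. Predict returns using the 10-year average parameters
3. Compare predicted returns with realized returns and summarize the comparison in a table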

In [14]:
# Display data3's per-date Question1 table again; the rolling averages below are taken from it
data3_result1

Unnamed: 0,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1,Adjusted R2,No. of Firms
2021-03-31,-0.054286,0.008575,0.008727,0.018048,-0.010645,0.000007,0.113564,0.035128,0.768936,0.019835,-0.050903,-0.0,0.050305,0.003557,0.155379,412.0
2021-02-28,0.13817,-0.00488,0.023783,-0.024811,0.002797,0.000002,0.060061,-0.018918,-0.700483,-0.014884,-0.057176,0.0,0.030091,0.004603,0.138348,471.0
2021-01-31,0.174601,-0.027149,-0.001908,0.063074,0.02356,-0.00002,-0.010223,-0.030902,0.187995,-0.015165,0.024132,0.0,-0.013502,0.001663,0.218096,589.0
2020-12-31,0.188331,-0.014367,-0.003838,-0.002568,-0.034911,-0.000009,-0.021544,-0.039454,-0.639728,0.013014,0.081276,0.0,0.002417,-0.001618,0.028965,1833.0
2020-11-30,0.208749,-0.010434,0.01692,-0.06548,0.044577,-0.000003,-0.04107,-0.024447,-0.194055,0.006846,-0.09933,0.0,0.028487,0.005928,0.096661,1816.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968-07-31,,,,,,,,,,,,,,,,1
1968-06-30,,,,,,,,,,,,,,,,1
1968-05-31,,,,,,,,,,,,,,,,1
1968-04-30,,,,,,,,,,,,,,,,1


In [276]:
# Distinct dates of data3, sorted in ascending order
unique_dates = data3["Date"].unique()
datelist = np.sort(unique_dates)

In [277]:
# Rolling-window layout, expressed as positions in the sorted `datelist`:
# window k averages the dates [START_OFFSET + k, START_OFFSET + k + WINDOW_MONTHS - 1]
# and forecasts the date immediately after the window.
START_OFFSET = 30     # first window starts at datelist[30] (presumably to skip the sparse early dates - confirm)
WINDOW_MONTHS = 120   # 120 consecutive dates per window (10 years of monthly data)
first_forecast_pos = START_OFFSET + WINDOW_MONTHS            # = 150
n_windows = max(len(datelist) - first_forecast_pos, 0)       # 0 when the history is too short

# Build all three date columns at once from slices instead of filling row by row.
TenyrAvg = pd.DataFrame({
    "StartDate": datelist[START_OFFSET:START_OFFSET + n_windows],
    "EndDate": datelist[first_forecast_pos - 1:first_forecast_pos - 1 + n_windows],
    "ForecastDate": datelist[first_forecast_pos:first_forecast_pos + n_windows],
})
# Q3Table keeps only the window dates; TenyrAvg additionally gets one (still empty)
# column for the intercept and for every explanatory variable of data3.
Q3Table = TenyrAvg.copy()
TenyrAvg = TenyrAvg.reindex(columns = ["StartDate", "EndDate", "ForecastDate", "Constant"] + data3.columns[3:].tolist()).copy()

In [278]:
# For every window (row of TenyrAvg), average the per-date coefficients in
# data3_result1 whose date lies between StartDate and EndDate (both inclusive)
# and store the averages in TenyrAvg's columns from position 3 onwards.
# The trailing [:-2] drops the last two column means before the assignment
# (presumably "Adjusted R2" and "No. of Firms" - confirm against data3_result1).
for i in range(len(TenyrAvg)):
    TenyrAvg.iloc[i,3:] = data3_result1[(data3_result1.index>=TenyrAvg["StartDate"][i]) &
                                        (data3_result1.index<=TenyrAvg["EndDate"][i])].reset_index(drop=True).mean(axis = 0)[:-2]

In [279]:
# Display the window dates together with the averaged coefficients
TenyrAvg

Unnamed: 0,StartDate,EndDate,ForecastDate,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1
0,1970-09-30,1980-08-31,1980-09-30,0.016217,-0.001174,0.005076,0.014545,-0.000803,-2.689788e-05,0.030217,0.000973,-0.028925,-0.001453,-0.001216,-8.442971e-08,0.001960,0.000107
1,1970-10-31,1980-09-30,1980-10-31,0.016011,-0.001133,0.004941,0.015688,-0.001095,-2.472386e-05,0.027862,-0.000455,-0.022179,-0.001177,0.000126,-8.227222e-08,0.001953,0.000043
2,1970-11-30,1980-10-31,1980-11-30,0.015419,-0.001056,0.005063,0.015687,-0.000740,-2.528699e-05,0.028930,-0.000762,-0.022373,-0.001333,0.000968,-6.832067e-08,0.001873,0.000165
3,1970-12-31,1980-11-30,1980-12-31,0.016353,-0.001070,0.005141,0.016000,-0.000905,-2.546323e-05,0.027919,-0.001085,-0.035892,-0.001493,0.002118,-6.125942e-08,0.001726,0.000151
4,1971-01-31,1980-12-31,1981-01-31,0.016146,-0.001192,0.004776,0.016369,-0.000501,-3.171662e-05,0.029164,-0.001767,-0.041940,-0.001807,0.003274,-6.379363e-08,0.001563,0.000181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,2010-11-30,2020-10-31,2020-11-30,0.011420,-0.000299,-0.001920,0.002649,-0.003939,7.466593e-07,0.000882,-0.004506,-0.058342,-0.000212,0.002626,-9.458982e-09,-0.001420,0.000805
483,2010-12-31,2020-11-30,2020-12-31,0.012733,-0.000351,-0.001768,0.001887,-0.003778,6.589751e-07,0.000121,-0.004398,-0.057239,-0.000275,0.002329,-7.849098e-09,-0.001216,0.000848
484,2011-01-31,2020-12-31,2021-01-31,0.013630,-0.000474,-0.001818,0.002069,-0.004398,5.460520e-07,0.000391,-0.004464,-0.059251,0.000034,0.002982,-7.824780e-09,-0.001173,0.000837
485,2011-02-28,2021-01-31,2021-02-28,0.014580,-0.000644,-0.001858,0.002717,-0.004353,5.240780e-07,0.000080,-0.004310,-0.056271,0.000053,0.003338,-4.991091e-09,-0.001349,0.000846


In [280]:
# Restrict data3 to dates on or after the first forecast date, then create the
# report frame with a still-empty estimate column.
first_forecast_date = TenyrAvg["ForecastDate"][0]
in_forecast_range = data3['Date'] >= first_forecast_date
data3 = data3[in_forecast_range].reset_index(drop=True).copy()

data3_report = data3.copy()
data3_report["Return_Est"] = np.nan

In [281]:
# Estimate each row's return as  Constant + sum(characteristic * average slope),
# using the TenyrAvg row whose ForecastDate equals the row's Date.
foredatelist = TenyrAvg['ForecastDate'].tolist()
predictors = data3.columns[3:].tolist()

# Average coefficients keyed by forecast date. Selecting the columns by name
# keeps slopes and characteristics matched by label instead of by position.
coef_by_date = TenyrAvg[["Constant"] + predictors].astype(float)
coef_by_date.index = pd.DatetimeIndex(TenyrAvg["ForecastDate"])

# One pass over the dates of data3 instead of re-masking the full frames per date.
for fdate, rows in data3.groupby("Date", sort=False):
    if fdate not in coef_by_date.index:
        continue  # no window forecasts this date; Return_Est stays NaN
    coefs = coef_by_date.loc[fdate]
    # (n_rows, k) @ (k,) gives a 1-D vector with one estimate per row.
    Ret_Est = rows[predictors].to_numpy(dtype=float) @ coefs[predictors].to_numpy() + coefs["Constant"]
    # data3_report was copied from data3, so both share the same row labels.
    data3_report.loc[rows.index, "Return_Est"] = Ret_Est

In [282]:
# Display data3 together with the estimated return column
data3_report

Unnamed: 0,GVKEY,Date,Return,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1,Return_Est
0,1004,1980-09-30,-0.069307,3.403200,-0.008336,0.212759,0.666214,-1.103,0.049795,0.127634,0.032475,1.135378,0.340428,39003.009007,1.043409,3.947655,0.012570
1,1004,1980-10-31,-0.097872,3.354484,0.063490,0.307462,0.690142,-1.103,0.049795,0.127634,0.034894,1.214217,0.363538,40723.496164,1.095497,4.144727,0.014758
2,1004,1980-11-30,0.083333,3.264595,0.175968,0.310127,0.713549,-1.103,0.049795,0.127634,0.038095,1.152966,0.386127,42510.382218,1.198533,4.534554,0.016646
3,1004,1980-12-31,0.197802,3.366727,0.095925,0.169193,0.736458,-1.103,0.049795,0.127634,0.035165,1.293935,0.285765,44580.714483,1.082167,4.094295,0.013731
4,1004,1981-01-31,-0.084404,3.548000,-0.084563,-0.001728,0.738063,-1.103,0.049795,0.127634,0.029358,1.959914,0.177471,44969.910598,0.902752,3.415492,0.007345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190154,317264,2020-08-31,-0.011710,6.076221,0.661244,-0.154099,-0.077902,-115.174,-0.030312,-0.065911,0.000000,0.270834,-0.071939,252262.069994,1.598806,0.368673,0.004746
1190155,317264,2020-09-30,-0.050948,6.063467,0.673023,-0.178057,-0.078937,-115.174,-0.030312,-0.065911,0.000000,0.514577,-0.070355,252022.588874,1.619327,0.373405,0.005613
1190156,317264,2020-10-31,0.023720,6.010200,0.725315,-0.185330,-0.079973,-115.174,-0.030312,-0.065911,0.000000,0.519062,-0.067314,257688.600552,1.707922,0.393834,0.003944
1190157,317264,2020-11-30,0.332927,6.027104,0.701871,-0.356627,-0.086573,-115.174,-0.030312,-0.065911,0.000000,0.741259,-0.069822,252706.685796,1.679295,0.387233,0.003046


In [297]:
# Keep only the identifier, date, realized return and estimated return columns
Q3Mid = data3_report[['GVKEY', 'Date', 'Return', 'Return_Est']].copy()

In [298]:
# Difference between each row's estimated return and the previous row's realized return.
# NOTE(review): shift(1) moves values by row position and is not grouped by GVKEY,
# so the first row of a firm is compared with whatever row precedes it in Q3Mid - confirm this is intended.
Q3Mid['Return_Est'] - Q3Mid['Return'].shift(1)

0               NaN
1          0.084065
2          0.114518
3         -0.069602
4         -0.190457
             ...   
1190154   -0.098613
1190155    0.017323
1190156    0.054892
1190157   -0.020674
1190158   -0.327125
Length: 1190159, dtype: float64

In [299]:
# Display Q3Mid from positional row 700 onwards
Q3Mid.iloc[700:]

Unnamed: 0,GVKEY,Date,Return,Return_Est
700,1011,1995-04-30,-0.013793,-0.014497
701,1011,1995-05-31,-0.034965,-0.006354
702,1011,1995-06-30,0.057971,-0.005373
703,1012,1985-10-31,-0.076923,0.030801
704,1012,1985-11-30,0.083333,0.032331
...,...,...,...,...
1190154,317264,2020-08-31,-0.011710,0.004746
1190155,317264,2020-09-30,-0.050948,0.005613
1190156,317264,2020-10-31,0.023720,0.003944
1190157,317264,2020-11-30,0.332927,0.003046


In [146]:
# Display the window start, end and forecast dates
Q3Table

Unnamed: 0,StartDate,EndDate,ForecastDate
0,1970-09-30,1980-08-31,1980-09-30
1,1970-10-31,1980-09-30,1980-10-31
2,1970-11-30,1980-10-31,1980-11-30
3,1970-12-31,1980-11-30,1980-12-31
4,1971-01-31,1980-12-31,1981-01-31
...,...,...,...
482,2010-11-30,2020-10-31,2020-11-30
483,2010-12-31,2020-11-30,2020-12-31
484,2011-01-31,2020-12-31,2021-01-31
485,2011-02-28,2021-01-31,2021-02-28


In [127]:
# Display the per-date regression table of data3 once more for reference
data3_result1

Unnamed: 0,Constant,LogSize_-1,LogB/M_-1,"Return_-2,-12","LogIssues_-1,-36",Accruals_Yr-1,ROA_Yr-1,LogAG_Yr-1,"DY_-1,-12","LogReturn_-13,-36","LogIssues_-1,-12","Turnover_-1,-12",Debt/Price_Yr-1,Sales/Price_Yr-1,Adjusted R2,No. of Firms
2021-03-31,-0.054286,0.008575,0.008727,0.018048,-0.010645,0.000007,0.113564,0.035128,0.768936,0.019835,-0.050903,-0.0,0.050305,0.003557,0.155379,412.0
2021-02-28,0.13817,-0.00488,0.023783,-0.024811,0.002797,0.000002,0.060061,-0.018918,-0.700483,-0.014884,-0.057176,0.0,0.030091,0.004603,0.138348,471.0
2021-01-31,0.174601,-0.027149,-0.001908,0.063074,0.02356,-0.00002,-0.010223,-0.030902,0.187995,-0.015165,0.024132,0.0,-0.013502,0.001663,0.218096,589.0
2020-12-31,0.188331,-0.014367,-0.003838,-0.002568,-0.034911,-0.000009,-0.021544,-0.039454,-0.639728,0.013014,0.081276,0.0,0.002417,-0.001618,0.028965,1833.0
2020-11-30,0.208749,-0.010434,0.01692,-0.06548,0.044577,-0.000003,-0.04107,-0.024447,-0.194055,0.006846,-0.09933,0.0,0.028487,0.005928,0.096661,1816.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968-07-31,,,,,,,,,,,,,,,,1
1968-06-30,,,,,,,,,,,,,,,,1
1968-05-31,,,,,,,,,,,,,,,,1
1968-04-30,,,,,,,,,,,,,,,,1
