In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from linearmodels import PanelOLS
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
from scipy import stats

In [2]:
# Raw fund-month snapshot; every later cell derives its frames from `joint`.
SNAPSHOT_FILE = 'joint_snap.xlsx'
joint = pd.read_excel(SNAPSHOT_FILE)

In [3]:
# Parse the observation month so it can be subtracted from the inception date.
# NOTE(review): assumes 'Inception Date' is already a datetime column — confirm.
joint['Month'] = pd.to_datetime(joint['Month'])

# Fund age in whole months (truncated toward zero).
# Current pandas rejects np.timedelta64(1, 'M') as a divisor because a month is
# not a fixed-length unit, so the mean Gregorian month (365.2425 / 12 days) is
# spelled out explicitly.
# np.trunc keeps missing dates as NaN instead of crashing like int(NaN) does;
# as a consequence 'Age' is stored as float holding whole-number values.
AVG_MONTH = pd.Timedelta(days=30.436875)
age = np.trunc((joint['Month'] - joint['Inception Date']) / AVG_MONTH)
joint['Age'] = age

# Encode the equity style box label as an integer code (1-9); labels that are
# not in the lookup table end up as NaN after the join.
esb_l = pd.DataFrame({
    'Equity Style Box (Long)': [
        'Large Value', 'Large Growth', 'Large Blend',
        'Mid Value', 'Mid Growth', 'Mid Blend',
        'Small Value', 'Small Growth', 'Small Blend',
    ],
    'esb_l': [1, 2, 3, 4, 5, 6, 7, 8, 9],
})
joint_new = joint.join(esb_l.set_index('Equity Style Box (Long)'), on='Equity Style Box (Long)')

# relative_flow is the fund flow divided by the fund size.
flow = joint_new["Fund Flow"]
size = joint_new['Fund Size']
relative_flow = flow / size
joint_new['relative_flow'] = relative_flow

joint_new = joint_new.rename(columns={"Fund Sustainability": "esg_score"})

In [4]:
# esg_delta = this month's ESG score minus the same fund's previous observation.
# The lag is taken per FundId instead of assuming a fixed 3276 x 92 row layout,
# so the cell no longer fails or silently misaligns when the number of funds or
# months changes, or when the rows are not pre-sorted.
esg_ex = joint_new['esg_score']

# Order by month inside each fund, shift by one observation, then restore the
# original row order. The first observation of every fund has no predecessor
# and stays NaN.
# NOTE(review): relies on joint_new having a unique row index — confirm.
esg_mod = (
    joint_new.sort_values(['FundId', 'Month'], kind='stable')
    .groupby('FundId', sort=False)['esg_score']
    .shift(1)
    .reindex(joint_new.index)
)

esg_delta = esg_ex - esg_mod

joint_new['esg_delta'] = esg_delta

In [5]:
# performance_delta = this month's CAPM performance minus the same fund's
# previous observation.
# The lag is taken per FundId instead of assuming a fixed 3276 x 92 row layout,
# so the cell no longer fails or silently misaligns when the number of funds or
# months changes, or when the rows are not pre-sorted.
perf_ex = joint_new['capm_Performance']

# Order by month inside each fund, shift by one observation, then restore the
# original row order. The first observation of every fund has no predecessor
# and stays NaN.
# NOTE(review): relies on joint_new having a unique row index — confirm.
perf_mod = (
    joint_new.sort_values(['FundId', 'Month'], kind='stable')
    .groupby('FundId', sort=False)['capm_Performance']
    .shift(1)
    .reindex(joint_new.index)
)

perf_delta = perf_ex - perf_mod

joint_new['performance_delta'] = perf_delta

In [6]:
# Categorical copy of the observation month (per the original note the sample
# runs 2012/01 to 2019/08, 92 months in total).
month = pd.Categorical(joint_new['Month'])

# Panel layout: rows are indexed by (FundId, Month). set_index moves 'Month'
# out of the columns, so the categorical version is attached again as a column.
df = joint_new.set_index(['FundId', 'Month']).assign(Month=month)

In [11]:
# Panel regression of the change in CAPM performance on the change in ESG
# score plus controls, with both entity and time fixed effects.
# NOTE(review): 'size_log' is not created in any cell shown here — confirm it
# exists in df before running this on a fresh kernel.
exog_vars = ['esg_delta', 'size_log', 'Age']
exog = sm.add_constant(df.loc[:, exog_vars])

mod = PanelOLS(
    dependent=df['performance_delta'],
    exog=exog,
    entity_effects=True,
    time_effects=True,
)

res = mod.fit()


In [12]:
# Plain-text estimation summary of the fitted panel model.
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:      performance_delta   R-squared:                        0.0002
Estimator:                   PanelOLS   R-squared (Between):             -22.258
No. Observations:              144307   R-squared (Within):              -0.0187
Date:                Mon, Apr 06 2020   R-squared (Overall):             -0.4235
Time:                        21:37:11   Log-likelihood                 -3.78e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      9.8051
Entities:                        2929   P-value                           0.0000
Avg Obs:                       49.268   Distribution:                F(3,141286)
Min Obs:                       1.0000                                           
Max Obs:                       90.000   F-statistic (robust):             9.8051
                            