In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from linearmodels import PanelOLS
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
from scipy import stats

In [None]:
#Preparation, already finished

df_org = pd.read_excel('FinalVersion.xlsx')
snapshot = pd.read_excel('fund_snapshot_lux.xlsx')

cols = ['FundId', 'Month', 'Fund Size', 'Fund Flow', 'Fund Sustainability', 'CAPM Performance', 'Performance Three Factor', 'Performance Four Factor']
df = df_org[cols].replace({0:np.nan})

cols_snapshot = ['FundId', 'Inception Date', 'Equity Style Box (Long)']
snap = snapshot[cols_snapshot].replace({0:np.nan})

In [None]:
#snap join into the summary, the ages are calculated according to the inception date
joint = df.join(snap.set_index('FundId'), on='FundId')
age = (joint['Inception Date'] - pd.to_datetime(2019-8-1))/np.timedelta64(1,'M')
joint['Age'] = age

fund_size_log = np.log10(joint['Fund Size'])
joint['size_log'] = fund_size_log

#the joint is saved and the column name Fund Sustainability is changed because of the space
#joint.to_excel("joint_snap.xlsx") 

In [2]:
joint = pd.read_excel('joint_snap.xlsx')

In [3]:
#convert the month column into correct form
joint['Month'] = pd.to_datetime(joint['Month'])
age = (joint['Month'] - joint['Inception Date'])/np.timedelta64(1,'M')
age = [int(i) for i in age]
joint['Age'] = age
#joint.dropna(subset=["esg_score"], inplace=True)

#multiple categories to dummy variable
dummies = pd.get_dummies(joint['Equity Style Box (Long)'])
joint_new = joint.join(dummies)

#relative_flow is the size divided by flow value
flow = joint_new["Fund Flow"]
size = joint_new['Fund Size']
relative_flow = flow/size
joint_new['relative_flow'] = relative_flow

joint_new = joint_new.rename(columns={"Fund Sustainability": "esg_score"})

#build a new column, in which the esg_score is the value of next month
esg_ex = joint_new['esg_score']

esg_mod = [np.nan]*len(esg_ex)

for i in range(3276):
    for j in range(0,91):
         esg_mod[92*i+j+1] = esg_ex[92*i+j]
            
joint_new['esg_modified'] = esg_mod

In [8]:
joint_new

Unnamed: 0,FundId,Month,Fund Size,Fund Flow,esg_score,capm_Performance,performance_Three_Factor,performance_Four_Factor,Inception Date,Equity Style Box (Long),...,Large Growth,Large Value,Mid Blend,Mid Growth,Mid Value,Small Blend,Small Growth,Small Value,relative_flow,esg_modified
0,FS00008SZD,2012-01-01,2.609599e+08,4.379443e+07,,1.507391,3.652442,1.841484,2009-01-28,Large Growth,...,1,0,0,0,0,0,0,0,0.167821,
1,FS00008SZD,2012-02-01,2.988403e+08,2.572761e+07,,1.544497,1.220171,1.155288,2009-01-28,Large Growth,...,1,0,0,0,0,0,0,0,0.086092,
2,FS00008SZD,2012-03-01,3.071028e+08,1.558167e+06,,4.025839,0.078026,0.963080,2009-01-28,Large Growth,...,1,0,0,0,0,0,0,0,0.005074,
3,FS00008SZD,2012-04-01,3.440745e+08,3.753147e+07,,2.647697,0.989935,1.171659,2009-01-28,Large Growth,...,1,0,0,0,0,0,0,0,0.109079,
4,FS00008SZD,2012-05-01,3.771187e+08,4.877692e+07,,-0.066138,-0.807304,-0.354775,2009-01-28,Large Growth,...,1,0,0,0,0,0,0,0,0.129341,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301387,FSUSA09WNF,2019-04-01,8.137770e+06,-1.487095e+03,,-3.067829,-3.546688,-4.150616,2008-11-11,Large Blend,...,0,0,0,0,0,0,0,0,-0.000183,41.12
301388,FSUSA09WNF,2019-05-01,7.979041e+06,8.084123e+02,,3.666930,4.017213,5.035920,2008-11-11,Large Blend,...,0,0,0,0,0,0,0,0,0.000101,
301389,FSUSA09WNF,2019-06-01,8.651099e+06,2.060187e+03,40.91,4.207674,3.772864,3.199166,2008-11-11,Large Blend,...,0,0,0,0,0,0,0,0,0.000238,
301390,FSUSA09WNF,2019-07-01,8.737163e+06,-2.267210e+03,,1.134835,4.536452,4.007588,2008-11-11,Large Blend,...,0,0,0,0,0,0,0,0,-0.000259,40.91


In [None]:
#calculate the vif
X = joint_new[['esg_score','Age','size_log']].dropna()

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

vif

In [None]:
#residual with train and data set
data = joint_new[['esg_score','Age','size_log','capm_Performance', 'relative_flow']].dropna()

X = data[['esg_score','Age','size_log']]
y = data['capm_Performance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)
model = Ridge()

visualizer = ResidualsPlot(model)


visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
#visualizer.score(X_test, y_test)  # Evaluate the model on the test data
 
#visualizer.show()                 # Finalize and render the figure

#visualizer.show(outpath="Residual.pdf")

In [4]:
#catagorize the month - 2012/01 to 2019/08, totoally 92 months
month = pd.Categorical(joint_new.Month)

#use the 'Month' to divide the dataset, the index is Fund ID
df = joint_new.set_index(['FundId', 'Month'])
df['Month'] = month

In [5]:
#capm Performance summary
exog_vars = ['esg_score']
exog = sm.add_constant(df[exog_vars])
#ex represents t-1: last month
#exog_vars_ex = ['esg_modified','size_log', 'Age','Large Value', 'Large Growth', 'Large Blend', 'Mid Value', 'Mid Growth', 'Mid Blend','Small Value','Small Growth','Small Blend']
#exog_ex = sm.add_constant(df[exog_vars_ex])

mod = PanelOLS(df.capm_Performance, exog, time_effects=True)
#mod_ex = PanelOLS(df.capm_Performance, exog_ex, time_effects=True)

res = mod.fit()
#res_ex = mod_ex.fit()

'''
#Performance three factor summary
exog_vars_tf = ['esg_score','size_log', 'Age']
exog_tf = sm.add_constant(df[exog_vars_tf])
#ex represents t-1: last month
exog_vars_ex_tf = ['esg_modified','size_log', 'Age']
exog_ex_tf = sm.add_constant(df[exog_vars_ex_tf])

mod_tf = PanelOLS(df.performance_Three_Factor, exog_tf, entity_effects=True, time_effects=True)
mod_ex_tf = PanelOLS(df.performance_Three_Factor, exog_ex_tf, entity_effects=True, time_effects=True)

res_tf = mod_tf.fit()
res_ex_tf = mod_ex_tf.fit()


#Performance four factor summary
exog_vars_ff = ['esg_score','size_log', 'Age']
exog_ff = sm.add_constant(df[exog_vars_ff])
#ex represents t-1: last month
exog_vars_ex_ff = ['esg_modified','size_log', 'Age']
exog_ex_ff = sm.add_constant(df[exog_vars_ex_ff])

mod_ff = PanelOLS(df.performance_Four_Factor, exog_ff, entity_effects=True, time_effects=True)
mod_ex_ff = PanelOLS(df.performance_Four_Factor, exog_ex_ff, entity_effects=True, time_effects=True)

res_ff = mod_ff.fit()
res_ex_ff = mod_ex_ff.fit()
'''

  return ptp(axis=axis, out=out, **kwargs)
Inputs contain missing values. Dropping rows with missing observations.


"\n#Performance three factor summary\nexog_vars_tf = ['esg_score','size_log', 'Age']\nexog_tf = sm.add_constant(df[exog_vars_tf])\n#ex represents t-1: last month\nexog_vars_ex_tf = ['esg_modified','size_log', 'Age']\nexog_ex_tf = sm.add_constant(df[exog_vars_ex_tf])\n\nmod_tf = PanelOLS(df.performance_Three_Factor, exog_tf, entity_effects=True, time_effects=True)\nmod_ex_tf = PanelOLS(df.performance_Three_Factor, exog_ex_tf, entity_effects=True, time_effects=True)\n\nres_tf = mod_tf.fit()\nres_ex_tf = mod_ex_tf.fit()\n\n\n#Performance four factor summary\nexog_vars_ff = ['esg_score','size_log', 'Age']\nexog_ff = sm.add_constant(df[exog_vars_ff])\n#ex represents t-1: last month\nexog_vars_ex_ff = ['esg_modified','size_log', 'Age']\nexog_ex_ff = sm.add_constant(df[exog_vars_ex_ff])\n\nmod_ff = PanelOLS(df.performance_Four_Factor, exog_ff, entity_effects=True, time_effects=True)\nmod_ex_ff = PanelOLS(df.performance_Four_Factor, exog_ex_ff, entity_effects=True, time_effects=True)\n\nres_ff

In [6]:
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:       capm_Performance   R-squared:                     3.627e-05
Estimator:                   PanelOLS   R-squared (Between):             -0.0052
No. Observations:              159496   R-squared (Within):               0.0001
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.0001
Time:                        13:21:48   Log-likelihood                -3.619e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      5.7812
Entities:                        3271   P-value                           0.0162
Avg Obs:                       48.761   Distribution:                F(1,159404)
Min Obs:                       1.0000                                           
Max Obs:                       91.000   F-statistic (robust):             5.7812
                            

In [None]:
print(res_ex_tf)

In [None]:
print(res_tf)

In [None]:
print(res_ex_ff)

In [None]:
print(res_ff)

In [None]:
#draw the plot of residual: x-axis is reconstructed y, y-axis is corresponfing residual
residual = data['capm_Performance'] - res.params[1]*data['esg_score'] - res.params[2]*data['size_log'] - res.params[3]*data['Age'] - res.params[0]
y_value = res.params[1]*data['esg_score'] + res.params[2]*data['size_log'] + res.params[3]*data['Age'] + res.params[0]
plt.cla()
plt.plot(y_value, residual, ".b")
plt.grid(True)
plt.title("Residual Plot CAPM Performance")
plt.xlabel("Predicted Value")
plt.ylabel("Residual")
#plt.pause(.5)
plt.savefig('ResPlot_CAPM_Performance.pdf')

In [None]:
#Pearson correlation coefficient and p-value for testing non-correlation.
stats.pearsonr(data['esg_score'], data['capm_Performance'])
stats.pearsonr(data['esg_score'], data['size_log'])
stats.pearsonr(data['esg_score'], data['Age'])
stats.pearsonr(data['size_log'], data['capm_Performance'])
stats.pearsonr(data['Age'], data['capm_Performance'])
stats.pearsonr(data['Age'], data['size_log'])

In [None]:
#Robustness test

#capm Performance summary
exog_vars_rob = ['esg_score','size_log', 'Age', 'relative_flow']
exog_rob = sm.add_constant(df[exog_vars_rob])

mod_rob = PanelOLS(df.capm_Performance, exog_rob, entity_effects=True, time_effects=True)

res_rob = mod_rob.fit()

In [None]:
print(res_rob)

In [None]:
#Robustness test

#Performance Three Factor summary
exog_vars_rob = ['esg_score','size_log', 'Age', 'relative_flow']
exog_rob = sm.add_constant(df[exog_vars_rob])

mod_tf_rob = PanelOLS(df.performance_Three_Factor, exog_rob, entity_effects=True, time_effects=True)

res_tf_rob = mod_tf_rob.fit()

In [None]:
print(res_tf_rob) 

In [None]:
#Robustness test

#Performance Four Factor summary
exog_vars_rob = ['esg_score','size_log', 'Age', 'relative_flow']
exog_rob = sm.add_constant(df[exog_vars_rob])

mod_ff_rob = PanelOLS(df.performance_Four_Factor, exog_rob, entity_effects=True, time_effects=True)

res_ff_rob = mod_tf_rob.fit()

In [None]:
print(res_ff_rob) 