In [2]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from linearmodels import PanelOLS
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
from scipy import stats

In [None]:
# Preparation step (already executed once; kept for reference).

# Raw inputs: the monthly fund panel and the per-fund snapshot sheet.
df_org = pd.read_excel('FinalVersion.xlsx')
snapshot = pd.read_excel('fund_snapshot_lux.xlsx')

# Keep only the columns used later on; every 0 is recoded as a missing value.
cols = [
    'FundId',
    'Month',
    'Fund Size',
    'Fund Flow',
    'Fund Sustainability',
    'CAPM Performance',
    'Performance Three Factor',
    'Performance Four Factor',
]
df = df_org.loc[:, cols].replace({0: np.nan})

# Same treatment for the snapshot columns that get joined onto the panel.
cols_snapshot = ['FundId', 'Inception Date', 'Equity Style Box (Long)']
snap = snapshot.loc[:, cols_snapshot].replace({0: np.nan})

In [None]:
# Join the snapshot columns (inception date, style box) onto every monthly row
# of the matching fund.
joint = df.join(snap.set_index('FundId'), on='FundId')

# Fund age in (fractional) months as of the reference date.
# Fixes over the previous version:
#   * `pd.to_datetime(2019-8-1)` evaluated the integer expression 2019-8-1 == 2010
#     and produced a timestamp 2010 ns after the epoch, not 2019-08-01;
#   * the subtraction was reversed (inception - reference), giving negative ages;
#   * dividing by np.timedelta64(1, 'M') is rejected by recent pandas, so an
#     explicit average month length (365.2425 / 12 days) is used instead.
REFERENCE_DATE = pd.Timestamp('2019-08-01')
MONTH_LENGTH = pd.Timedelta(days=365.2425 / 12)
age = (REFERENCE_DATE - pd.to_datetime(joint['Inception Date'])) / MONTH_LENGTH
joint['Age'] = age

# Base-10 logarithm of fund size.
fund_size_log = np.log10(joint['Fund Size'])
joint['size_log'] = fund_size_log

#the joint is saved and the column name Fund Sustainability is changed because of the space
#joint.to_excel("joint_snap.xlsx") 

In [3]:
# Load the pre-joined panel from disk; presumably the file written by the
# commented-out `joint.to_excel("joint_snap.xlsx")` in the preparation step.
joint = pd.read_excel('joint_snap.xlsx')

In [4]:
# Parse the month column so it can take part in date arithmetic.
joint['Month'] = pd.to_datetime(joint['Month'])

# Fund age in whole months at each observation date (truncated toward zero).
# An explicit month length of 365.2425 / 12 days is used because dividing by
# np.timedelta64(1, 'M') is rejected by recent pandas versions; the value is the
# same average month length, so results are unchanged.
MONTH_LENGTH = pd.Timedelta(days=365.2425 / 12)
age = (joint['Month'] - pd.to_datetime(joint['Inception Date'])) / MONTH_LENGTH
age = np.trunc(age)
# Rows without an inception date stay NaN instead of raising in int();
# when nothing is missing the column is stored as plain integers as before.
if age.notna().all():
    age = age.astype('int64')
joint['Age'] = age

# Encode the 'Equity Style Box (Long)' label as a numeric code 1-9.
# The codes are only identifiers; labels not listed here become NaN.
esb_l = pd.DataFrame({
    'Equity Style Box (Long)': [
        'Large Value', 'Large Growth', 'Large Blend',
        'Mid Value', 'Mid Growth', 'Mid Blend',
        'Small Value', 'Small Growth', 'Small Blend',
    ],
    'esb_l': [1, 2, 3, 4, 5, 6, 7, 8, 9],
})
joint_new = joint.join(esb_l.set_index('Equity Style Box (Long)'), on='Equity Style Box (Long)')

# relative_flow is 'Fund Flow' divided by 'Fund Size' (flow / size).
flow = joint_new["Fund Flow"]
size = joint_new['Fund Size']
relative_flow = flow / size
joint_new['relative_flow'] = relative_flow

# Use a column name without a space for the sustainability score.
joint_new = joint_new.rename(columns={"Fund Sustainability": "esg_score"})

In [5]:
# Build 'esg_modified': for every row, the esg_score of the same fund in the
# following month (a one-period lead). The last observation of each fund has no
# successor and stays NaN.
#
# The previous version walked the frame with hard-coded sizes (3276 funds x 92
# rows each), which silently misaligns or raises as soon as the panel has a
# different shape. Grouping by FundId gives the same result on that layout and
# works for any number of funds / months.
esg_ex = joint_new['esg_score']

esg_mod = (
    joint_new
    .sort_values(['FundId', 'Month'])       # chronological order within each fund
    .groupby('FundId', sort=False)['esg_score']
    .shift(-1)                              # value of the next row of the same fund
)

# Assignment aligns on the row index, so the original row order is preserved.
joint_new['esg_modified'] = esg_mod

In [None]:
# Variance inflation factors for the three regressors (complete cases only).
X = joint_new[['esg_score','Age','size_log']].dropna()

# variance_inflation_factor regresses each column on all the others exactly as
# given. Without an intercept column those auxiliary regressions go through the
# origin and the resulting (uncentred) VIFs are inflated, so a constant is added
# to the design matrix and only the real regressors are reported.
X_design = sm.add_constant(X, has_constant='add')  # column 0 is the constant

vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X_design.values, i)
        for i in range(1, X_design.shape[1])       # skip the constant
    ],
    "features": list(X.columns),
})

vif

In [None]:
# Residual diagnostics with a train/test split: Ridge regression of
# capm_Performance on esg_score, Age and size_log.

# Complete cases only. relative_flow takes part in the NaN filter even though
# it is not used as a regressor below.
data = joint_new[['esg_score','Age','size_log','capm_Performance', 'relative_flow']].dropna()

X = data.loc[:, ['esg_score', 'Age', 'size_log']]
y = data.loc[:, 'capm_Performance']

# Hold out 2 % of the rows for scoring; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.02,
    random_state=42,
)

model = Ridge()
visualizer = ResidualsPlot(model)

# Fit on the training rows, score on the held-out rows, then render the figure.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

#visualizer.show(outpath="Residual.pdf")

In [6]:
# Categorical copy of the month column (per the original note: 2012/01 to
# 2019/08, 92 months in total).
month = pd.Categorical(joint_new['Month'])

# Index the panel by (FundId, Month) and keep the categorical month as a
# regular column as well.
df = joint_new.set_index(['FundId', 'Month']).assign(Month=month)

In [7]:
# Display the (FundId, Month)-indexed panel.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Fund Size,Fund Flow,esg_score,capm_Performance,performance_Three_Factor,performance_Four_Factor,Inception Date,Equity Style Box (Long),Age,size_log,esb_l,relative_flow,esg_modified,Month
FundId,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
FS00008SZD,2012-01-01,2.609599e+08,4.379443e+07,,1.507391,3.652442,1.841484,2009-01-28,Large Growth,35,8.416574,2.0,0.167821,,2012-01-01
FS00008SZD,2012-02-01,2.988403e+08,2.572761e+07,,1.544497,1.220171,1.155288,2009-01-28,Large Growth,36,8.475439,2.0,0.086092,,2012-02-01
FS00008SZD,2012-03-01,3.071028e+08,1.558167e+06,,4.025839,0.078026,0.963080,2009-01-28,Large Growth,37,8.487284,2.0,0.005074,,2012-03-01
FS00008SZD,2012-04-01,3.440745e+08,3.753147e+07,,2.647697,0.989935,1.171659,2009-01-28,Large Growth,38,8.536653,2.0,0.109079,,2012-04-01
FS00008SZD,2012-05-01,3.771187e+08,4.877692e+07,,-0.066138,-0.807304,-0.354775,2009-01-28,Large Growth,39,8.576478,2.0,0.129341,,2012-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FSUSA09WNF,2019-04-01,8.137770e+06,-1.487095e+03,,-3.067829,-3.546688,-4.150616,2008-11-11,Large Blend,124,6.910505,3.0,-0.000183,,2019-04-01
FSUSA09WNF,2019-05-01,7.979041e+06,8.084123e+02,,3.666930,4.017213,5.035920,2008-11-11,Large Blend,125,6.901951,3.0,0.000101,40.91,2019-05-01
FSUSA09WNF,2019-06-01,8.651099e+06,2.060187e+03,40.91,4.207674,3.772864,3.199166,2008-11-11,Large Blend,126,6.937071,3.0,0.000238,,2019-06-01
FSUSA09WNF,2019-07-01,8.737163e+06,-2.267210e+03,,1.134835,4.536452,4.007588,2008-11-11,Large Blend,127,6.941370,3.0,-0.000259,,2019-07-01


In [8]:
# Two-way fixed-effects regression (entity and time effects) of
# capm_Performance on the contemporaneous ESG score, log size and age.
exog_vars = ['esg_score', 'size_log', 'Age']
exog = sm.add_constant(df.loc[:, exog_vars])

mod = PanelOLS(
    dependent=df['capm_Performance'],
    exog=exog,
    entity_effects=True,
    time_effects=True,
)

# Default (unadjusted) covariance estimator.
res = mod.fit()

print(res)
#res2 = mod.fit(cov_type='clustered', cluster_entity=True)

  return ptp(axis=axis, out=out, **kwargs)
Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:       capm_Performance   R-squared:                        0.0002
Estimator:                   PanelOLS   R-squared (Between):             -0.7186
No. Observations:              155436   R-squared (Within):              -0.0020
Date:                Mon, Apr 06 2020   R-squared (Overall):             -0.0357
Time:                        00:19:46   Log-likelihood                 -3.51e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      9.2051
Entities:                        3262   P-value                           0.0000
Avg Obs:                       47.651   Distribution:                F(3,152081)
Min Obs:                       1.0000                                           
Max Obs:                       91.000   F-statistic (robust):             9.2051
                            

In [9]:
# Same two-way fixed-effects specification, but with esg_modified (the ESG
# score shifted by one month) in place of esg_score.
exog_vars = ['esg_modified', 'size_log', 'Age']
exog = sm.add_constant(df.loc[:, exog_vars])

mod_ex = PanelOLS(
    dependent=df['capm_Performance'],
    exog=exog,
    entity_effects=True,
    time_effects=True,
)

res_ex = mod_ex.fit()

print(res_ex)

                          PanelOLS Estimation Summary                           
Dep. Variable:       capm_Performance   R-squared:                        0.0001
Estimator:                   PanelOLS   R-squared (Between):             -1.4760
No. Observations:              154833   R-squared (Within):              -0.0077
Date:                Mon, Apr 06 2020   R-squared (Overall):             -0.0909
Time:                        00:19:49   Log-likelihood                  -3.5e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      6.5230
Entities:                        3252   P-value                           0.0002
Avg Obs:                       47.612   Distribution:                F(3,151488)
Min Obs:                       1.0000                                           
Max Obs:                       91.000   F-statistic (robust):             6.5230
                            

In [None]:
# Residual plot: x-axis is the value reconstructed from the estimated
# coefficients, y-axis is the corresponding residual.
#
# Coefficients are looked up by label instead of by position: integer keys on a
# label-indexed Series (res.params[0], res.params[1], ...) rely on a deprecated
# positional fallback in pandas and break if the regressor order ever changes.
# NOTE(review): this reconstruction uses only the constant and the slope
# coefficients; the estimated entity/time effects of the panel model are not
# added back, so these are not the model's own residuals -- confirm that this
# is intended.
coef = res.params
y_value = (
    coef['const']
    + coef['esg_score'] * data['esg_score']
    + coef['size_log'] * data['size_log']
    + coef['Age'] * data['Age']
)
residual = data['capm_Performance'] - y_value

fig, ax = plt.subplots()
ax.plot(y_value, residual, ".b")
ax.grid(True)
ax.set_title("Residual Plot CAPM Performance")
ax.set_xlabel("Predicted Value")
ax.set_ylabel("Residual")
#plt.pause(.5)
fig.savefig('ResPlot_CAPM_Performance.pdf')

In [None]:
# Pearson correlation coefficient between esg_score and capm_Performance in
# `data`, together with the p-value for testing non-correlation.
stats.pearsonr(data['esg_score'], data['capm_Performance'])

In [None]:
# Pearson correlation (coefficient, p-value) between esg_score and size_log in `data`.
stats.pearsonr(data['esg_score'], data['size_log'])

In [None]:
# Pearson correlation (coefficient, p-value) between esg_score and Age in `data`.
stats.pearsonr(data['esg_score'], data['Age'])

In [None]:
# Pearson correlation (coefficient, p-value) between size_log and capm_Performance in `data`.
stats.pearsonr(data['size_log'], data['capm_Performance'])

In [None]:
# Pearson correlation (coefficient, p-value) between Age and capm_Performance in `data`.
stats.pearsonr(data['Age'], data['capm_Performance'])

In [None]:
# Pearson correlation (coefficient, p-value) between Age and size_log in `data`.
stats.pearsonr(data['Age'], data['size_log'])

In [None]:
# Scatter of capm_Performance against the ESG score for rows 1000-1999 of the
# panel, with a straight reference line drawn on top.
#
# Fixes over the previous version:
#   * the column 'Fund Sustainability' was renamed to 'esg_score' earlier in the
#     notebook, so df['Fund Sustainability'] raised a KeyError;
#   * title and axis labels described a different analysis ("beta_value",
#     "Factor Loading ...") and now name what is actually plotted;
#   * a straight line only needs its two end points, not 3276*92 samples.
SLOPE = 0.02        # hard-coded; presumably taken from an earlier fit -- TODO confirm
INTERCEPT = -0.4699  # hard-coded; presumably taken from an earlier fit -- TODO confirm

x2 = np.array([38.0, 60.0])
y2 = SLOPE * x2 + INTERCEPT

subset = df.iloc[1000:2000]   # same positional slice as before

plt.cla()
plt.plot(subset['esg_score'], subset['capm_Performance'], ".r")
plt.plot(x2, y2, "-b")
plt.grid(True)
plt.title("CAPM Performance vs. ESG Score (rows 1000-1999)")
plt.xlabel("ESG score")
plt.ylabel("CAPM performance")
plt.show()


In [None]:
# Example: entity fixed-effects regressions on the nlswork.dta data set.
# NOTE: this cell rebinds `data`, `exog_vars`, `exog`, `mod` and `res`, which
# the fund-analysis cells also use; re-run those cells afterwards if needed.
nls_raw = pd.read_stata('nlswork.dta')
year = pd.Categorical(nls_raw['year'])

# Panel index is (idcode, year); the categorical year is kept as a column too.
data = nls_raw.set_index(['idcode', 'year']).assign(year=year)

# Specification 1: age, tenure and south with entity effects.
exog_vars = ['age', 'tenure', 'south']
exog = sm.add_constant(data.loc[:, exog_vars])
mod = PanelOLS(data['ln_wage'], exog, entity_effects=True)

res = mod.fit()                                              # default covariance
res2 = mod.fit(cov_type='clustered', cluster_entity=True)    # clustered by entity

# Specification 2: additionally include the categorical year column.
exog_vars2 = exog_vars + ['year']
exog2 = sm.add_constant(data.loc[:, exog_vars2])
mod2 = PanelOLS(data['ln_wage'], exog2, entity_effects=True)

res3 = mod2.fit(cov_type='clustered', cluster_entity=True)