In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from linearmodels import PanelOLS
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
#Preparation, already finished

df_org = pd.read_excel('FinalVersion.xlsx')
snapshot = pd.read_excel('fund_snapshot_lux.xlsx')

cols = ['FundId', 'Month', 'Fund Size', 'Fund Flow', 'Fund Sustainability', 'CAPM Performance', 'Performance Three Factor', 'Performance Four Factor']
df = df_org[cols].replace({0:np.nan})

cols_snapshot = ['FundId', 'Inception Date', 'Equity Style Box (Long)']
snap = snapshot[cols_snapshot].replace({0:np.nan})

In [None]:
#snap join into the summary, the ages are calculated according to the inception date
joint = df.join(snap.set_index('FundId'), on='FundId')
age = (joint['Inception Date'] - pd.to_datetime(2019-8-1))/np.timedelta64(1,'M')
joint['Age'] = age

fund_size_log = np.log10(joint['Fund Size'])
joint['size_log'] = fund_size_log

#the joint is saved and the column name Fund Sustainability is changed because of the space
#joint.to_excel("joint_snap.xlsx") 

In [8]:
joint = pd.read_excel('joint_snap.xlsx')

In [3]:
#convert the month column into correct form
joint['Month'] = pd.to_datetime(joint['Month'])
age = (joint['Month'] - joint['Inception Date'])/np.timedelta64(1,'M')
age = [int(i) for i in age]
joint['Age'] = age
#joint.dropna(subset=["esg_score"], inplace=True)

#convert the esb_l from string to float
esb_l = pd.DataFrame({'Equity Style Box (Long)':['Large Value', 'Large Growth', 'Large Blend', 'Mid Value', 'Mid Growth', 'Mid Blend','Small Value','Small Growth','Small Blend'], 'esb_l': [1, 2, 3, 4, 5, 6, 7, 8, 9]})
joint_new = joint.join(esb_l.set_index('Equity Style Box (Long)'), on='Equity Style Box (Long)')

#relative_flow is the size divided by flow value
flow = joint_new["Fund Flow"]
size = joint_new['Fund Size']
relative_flow = flow/size
joint_new['relative_flow'] = relative_flow

joint_new = joint_new.rename(columns={"Fund Sustainability": "esg_score"})

In [39]:
X = joint_new[['esg_score','Age','size_log', 'relative_flow']].dropna()

In [41]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [42]:
vif

Unnamed: 0,VIF Factor,features
0,58.217609,esg_score
1,3.061453,Age
2,62.285585,size_log
3,1.002377,relative_flow


In [None]:
#catagorize the month - 2012/01 to 2019/08, totoally 92 months
month = pd.Categorical(joint_new.Month)

#use the 'Month' to divide the dataset, the index is Fund ID
df = joint_new.set_index(['FundId', 'Month'])
df['Month'] = month

In [None]:
#xtreg In_wage age 
exog_vars = ['esg_score','size_log', 'Age', 'relative_flow']
exog = sm.add_constant(df[exog_vars])

In [None]:
exog

In [None]:
mod = PanelOLS(df.capm_Performance, exog, entity_effects=True, time_effects=True)
mod

In [None]:
res = mod.fit()
res2 = mod.fit(cov_type='clustered', cluster_entity=True)

In [None]:
x2 = np.linspace(38, 60, 3276*92)
y2 = [0.02*ix - 0.4699 for ix in x2]
plt.cla()
plt.plot(df['Fund Sustainability'][1000:2000], df['capm_Performance'][1000:2000], ".r")
plt.plot(x2, y2, "-b")
plt.grid(True)
plt.title("Factor Loading First Month: 2012-01")
plt.xlabel("beta_value")
plt.ylabel("Fund Return of All Month")
plt.pause(.5)


In [None]:
#example
data = pd.read_stata('nlswork.dta')
year = pd.Categorical(data.year)
data = data.set_index(['idcode','year'])
data['year']=year

exog_vars = ['age','tenure','south']
exog = sm.add_constant(data[exog_vars])

mod = PanelOLS(data.ln_wage, exog, entity_effects=True)

res = mod.fit()

res2 = mod.fit(cov_type='clustered', cluster_entity=True)

exog_vars2 = ['age','tenure','south', 'year']
exog2 = sm.add_constant(data[exog_vars2])

mod2 = PanelOLS(data.ln_wage, exog2, entity_effects=True)

res3 = mod2.fit(cov_type='clustered', cluster_entity=True)