# Generating RRI Scores & Fundamental Data

2024-09-21

This notebook calculates the RRI score based on incident data provided from Mark Thompson

In [1]:
import pandas as pd
import numpy as np
import csv
import ujson 
import json
from tqdm.notebook import tqdm

In [None]:
# ISIN to RR ID
rr_isin = pd.read_csv('data/REPRISK_ISINs.csv', quoting=csv.QUOTE_NONE)
rr_isin.columns = ['rr', 'ISIN']
rr_isin['ISIN'] = rr_isin['ISIN'].str.replace('\"', '')
map_isin = rr_isin.set_index('ISIN')['rr'].to_dict()

# This code was taken from the RepRisk website at https://www.reprisk.com/lab/jn/esg-score.html

# Incident Data
incident_data = pd.read_csv('data/REPRISK_EVENTS.csv', quoting=csv.QUOTE_NONE)
incident_data.columns = ['date', 'ID_incident', 'ID_RR', 'reach', 'severity', 'unsharp', 'novelty', 
                         'environment', 'social', 'government', 'crosscutting']

incident_data['date'] = pd.to_datetime(incident_data['date'])
incident_data['severity'].replace({0: 1}, inplace=True)
incident_data['reach'].replace({0: 1}, inplace=True)
incident_data['novelty'].replace({0: 1}, inplace=True)

severity_weights = {1: 1, 
                    2: 10, 
                    3: 100}
reach_weights = {1: 1, 
                 2: 2, 
                 3: 3}
novelty_weights = {1: 1,
                   2: 2}

incident_data["Incident Score"] = (incident_data['severity'].replace(severity_weights)
                                   *incident_data['reach'].replace(reach_weights)
                                   *incident_data['novelty'].replace(novelty_weights))

n_years = 2
n_days = 365*n_years
curvature_ = 1
weights = np.arange(1,n_days + 1)
weights = weights/weights.max()
weights = ((2**weights) - 1)**curvature_
time_weights = pd.Series(weights)

start = pd.Timestamp("2007-01-01")
todate = max(incident_data['date']).strftime('%Y-%m-%d')
end = pd.Timestamp(todate)
date_range_extended = pd.date_range(start=start-pd.Timedelta(days=n_days), end=end)
date_range = pd.date_range(start=start, end=end)
incident_data.set_index("ID_RR", inplace=True)

def get_raw_scores(id_, incident_data, col="Incident Score"):
    try:
        company_incidents = incident_data.reset_index().set_index('date')
        company_incidents = company_incidents[company_incidents['ID_RR']==id_]
        incident_scores = company_incidents[col].groupby(level=0).sum().reindex(date_range_extended).fillna(0)
        stacked_incident_score = incident_scores.rolling(n_days).apply(lambda scores: np.dot(scores, weights)).fillna(0)
        stacked_incident_score = stacked_incident_score.reindex(date_range)
    except KeyError:
        stacked_incident_score = pd.Series(0, index=date_range)
    stacked_incident_score.name = "Incident Score"
    return stacked_incident_score  

def scale(series, lambda_=0.000105, curvature=5.3):
    return 100*((1 - np.exp(-lambda_*series))**(1/curvature))

scaling = scale(pd.Series(range(750)))

def max_decay(series, decay=0.5**(1/365)):
    new_series = pd.Series(index=series.index, dtype="float")
    previous = 0
    for date, value in series.items():
        if previous > 0:
            if value/previous < decay:
                new_series[date] = previous*decay
            else:
                new_series[date] = value
        else:
            new_series[date] = value
            
        previous = new_series[date]
    return new_series

## Generate RRI

RRI scores using incident data for all equities and bonds

In [3]:
# read spx historical constituents
spx_const = pd.read_csv("data/spx_constituents.csv", quoting=csv.QUOTE_NONE, delimiter=' ')
spx_const.columns = ['date'] + [x.replace('\"', '') for x in spx_const.columns[1:]]
spx_const['date'] = spx_const['date'].str.replace('\"', '')
spx_const.set_index('date', inplace=True)
spx_const = spx_const.T

# read euro historical constituents
euro_const = pd.read_csv("data/stoxx_constituents.csv", quoting=csv.QUOTE_NONE, delimiter=' ')
euro_const.columns = ['date'] + [x.replace('\"', '') for x in euro_const.columns[1:]]
euro_const['date'] = euro_const['date'].str.replace('\"', '')
euro_const.set_index('date', inplace=True)
euro_const = euro_const.T

In [95]:
# esg scores for every isin
spx_esg = {}

# calculate and plot ESG sub-scores and overall score for each isin in SPX
esgx = ["environment", "social", "government", "crosscutting"]
for isin in tqdm(spx_const.index):
    # if esg data is already there don't recalculate
    if len(spx_esg.get(isin, [])):
        continue
    # if isin is not mapped to a reprisk id, continue
    if not map_isin.get(isin, False):
        continue
        
    # map to reprisk id
    id_ = map_isin[isin]
    scores = {}

    # calculate esg_score
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series

    # code to calculate subscores (environment, social, governance)
    
    # for subscore in esgx:
    #     series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
    #     series = max_decay(scale(series))
    #     scores[subscore] = series
    
    spx_esg[isin] = pd.DataFrame(scores)

  0%|          | 0/1073 [00:00<?, ?it/s]

In [96]:
# esg scores for every isin
euro_esg = {}

# calculate and plot ESG sub-scores and overall score for each isin in euro
esgx = ["environment", "social", "government", "crosscutting"]
for isin in tqdm(euro_const.index):
    # if esg data is already there don't recalculate
    if len(euro_esg.get(isin, [])):
        continue
    # if isin is not mapped to a reprisk id, continue
    if not map_isin.get(isin, False):
        continue
        
    # map to reprisk id
    id_ = map_isin[isin]
    scores = {}

    # calculate esg_score
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series

    # code to calculate subscores (environment, social, governance)
    
    # for subscore in esgx:
    #     series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
    #     series = max_decay(scale(series))
    #     scores[subscore] = series
    
    euro_esg[isin] = pd.DataFrame(scores)

  0%|          | 0/1616 [00:00<?, ?it/s]

In [None]:
FILE = "data/historical.constituents_iBoxx_USD_INVESTMENT_GRADE_COMPONENTS.csv" # bond dtta US investment grade location

df = pd.read_csv(FILE, quoting=csv.QUOTE_NONE) # convert to pandas dataframe

df.columns = ['date', 'isin', 'index_weight', 'bid_price', 'ask_price'] # rename columns

# convert numeric columns to float
for key in ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']:
    df[key] = df[key].str.replace('\"', '')
    if key not in {'date', 'isin'}:
        df[key] = df[key].astype(float)

df = df.dropna()

df['date'] = pd.to_datetime(df['date']) # set date as datetime object and make it the index
df.set_index('date', inplace=True)

dates = sorted(list(set(df.index))) # get all dates for which there is data

us_bond_esg = {} # esg scores corresponding to isin
all_us_bond_isin = set(df['isin']) # all isin

# all isins for which there exists a reprisk id
s = set()
for isin in all_us_bond_isin.intersection(set(map_isin.keys())):
    s.add(map_isin[isin])

esgx = ["environment", "social", "government", "crosscutting"]
for id_ in tqdm(s):
    # pass if data already exists
    if len(us_bond_esg.get(id_, [])):
        continue 
    scores = {}

    # calculate esg score
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series

    # # calculate subscores (environment, social, governance)
    # for subscore in esgx:
    #     series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
    #     series = max_decay(scale(series))
    #     scores[subscore] = series
    
    us_bond_esg[id_] = pd.DataFrame(scores)

In [None]:
FILE = "data/historical.constituents_iBoxx_EUR_INVESTMENT_GRADE_COMPONENTS.csv"

df = pd.read_csv(FILE, quoting=csv.QUOTE_NONE) # convert to pandas dataframe

df.columns = ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']

for key in ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']:
    df[key] = df[key].str.replace('\"', '')
    if key not in {'date', 'isin'}:
        df[key] = df[key].astype(float)

df = df.dropna()

df['date'] = pd.to_datetime(df['date']) # set date as datetime object and make it the index
df.set_index('date', inplace=True)

dates = sorted(list(set(df.index))) # sort given dates (monthly)

euro_bond_esg = {}
all_euro_bond_isin = set(df['isin'])

s = set()
for isin in all_euro_bond_isin.intersection(set(map_isin.keys())):
    s.add(map_isin[isin])

esgx = ["environment", "social", "government", "crosscutting"]
for id_ in tqdm(s):
    if len(euro_bond_esg.get(id_, [])):
        continue 
    scores = {}
    
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series
    
    # for subscore in esgx:
    #     series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
    #     series = max_decay(scale(series))
    #     scores[subscore] = series
    
    euro_bond_esg[id_] = pd.DataFrame(scores)

In [None]:
eurodf = pd.concat(euro_bond_esg, axis=0)
print(eurodf)
usdf = pd.concat(us_bond_esg, axis=0)
print(usdf)

eurodf.to_csv('data/euro_bond_esg.csv')
print('EURO BOND DATA COMPLETE...')
usdf.to_csv('data/us_bond_esg.csv')
print('US BOND DATA COMPLETE...')

                    esg_score  environment     social  government  \
2.0     2007-01-01   0.000000     0.000000   0.000000         0.0   
        2007-01-02   0.000000     0.000000   0.000000         0.0   
        2007-01-03   0.000000     0.000000   0.000000         0.0   
        2007-01-04   0.000000     0.000000   0.000000         0.0   
        2007-01-05   0.000000     0.000000   0.000000         0.0   
...                       ...          ...        ...         ...   
65535.0 2024-05-27  19.132182    19.132182  19.132182         0.0   
        2024-05-28  19.100995    19.100995  19.100995         0.0   
        2024-05-29  19.069617    19.069617  19.069617         0.0   
        2024-05-30  19.038045    19.038045  19.038045         0.0   
        2024-05-31  19.006277    19.006277  19.006277         0.0   

                    crosscutting  trend_esg_score  
2.0     2007-01-01      0.000000              NaN  
        2007-01-02      0.000000              NaN  
        2007-01

In [19]:
# load bond data and save to avoid computations
euro_bond_esg = pd.read_csv('data/euro_bond_esg.csv')
us_bond_esg = pd.read_csv('data/us_bond_esg.csv')

euro_bond_esg.columns = ['rr_id', 'date', 'esg_score', 'environment', 'social', 'government', 'crosscutting', 'trend_esg_score']
us_bond_esg.columns = ['rr_id', 'date', 'esg_score', 'environment', 'social', 'government', 'crosscutting', 'trend_esg_score']

us_bond_esg.set_index('date', inplace=True)
euro_bond_esg.set_index('date', inplace=True)

## Equity Data

RRI, Market Cap, Free Float, Price, Sector, Constituents for Equities

In [31]:
from datetime import datetime

# sector data for SPX
sector_data = pd.read_csv("data/SPsectors.csv", quoting=csv.QUOTE_NONE)
spx_sector = {sector_data['ISIN'].iloc[i]: sector_data['SP_sectors'].iloc[i] for i in range(len(sector_data))}

print('**SPX sectors loaded**')

# SPX marketcaps
spx_mc = pd.read_csv("data/spx_TR.CompanyMarketCap.csv", quoting=csv.QUOTE_NONE)

spx_mc.columns = ['num', 'isin', 'market_cap', 'date']

for key in ['num', 'isin', 'market_cap', 'date']:
    spx_mc[key] = spx_mc[key].str.replace('\"', '')

spx_mc = spx_mc.replace(r'^\s*$', np.nan, regex=True)

spx_mc = spx_mc.dropna()

spx_mc['market_cap'] = pd.to_numeric(spx_mc['market_cap'])

spx_mc['date'] = spx_mc['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
spx_mc.set_index('isin', inplace=True)
spx_mc = spx_mc.drop('num', axis=1)

print('**SPX marketcap loaded**')

# spx price data

spx_price = pd.read_csv("data/spx_TR.CLOSEPRICE.csv", quoting=csv.QUOTE_NONE)

spx_price.columns = ['num', 'isin', 'close', 'date']

for key in ['num', 'isin', 'close', 'date']:
    spx_price[key] = spx_price[key].str.replace('\"', '')
    
spx_price = spx_price.replace(r'^\s*$', np.nan, regex=True)
spx_price = spx_price.dropna()
spx_price['close'] =  pd.to_numeric(spx_price['close'])
spx_price['date'] = spx_price['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
spx_price.set_index('isin', inplace=True)
spx_price = spx_price.drop('num', axis=1)

print('**SPX price data loaded**')

# spx freefloat data

spx_ff = pd.read_csv('data/spx_TR.FreeFloatPct.csv', quoting=csv.QUOTE_NONE)
spx_ff.columns = ['RR_id', 'ISIN', 'Free Float', 'date']

for key in ['RR_id', 'ISIN', 'Free Float', 'date']:
    spx_ff[key] = spx_ff[key].str.replace('\"', '')
spx_ff['Free Float'].replace('', np.nan, inplace=True)
spx_ff = spx_ff.dropna()
spx_ff = spx_ff.reindex(index=spx_ff.index[::-1])
spx_ff['Free Float'] =  pd.to_numeric(spx_ff['Free Float'])
spx_ff['Free Float'] = spx_ff['Free Float']/100
spx_ff.set_index('ISIN', inplace=True)
spx_ff['date']= pd.to_datetime(spx_ff['date']).dt.strftime('%Y-%m-%d')
spx_ff.index.name = 'isin'

print('**SPX freefloat loaded**')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spx_ff['Free Float'].replace('', np.nan, inplace=True)


In [165]:
# sector data for euro
# sector data for Eurostoxx
euro_sector_data = pd.read_csv("data/stoxx_TR.TRBCEconomicSector.csv", quoting=csv.QUOTE_NONE)
euro_sector_data.columns = ['num', 'isin', 'sector']
for key in ['num', 'isin', 'sector']: euro_sector_data[key] = euro_sector_data[key].str.replace('\"', '')
euro_sector = {euro_sector_data['isin'].iloc[i]: euro_sector_data['sector'].iloc[i] for i in range(len(euro_sector_data))}

print('**EURO sectors loaded**')

# euro marketcaps
euro_mc = pd.read_csv("data/stoxx_TR.CompanyMarketCap.csv", quoting=csv.QUOTE_NONE)

euro_mc.columns = ['num', 'isin', 'market_cap', 'date']

for key in ['num', 'isin', 'market_cap', 'date']:
    euro_mc[key] = euro_mc[key].str.replace('\"', '')

euro_mc = euro_mc.replace(r'^\s*$', np.nan, regex=True)

euro_mc = euro_mc.dropna()

euro_mc['market_cap'] = pd.to_numeric(euro_mc['market_cap'])

euro_mc['date'] = euro_mc['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
euro_mc.set_index('isin', inplace=True)
euro_mc = euro_mc.drop('num', axis=1)

print('**EURO marketcap loaded**')


# euro price data

euro_price = pd.read_csv("data/stoxx_TR.CLOSEPRICE.csv", quoting=csv.QUOTE_NONE)

euro_price.columns = ['num', 'isin', 'close', 'date']

for key in ['num', 'isin', 'close', 'date']:
    euro_price[key] = euro_price[key].str.replace('\"', '')
    
euro_price = euro_price.replace(r'^\s*$', np.nan, regex=True)
euro_price = euro_price.dropna()
euro_price['close'] =  pd.to_numeric(euro_price['close'])
euro_price['date'] = euro_price['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
euro_price.set_index('isin', inplace=True)
euro_price = euro_price.drop('num', axis=1)

print('**EURO price data loaded**')

# euro freefloat data

euro_ff = pd.read_csv('data/stoxx_TR.FreeFloatPct.csv', quoting=csv.QUOTE_NONE)
euro_ff.columns = ['RR_id', 'ISIN', 'Free Float', 'date']

for key in ['RR_id', 'ISIN', 'Free Float', 'date']:
    euro_ff[key] = euro_ff[key].str.replace('\"', '')
euro_ff['Free Float'].replace('', np.nan, inplace=True)
euro_ff = euro_ff.dropna()
euro_ff = euro_ff.reindex(index=euro_ff.index[::-1])
euro_ff['Free Float'] =  pd.to_numeric(euro_ff['Free Float'])
euro_ff['Free Float'] = euro_ff['Free Float']/100
euro_ff.set_index('ISIN', inplace=True)
euro_ff['date']= pd.to_datetime(euro_ff['date']).dt.strftime('%Y-%m-%d')
euro_ff.index.name = 'isin'


print('**EURO freefloat loaded**')

**EURO sectors loaded**
**EURO marketcap loaded**
**EURO price data loaded**



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





**EURO freefloat loaded**


In [None]:
import pandas_market_calendars as mcal

# convert datetime objects to string objects
def convert_dates(datelist):
    for i in range(len(datelist)):
        datelist[i] = datelist[i].strftime('%Y-%m-%d')
    return [x.strftime('%Y-%m-%d') for x in datelist]
    
# trading calendar
nyse = mcal.get_calendar('NYSE')
lse = mcal.get_calendar('LSE')

# monthly datelist
monthly_dates_spx = convert_dates(nyse.valid_days(start_date='2007-01-01', end_date='2024-04-01').to_series().resample('ME').last())
monthly_dates_euro = convert_dates(lse.valid_days(start_date='2007-01-01', end_date='2024-04-01').to_series().resample('ME').last())

In [228]:
# calculate trend rri by taking 30 day difference
# calculate percent rri by taking change in rri over 30 days
for isin in tqdm(spx_esg):
    spx_esg[isin]['trend_score'] = spx_esg[isin]['esg_score'] - spx_esg[isin]['esg_score'].shift(30)
    spx_esg[isin]['percent_score'] = spx_esg[isin]['esg_score'] / (spx_esg[isin]['esg_score'].shift(30) + 0.0001)

for isin in tqdm(euro_esg):
    euro_esg[isin]['trend_score'] = euro_esg[isin]['esg_score'] - euro_esg[isin]['esg_score'].shift(30)
    euro_esg[isin]['percent_score'] = euro_esg[isin]['esg_score'] / (euro_esg[isin]['esg_score'].shift(30) + 0.0001)

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1223 [00:00<?, ?it/s]

In [229]:
# reformat dataframes

esg_spx = pd.concat(spx_esg, axis=0)
esg_euro = pd.concat(euro_esg, axis=0)

esg_spx = esg_spx.swaplevel(0, 1) 
esg_euro = esg_euro.swaplevel(0, 1) 

In [None]:
# determine which grouping the company should go in
def map_sector(isin):
    sector = spx_sector.get(isin, 'n/a')
    if sector in ['financials', 'utilities']:
        return 'financials/util'
    if sector == 'n/a':
        return 'n/a'
    return 'other'
    
spx_data = {}

for date in tqdm(monthly_dates_spx):
    spx_data[date] = {}

    # group together prices, marketcap, rri scores, freefloat for respective ISINs in constituents of each day
    ff = spx_ff[spx_ff['date'] <= date].sort_values(by=['isin', 'date'], ascending=[True, False])
    spx_data[date] = spx_const[date][spx_const[date] == 1].to_frame().merge(
            spx_price[spx_price['date'] == date]['close'], how='left', left_index=True, right_index=True
        ).merge(
            spx_mc[spx_mc['date'] == date]['market_cap'], how='left', left_index=True, right_index=True
        ).merge(
            esg_spx.loc[date]['esg_score'], how='left', left_index=True, right_index=True
        ).merge(
            ff[~ff.index.duplicated(keep='first')]['Free Float'], how='left', left_index=True, right_index=True
        ).merge(
            esg_spx.loc[date]['trend_score'], how='left', left_index=True, right_index=True
        ).merge(
            esg_spx.loc[date]['percent_score'], how='left', left_index=True, right_index=True
        )
    spx_data[date]['sector'] = spx_data[date].index.map(map_sector)
    spx_data[date] = spx_data[date].drop(date, axis=1)
    
    # complete removal of missing data
    spx_data[date] = spx_data[date].dropna()
spx_data = pd.concat(spx_data)

In [None]:
# determine which grouping the company should go in

def map_sector(isin):
    sector = euro_sector.get(isin, 'nan')
    if sector in ['Financials', 'Utilities']:
        return 'financials/util'
    if sector == 'nan':
        return 'n/a'
    return 'other'
    
euro_data = {}

for date in tqdm(monthly_dates_euro):
    euro_data[date] = {}

    # group together prices, marketcap, rri scores, freefloat for respective ISINs in constituents of each day
    ff = euro_ff[euro_ff['date'] <= date].sort_values(by=['isin', 'date'], ascending=[True, False])
    euro_data[date] = euro_const[date][euro_const[date] == 1].to_frame().merge(
            euro_price[euro_price['date'] == date]['close'], how='left', left_index=True, right_index=True
        ).merge(
            euro_mc[euro_mc['date'] == date]['market_cap'], how='left', left_index=True, right_index=True
        ).merge(
            esg_euro.loc[date]['esg_score'], how='left', left_index=True, right_index=True
        ).merge(
            ff[~ff.index.duplicated(keep='first')]['Free Float'], how='left', left_index=True, right_index=True
        ).merge(
            esg_euro.loc[date]['trend_score'], how='left', left_index=True, right_index=True
        ).merge(
            esg_euro.loc[date]['percent_score'], how='left', left_index=True, right_index=True
        )
    euro_data[date]['sector'] = euro_data[date].index.map(map_sector)
    euro_data[date] = euro_data[date].drop(date, axis=1)
    euro_data[date] = euro_data[date].dropna()
euro_data = pd.concat(euro_data)

In [None]:
spx_df, euro_df = spx_data, euro_data

euro_df.index.names = ['date', 'isin']
spx_df.index.names = ['date', 'isin']

In [124]:
spx_price_data = {}

# calculate last available prices
for isin in tqdm(set(spx_price.index)):
    spx_price_data[isin] = spx_price[spx_price.index == isin]
euro_price_data = {}

for isin in tqdm(set(euro_price.index)):
    euro_price_data[isin] = euro_price[euro_price.index == isin]

  0%|          | 0/1064 [00:00<?, ?it/s]

  0%|          | 0/1583 [00:00<?, ?it/s]

In [125]:
euro_df.to_csv('simulation_data/euro_df.csv')
spx_df.to_csv('simulation_data/spx_df.csv')

## Bond Data

RRI, Price, Constituents for investment grade bond data US & Euro

In [135]:
FILE = "data/historical.constituents_iBoxx_EUR_INVESTMENT_GRADE_COMPONENTS.csv" # bond data US investment grade location

euro_bonds_const = pd.read_csv(FILE, quoting=csv.QUOTE_NONE) # convert to pandas dataframe

euro_bonds_const.columns = ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']

for key in ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']:
    euro_bonds_const[key] = euro_bonds_const[key].str.replace('\"', '')
    if key not in {'date', 'isin'}:
        euro_bonds_const[key] = euro_bonds_const[key].astype(float)

euro_bonds_const = euro_bonds_const.dropna()

euro_bonds_const['date'] = pd.to_datetime(euro_bonds_const['date']) # set date as datetime object and make it the index
euro_bonds_const.set_index('date', inplace=True)

In [136]:
FILE = "data/historical.constituents_iBoxx_USD_INVESTMENT_GRADE_COMPONENTS.csv" # bond data US investment grade location

us_bonds_const = pd.read_csv(FILE, quoting=csv.QUOTE_NONE) # convert to pandas dataframe

us_bonds_const.columns = ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']

for key in ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']:
    us_bonds_const[key] = us_bonds_const[key].str.replace('\"', '')
    if key not in {'date', 'isin'}:
        us_bonds_const[key] = us_bonds_const[key].astype(float)

us_bonds_const = us_bonds_const.dropna()

us_bonds_const['date'] = pd.to_datetime(us_bonds_const['date']) # set date as datetime object and make it the index
us_bonds_const.set_index('date', inplace=True)

In [137]:
# sectoral data for bonds
sector_bond_data = pd.read_csv('data/bonds_SPsectors.csv')
map_sector_bond = {}

for isin in tqdm(set(us_bonds_const['isin']).union(set(euro_bonds_const['isin']))):
    try:
        if len(sector_bond_data[sector_bond_data['isin'] == isin]['sp_sectors']):
            map_sector_bond[isin] = sector_bond_data[sector_bond_data['isin'] == isin]['sp_sectors'].values[0]
    except Exception as e:
        pass

  0%|          | 0/39346 [00:00<?, ?it/s]

In [138]:
us_bond_esg = pd.read_csv('data/us_bond_esg.csv') # read esg score for every bond

us_bond_esg.columns = ['rr_id', 'date', 'esg_score', 'environment', 'social',
       'government', 'crosscutting', 'trend_esg_score']

euro_bond_esg = pd.read_csv('data/euro_bond_esg.csv') # red esg score for every bond

euro_bond_esg.columns = ['rr_id', 'date', 'esg_score', 'environment', 'social',
       'government', 'crosscutting', 'trend_esg_score']


In [139]:
# convert bond esg data to dictionary
us_bond_esg = {rr: us_bond_esg[us_bond_esg['rr_id'] == rr] for rr in tqdm(set(us_bond_esg['rr_id']))}
for rr in us_bond_esg:
    us_bond_esg[rr].set_index('date', inplace=True)

# calculate trend rri and percent rri
for rr in tqdm(us_bond_esg):
    us_bond_esg[rr]['trend_score'] = us_bond_esg[rr]['esg_score'] - us_bond_esg[rr]['esg_score'].shift(30)
    us_bond_esg[rr]['percent_score'] = us_bond_esg[rr]['esg_score'] / (us_bond_esg[rr]['esg_score'].shift(30) + 0.0001)

# constituents for a given date

us_bonds_const = {date: us_bonds_const[us_bonds_const.index == date] for date in tqdm(set(us_bonds_const.index))}

# all dates being tracked
dates = sorted(list(us_bonds_const.keys()))

# store actual data
us_ig_bond_data = {}

for date in tqdm(dates):
    if not len(us_bonds_const.get(pd.to_datetime(date), [])):
        continue
    us_ig_bond_data[date] = us_bonds_const.get(pd.to_datetime(date))
    sectors = []
    # add esg data to each isin
    for m in ['trend_score', 'percent_score']:
        esg_scores = []
        for isin in us_bonds_const[pd.to_datetime(date)]['isin']:
            if not map_isin.get(isin):
                esg_scores.append(np.nan)
            elif not len(us_bond_esg.get(map_isin[isin], [])):
                esg_scores.append(np.nan)
            else:
                t = us_bond_esg[map_isin[isin]]
                t = t[t.index == date.strftime('%Y-%m-%d')]
                if not len(t):
                    esg_scores.append(np.nan)
                else:
                    esg_scores.append(float(t[m]))
        us_ig_bond_data[date][m] = esg_scores

# remove missing data
for date in tqdm(dates):
    us_ig_bond_data[date] = us_ig_bond_data[date].dropna()

for date in tqdm(dates):
    us_ig_bond_data[date].set_index(['isin'], inplace=True)

# assign sectors to bonds
for date in tqdm(dates):
    if not len(us_bonds_const.get(pd.to_datetime(date), [])):
        continue
    sectors = []
    for isin in us_ig_bond_data[date].index:
        if not map_sector_bond.get(isin):
            sectors.append(np.nan)
        elif map_sector_bond[isin] in ['financials', 'utilities']:
            sectors.append('financials/utilities')
        else:
            sectors.append('other')
    us_ig_bond_data[date]['sector'] = sectors

# reformat us bond data
df_list = []
for date, df in us_ig_bond_data.items():
    df['date'] = date  
    df_list.append(df)

us_ig_df = pd.concat(df_list)

us_ig_df.set_index(['date', us_ig_df.index], inplace=True)

us_ig_df.index.names = ['date', 'isin']

# store price data
us_ig_prices = {}

for date in tqdm(set(us_ig_df.index.get_level_values(0))):
    us_ig_prices[date] = {}
    for isin in us_ig_df.loc[date].index:
        try:
            us_ig_prices[date][isin] = us_ig_df.loc[date, isin]['ask_price']
        except Exception:
            print(date, isin)

  0%|          | 0/2608 [00:00<?, ?it/s]

  0%|          | 0/2608 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scor

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

  0%|          | 0/188 [00:00<?, ?it/s]

In [140]:
euro_bond_esg = {rr: euro_bond_esg[euro_bond_esg['rr_id'] == rr] for rr in tqdm(set(euro_bond_esg['rr_id']))}
for rr in euro_bond_esg:
    euro_bond_esg[rr].set_index('date', inplace=True)

for rr in tqdm(euro_bond_esg):
     euro_bond_esg[rr]['trend_score'] = euro_bond_esg[rr]['esg_score'] - euro_bond_esg[rr]['esg_score'].shift(30)
     euro_bond_esg[rr]['percent_score'] = euro_bond_esg[rr]['esg_score'] / (euro_bond_esg[rr]['esg_score'].shift(30) + 0.0001)

euro_bonds_const = {date: euro_bonds_const[euro_bonds_const.index == date] for date in tqdm(set(euro_bonds_const.index))}

dates = sorted(list(euro_bonds_const.keys()))

euro_ig_bond_data = {}

for date in tqdm(dates):
    if not len(euro_bonds_const.get(pd.to_datetime(date), [])):
        continue
    euro_ig_bond_data[date] = euro_bonds_const.get(pd.to_datetime(date))
    sectors = []
    for m in ['trend_score', 'percent_score']:
        esg_scores = []
        for isin in euro_bonds_const[pd.to_datetime(date)]['isin']:
            if not map_isin.get(isin):
                esg_scores.append(np.nan)
            elif not len(euro_bond_esg.get(map_isin[isin], [])):
                esg_scores.append(np.nan)
            else:
                t = euro_bond_esg[map_isin[isin]]
                t = t[t.index == date.strftime('%Y-%m-%d')]
                if not len(t):
                    esg_scores.append(np.nan)
                else:
                    esg_scores.append(float(t[m]))
        euro_ig_bond_data[date][m] = esg_scores

for date in tqdm(dates):
    euro_ig_bond_data[date] = euro_ig_bond_data[date].dropna()

for date in tqdm(dates):
    euro_ig_bond_data[date].set_index(['isin'], inplace=True)

for date in tqdm(dates):
    if not len(euro_bonds_const.get(pd.to_datetime(date), [])):
        continue
    sectors = []
    for isin in euro_ig_bond_data[date].index:
        if not map_sector_bond.get(isin):
            sectors.append(np.nan)
        elif map_sector_bond[isin] in ['financials', 'utilities']:
            sectors.append('financials/utilities')
        else:
            sectors.append('other')
    euro_ig_bond_data[date]['sector'] = sectors

df_list = []
for date, df in euro_ig_bond_data.items():
    df['date'] = date  
    df_list.append(df)

euro_ig_df = pd.concat(df_list)

euro_ig_df.set_index(['date', euro_ig_df.index], inplace=True)

euro_ig_df.index.names = ['date', 'isin']

euro_ig_prices = {}

for date in tqdm(set(euro_ig_df.index.get_level_values(0))):
    euro_ig_prices[date] = {}
    for isin in euro_ig_df.loc[date].index:
        try:
            euro_ig_prices[date][isin] = euro_ig_df.loc[date, isin]['ask_price']
        except Exception:
            print(date, isin)

  0%|          | 0/1455 [00:00<?, ?it/s]

  0%|          | 0/1455 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scores.append(float(t[m]))
  esg_scor

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  euro_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  euro_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  euro_ig_bond_data[date]['sector'] = sectors
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

  0%|          | 0/189 [00:00<?, ?it/s]

In [141]:
us_ig_df.to_csv('simulation_data/us_ig_df.csv')
euro_ig_df.to_csv('simulation_data/euro_ig_df.csv')