# Generating RRI Scores
2024-08-22

This notebook calculates the RRI score based on incident data provided from Mark Thompson

In [2]:
import pandas as pd
import numpy as np
import csv
import ujson 
import json
from tqdm.notebook import tqdm

In [4]:
# ISIN to RR ID
rr_isin = pd.read_csv('data/REPRISK_ISINs.csv', quoting=csv.QUOTE_NONE)
rr_isin.columns = ['rr', 'ISIN']
rr_isin['ISIN'] = rr_isin['ISIN'].str.replace('\"', '')
map_isin = rr_isin.set_index('ISIN')['rr'].to_dict()

# Incident Data
incident_data = pd.read_csv('data/REPRISK_EVENTS.csv', quoting=csv.QUOTE_NONE)
incident_data.columns = ['date', 'ID_incident', 'ID_RR', 'reach', 'severity', 'unsharp', 'novelty', 
                         'environment', 'social', 'government', 'crosscutting']

incident_data['date'] = pd.to_datetime(incident_data['date'])
incident_data['severity'].replace({0: 1}, inplace=True)
incident_data['reach'].replace({0: 1}, inplace=True)
incident_data['novelty'].replace({0: 1}, inplace=True)

severity_weights = {1: 1, 
                    2: 10, 
                    3: 100}
reach_weights = {1: 1, 
                 2: 2, 
                 3: 3}
novelty_weights = {1: 1,
                   2: 2}

incident_data["Incident Score"] = (incident_data['severity'].replace(severity_weights)
                                   *incident_data['reach'].replace(reach_weights)
                                   *incident_data['novelty'].replace(novelty_weights))

n_years = 2
n_days = 365*n_years
curvature_ = 1
weights = np.arange(1,n_days + 1)
weights = weights/weights.max()
weights = ((2**weights) - 1)**curvature_
time_weights = pd.Series(weights)

start = pd.Timestamp("2007-01-01")
todate = max(incident_data['date']).strftime('%Y-%m-%d')
end = pd.Timestamp(todate)
date_range_extended = pd.date_range(start=start-pd.Timedelta(days=n_days), end=end)
date_range = pd.date_range(start=start, end=end)
incident_data.set_index("ID_RR", inplace=True)

def get_raw_scores(id_, incident_data, col="Incident Score"):
    try:
        company_incidents = incident_data.reset_index().set_index('date')
        company_incidents = company_incidents[company_incidents['ID_RR']==id_]
        incident_scores = company_incidents[col].groupby(level=0).sum().reindex(date_range_extended).fillna(0)
        stacked_incident_score = incident_scores.rolling(n_days).apply(lambda scores: np.dot(scores, weights)).fillna(0)
        stacked_incident_score = stacked_incident_score.reindex(date_range)
    except KeyError:
        stacked_incident_score = pd.Series(0, index=date_range)
    stacked_incident_score.name = "Incident Score"
    return stacked_incident_score  

def scale(series, lambda_=0.000105, curvature=5.3):
    return 100*((1 - np.exp(-lambda_*series))**(1/curvature))

scaling = scale(pd.Series(range(750)))

def max_decay(series, decay=0.5**(1/365)):
    new_series = pd.Series(index=series.index, dtype="float")
    previous = 0
    for date, value in series.items():
        if previous > 0:
            if value/previous < decay:
                new_series[date] = previous*decay
            else:
                new_series[date] = value
        else:
            new_series[date] = value
            
        previous = new_series[date]
    return new_series

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  incident_data['severity'].replace({0: 1}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  incident_data['reach'].replace({0: 1}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are

In [10]:
# read spx historical constituents
const = pd.read_csv("esg_data/spx_constituents.csv", quoting=csv.QUOTE_NONE, delimiter=' ')
const.columns = ['date'] + [x.replace('\"', '') for x in const.columns[1:]]
const['date'] = const['date'].str.replace('\"', '')
const.set_index('date', inplace=True)

# map each date to its spx constituents
spx_const = {}

for i in tqdm(range(len(const))):
    row = const.iloc[i]
    date = const.index[i]
    spx_const[date] = set()
    for isin in const.columns:
        if row[isin] == 1:
            spx_const[date].add(isin)

# all the spx isin constituents ever
all_spx_isin = set(const.columns)

# read historical euro constituents list
const = pd.read_csv("esg_data/stoxx_constituents.csv", quoting=csv.QUOTE_NONE, delimiter=' ')
const.columns = ['date'] + [x.replace('\"', '') for x in const.columns[1:]]
const['date'] = const['date'].str.replace('\"', '')
const.set_index('date', inplace=True)

# map each date to its eurostoxx constituents
euro_const = {}

for i in tqdm(range(len(const))):
    row = const.iloc[i]
    date = const.index[i]
    euro_const[date] = set()
    for isin in const.columns:
        if row[isin] == 1:
            euro_const[date].add(isin)

all_euro_isin = set(const.columns)

  0%|          | 0/6323 [00:00<?, ?it/s]

  0%|          | 0/6364 [00:00<?, ?it/s]

In [None]:
spx_esg = {}

# calculate and plot ESG sub-scores and overall score for each isin in SPX
esgx = ["environment", "social", "government", "crosscutting"]
for isin in tqdm(all_spx_isin.intersection(set(map_isin.keys()))):
    if len(spx_esg.get(isin, [])):
        continue
    id_ = map_isin[isin]
    scores = {}
    
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series
    
    for subscore in esgx:
        series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
        series = max_decay(scale(series))
        scores[subscore] = series
    
    spx_esg[isin] = pd.DataFrame(scores)

for isin in tqdm(all_spx_isin.intersection(set(map_isin.keys()))):
    spx_esg[isin] = spx_esg[isin].to_dict(orient='index')

def convert_timestamp_keys(d):
    for isin, nested_dict in tqdm(d.items()):
        d[isin] = {str(timestamp): inner_dict for timestamp, inner_dict in nested_dict.items()}
    return d

spx_esg_serializable = convert_timestamp_keys(spx_esg)

with open('spx_esg.json', 'w') as json_file:
    ujson.dump(spx_esg_serializable, json_file, indent=4)

In [None]:
euro_esg = {}

# calculate and plot ESG sub-scores and overall score for each isin in Eurostoxx 600
esgx = ["environment", "social", "government", "crosscutting"]
for isin in tqdm(all_euro_isin.intersection(set(map_isin.keys()))):
    if len(euro_esg.get(isin, [])):
        continue
    id_ = map_isin[isin]
    scores = {}
    
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series
    
    for subscore in esgx:
        series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
        series = max_decay(scale(series))
        scores[subscore] = series
    
    euro_esg[isin] = pd.DataFrame(scores)

for isin in tqdm(all_euro_isin.intersection(set(map_isin.keys()))):
    euro_esg[isin] = euro_esg[isin].to_dict(orient='index')

def convert_timestamp_keys(d):
    for isin, nested_dict in tqdm(d.items()):
        d[isin] = {str(timestamp): inner_dict for timestamp, inner_dict in nested_dict.items()}
    return d

euro_esg_serializable = convert_timestamp_keys(euro_esg)

with open('euro_esg.json', 'w') as json_file:
    ujson.dump(euro_esg_serializable, json_file, indent=4)

In [None]:
# FILE = "bond_data/historical.constituents_iBoxx_USD_HIGH_YIELD_COMPONENTS.csv"
FILE = "bond_data/historical.constituents_iBoxx_USD_INVESTMENT_GRADE_COMPONENTS.csv" # bond dtta US investment grade location
# FILE = "bond_data/historical.constituents_iBoxx_EUR_HIGH_YIELD_COMPONENTS.csv"

df = pd.read_csv(FILE, quoting=csv.QUOTE_NONE) # convert to pandas dataframe

df.columns = ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']

for key in ['date', 'isin', 'index_weight', 'bid_price', 'ask_price']:
    df[key] = df[key].str.replace('\"', '')
    if key not in {'date', 'isin'}:
        df[key] = df[key].astype(float)

df = df.dropna()

df['date'] = pd.to_datetime(df['date']) # set date as datetime object and make it the index
df.set_index('date', inplace=True)
us_bonds_const = {} # dictionary for bond constituents for each date
for date in tqdm(set(df.index)):
    us_bonds_const[date] = df[df.index == date]

dates = sorted(list(set(df.index))) # sort given dates (monthly)

us_bond_esg = {}
all_us_bond_isin = set(df['isin'])

s = set()
for isin in all_us_bond_isin.intersection(set(map_isin.keys())):
    s.add(map_isin[isin])

esgx = ["environment", "social", "government", "crosscutting"]
for id_ in tqdm(s):
    if len(us_bond_esg.get(id_, [])):
        continue
    scores = {}
    
    series = get_raw_scores(id_, incident_data)
    series = max_decay(scale(series))
    scores['esg_score'] = series
    
    for subscore in esgx:
        series = get_raw_scores(id_, incident_data[incident_data[subscore] == 1])
        series = max_decay(scale(series))
        scores[subscore] = series
    
    us_bond_esg[id_] = pd.DataFrame(scores)