In [1]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from IPython.display import display, Markdown
from tqdm import tqdm

# Settings
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

#### A. Create two dataset with 2019 and 2024 ECB Guide

In [2]:
# Load the dataset
file_path = 'ecb_guide_comparison.csv'
full_text = pd.read_csv(file_path)

# Convert the string representations of embeddings to lists
full_text['body_embedding'] = full_text['body_embedding'].apply(ast.literal_eval).tolist()

# Remove numbers from sourve
full_text['source_clean'] = full_text['source'].str.replace('\s+\d+', '', regex=True)
full_text['source_clean'] = full_text['source_clean'].str.replace(' > nan.*', '', regex=True)
full_text['source_clean'] = full_text['source_clean'].str.replace('\s+', ' ', regex=True)

# Create two dataframes with the new and old ECB Guide
new_guide = full_text[full_text['file_name'] != '2019_ecb_guide'].copy()
old_guide = full_text[full_text['file_name'] == '2019_ecb_guide'].copy()

#### B. Compare individual paragraphs of the new ECB Guide with the paragraphs from the 2019 Guide

In [3]:
# Initialize the columns
new_guide['best_similarity'] = 0.0
new_guide['matching_text'] = ''

for index, row in tqdm(new_guide.iterrows(), total=new_guide.shape[0]):
    # Calculate cosine similarity with all paragraphs in old_guide
    similarities = cosine_similarity([row['body_embedding']], list(old_guide['body_embedding']))
    
    # Find the index of the highest similarity amd add data to the new guide
    max_index = np.argmax(similarities)
    new_guide.loc[index, 'best_similarity'] = float(similarities[0, max_index])
    new_guide.loc[index, 'matching_text'] = old_guide.iloc[max_index]['body_of_the_text']

100%|██████████████████████████████████████████████████████████████████████████████| 1414/1414 [02:25<00:00,  9.72it/s]


In [4]:
# Create list of Level 2 headings with the highest number of limited matches
idx_1 = new_guide['best_similarity'] < 0.95
idx_2 = new_guide['word_count'] > 30
limited_matches = new_guide[idx_1 & idx_2]

(limited_matches.groupby('source_clean')['heading_1']
                .count()
                .sort_values(ascending=False)
                .reset_index()
                .head(10))

Unnamed: 0,source_clean,heading_1
0,Credit risk > Definition of default,39
1,Counterparty credit risk > Risks not in effective expected positive exposure,33
2,Credit risk > Probability of default,19
3,Credit risk > Loss given default,18
4,General topics > Roll-out and permanent partial use,12
5,Counterparty credit risk > Margin period of risk and cash flows,9
6,Market risk > Methodology for IRC models focusing on default risk,8
7,Counterparty credit risk > Use test,8
8,Credit risk > Use of data,6
9,General topics > Overarching principles for internal models,6


In [5]:
# Compare number of Level 2 paragraphs in the 2019 and 2024 Guide
outcome = pd.pivot_table(full_text, 
                         values='word_count', 
                         index='source_clean', 
                         columns='file_name', 
                         aggfunc='count', 
                         margins=True).sort_values(by='All', ascending=False).reset_index()
outcome['changes'] = outcome['2024_ecb_guide'] - outcome['2019_ecb_guide']

outcome.head(15)

file_name,source_clean,2019_ecb_guide,2024_ecb_guide,All,changes
0,All,1280.0,1412.0,2692,132.0
1,Market risk > Scope of the internal model approach,89.0,95.0,184,6.0
2,Credit risk > Probability of default,66.0,86.0,152,20.0
3,Market risk > Regulatory back-testing of VaR models,76.0,72.0,148,-4.0
4,Market risk > Methodology for IRC models focusing on default risk,61.0,67.0,128,6.0
5,Credit risk > Loss given default,51.0,77.0,128,26.0
6,Market risk > Methodology for VaR and stressed VaR,59.0,66.0,125,7.0
7,General topics > Overarching principles for internal models,55.0,68.0,123,13.0
8,Credit risk > Use of data,44.0,52.0,96,8.0
9,General topics > Internal validation,55.0,39.0,94,-16.0


In [6]:
# Random sample of paragraphs from the new Guide with limited matches in the Old Guide
cols = ['file_name', 
        'heading_1', 
        'heading_2', 
        'body_of_the_text', 
        'page_number', 
        'source_clean', 
        'word_count', 
        'best_similarity', 
        'matching_text']

limited_matches[cols].sample(3)

Unnamed: 0,file_name,heading_1,heading_2,body_of_the_text,page_number,source_clean,word_count,best_similarity,matching_text
2570,2024_ecb_guide,Counterparty credit risk,13 Risks not in effective expected positive exposure,"Regarding RNIEPE that are identified as not being substantial according to one of the processes described in paragraph (<>)104 of this chapter, the ECB recommends that institutions either define quantitative thresholds using theabove or similar metrics or define qualitative methodological criteria to identify RNIEPE that are not substantial but require at least the monitoring described in paragraph (<>)100 of this chapter.",269.0,Counterparty credit risk > Risks not in effective expected positive exposure,50,0.690902,"In accordance with Article 368(1)(e), an institution must have established procedures for monitoring and ensuring compliance with a documented set of internal policies and controls concerning the overall operation of its internal models. Therefore, the ECB considers that in order to ensure a comprehensive coverage of such risks, the institution should clearly describe and document each RNIME in an inventory, as part of its RNIME framework.In order to properly monitor each RNIME, the ECB cons..."
2650,2024_ecb_guide,Glossary,Commission Delegated Regulation (EU) No 2022/439,"Commission Delegated Regulation (EU) 2022/439 of October 2021 supplementing Regulation (EU) No 575/2013 of the European Parliament and of the Council with regard to regulatory technical standards for the specification of the assessment methodology competent authorities are to follow when assessing the compliance of credit institutions and investment firms with the requirements to use the Internal Ratings Based Approach (OJ L 90, 18.3.2022, p. 1)",281.0,Glossary > Commission Delegated Regulation (EU) No/439,53,0.770571,Commission Delegated Regulation (EU) No 529/2014 of 12 March 2014 supplementing Regulation (EU) No 575/2013 of the European Parliament and of the Council with regard to regulatory technical standards for assessing the materiality of extensions and changes of the Internal Ratings
1381,2024_ecb_guide,General topics,2 Roll-out and permanent partial use,"Article 7(3) of Commission Delegated Regulation (EU) No 2022/43939 sets out the conditions under which competent authorities may approve any changes to the sequence and time period of the plan. The ECB intends to assess any application for a change to a roll-out plan against these conditions, on the basis of the documentation provided by the institution regarding the rationale for the change, the materiality of the portfolios affected, and governance arrangements for the change (e.g. which b...",17.0,General topics > Roll-out and permanent partial use,130,0.890978,"Article 7(3) of the Final Draft RTS on assessment methodology for IRB33 (<>)provides a good understanding of the conditions under which competent authorities may approve any changes to the sequence and time period of the plan. The ECB intends to assess any application for a change to a roll-out plan against these conditions, on the basis of the documentation provided by the institution regarding the rationale for the change, the materiality of the portfolios affected, and governance arrangem..."


#### C. Compare larger subsections

In [None]:
# Create a single string with the specified format
section = 'Credit risk > Probability of default'
idx_1 = new_guide['source_clean'] == section
idx_2 = old_guide['source_clean'] == section

new_guide_text = '\n\n'.join('\n'.join([row['body_of_the_text']]) for index, row in new_guide[idx_1].iterrows())
old_guide_text = '\n\n'.join('\n'.join([row['body_of_the_text']]) for index, row in old_guide[idx_2].iterrows())

client = OpenAI()

response = client.chat.completions.create(
  model="gpt-4-0125-preview",
  messages=[
    {"role": "system", "content": "You are a helpful regulatory assistant, who is capable to go into details."},
    {"role": "user", "content": "Please outline the most important differences between the two documents."},
    {"role": "assistant", "content": "Please provide the first document."},
    {"role": "user", "content": f"This is the ECB Guide from 2024: {new_guide_text}"},
    {"role": "assistant", "content": "Please provide the second document."},
    {"role": "user", "content": f"This is the ECB Guide from 2019: {old_guide_text}"}
  ]
)

display(Markdown(response.choices[0].message.content))