In [1]:
import pandas as pd
from pypdf import PdfReader
import re
from pdfminer.high_level import extract_text

pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
# Extract document using different readers
def extract_pdf_pypdf(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text()
    return full_text

def extract_pdf_pdfminer(pdf_path):
    return extract_text(pdf_path)

def save_to_txt(text, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

# Extract and save files
pdf_path = "ssm.pubcon230622_guide.en.pdf"
text = extract_pdf_pypdf(pdf_path)
text_pdfminer = extract_pdf_pdfminer(pdf_path)

save_to_txt(text, 'trim_guide_pypdf.txt')
save_to_txt(text_pdfminer, 'trim_guide_pdfminer.txt')

In [3]:
def count_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        
    # Use regex to tokenize the text and find words with more than 3 letters
    words = re.findall(r'\b\w{4,}\b', text)
    return len(words)

# Paths to the three text files
file1 = 'trim_guide_pypdf.txt'
file2 = 'trim_guide_pdfminer.txt'
file3 = 'ssm.pubcon230622_guide.en.txt'

# Get word counts for each file
count1 = count_words(file1)
count2 = count_words(file2)
count3 = count_words(file3)

# Print comparison
print(f"Words with more than 3 letters in {file1}: {count1}")
print(f"Words with more than 3 letters in {file2}: {count2}")
print(f"Words with more than 3 letters in {file3}: {count3}")

Words with more than 3 letters in trim_guide_pypdf.txt: 64221
Words with more than 3 letters in trim_guide_pdfminer.txt: 64181
Words with more than 3 letters in ssm.pubcon230622_guide.en.txt: 62748


In [4]:
# Remove header and footer
def remove_ecb_guide_from_file(filename):
    # Pattern to match the unwanted lines
    pattern = r'^ECB guide to internal models\s*–.*?\d+'
    
    # Read the file line by line and clean each line
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        cleaned_lines = [re.sub(pattern, '', line) for line in lines]
    
    # Write the cleaned lines back to the file
    with open('trim_guide_pypdf_no_footer.txt', 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

# Call the function
filename = "trim_guide_pypdf.txt"
remove_ecb_guide_from_file(filename)


In [5]:
def process_text_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Add a sentinel line at the end to make sure we process the last line correctly
    lines.append('Sentinel line')

    processed_lines = []
    i = 0
    while i < len(lines) - 1:
        line = lines[i].strip()
        
        # New Rule: Appending # to a sentence if the next non-empty line starts with 1 and matches the criteria of Rule 2
        j = i + 1
        while j < len(lines) and not lines[j].strip():  # Skip empty lines
            j += 1
        
        if j < len(lines) and lines[j].startswith('1') and re.match(r'^\d+ [A-Z]', lines[j]) and re.match(r'^[A-Z]', line):
            processed_lines.append('# ' + line + '\n')
        # Rule 1
        elif re.match(r'^\d+\.\d+ [A-Z]', line):
            processed_lines.append('\n')
            processed_lines.append('### ' + line + '\n')
            processed_lines.append('\n')
        # Rule 3
        elif re.match(r'^\d+\.\d+\.\d+ [A-Z]', line):
            processed_lines.append('\n')
            processed_lines.append('#### ' + line + '\n')
            processed_lines.append('\n')
        # Rule 2 todo
        elif re.match(r'^\d+ [A-Z]', line) and (re.match(r'^\d+\.\d+ [A-Z]', lines[i + 1]) or re.match(r'^\d+\. [A-Z]', lines[i + 1])):
            processed_lines.append('\n')
            processed_lines.append('## ' + line + '\n')
            processed_lines.append('\n')
        # Rule 4
        elif re.match(r'^\d+\.', line):
            processed_lines.append('\n')
            processed_lines.append(line + '\n')
        else:
            processed_lines.append(line + '\n')

        i += 1

    # Write to the modified file
    with open('trim_guide_pypdf_mod.txt', 'w', encoding='utf-8') as f:
        f.writelines(processed_lines)

# Call the function
filename = 'trim_guide_pypdf_no_footer.txt'
process_text_file(filename)


In [6]:
def extract_data(text):
    lines = text.split('\n')
    data = []
    level_0_label, level_1_label, level_2_label, level_3_label, level_4_label = None, None, None, None, None
    body_buffer = []
    special = False
    in_special_mode = False

    for i, line in enumerate(lines):
        stripped_line = line.strip()

        # Detect start of special paragraph
        if stripped_line.startswith("&&"):
            in_special_mode = True
            stripped_line = stripped_line[2:].strip()  # remove the "&&"

        # Detect end of special paragraph
        if stripped_line.endswith("^^") and in_special_mode:
            in_special_mode = False
            stripped_line = stripped_line[:-2].strip()  # remove the "^^"
            special = True

        # If in special mode, just append to body buffer and continue
        if in_special_mode:
            body_buffer.append(stripped_line)
            continue

        # Detect headings based on our markers
        if stripped_line.startswith("######"):
            level_5_label = stripped_line.replace("######", "").strip()
        elif stripped_line.startswith("#####"):
            level_4_label = stripped_line.replace("#####", "").strip()
            level_5_label = None
        elif stripped_line.startswith("####"):
            level_3_label = stripped_line.replace("####", "").strip()
            level_4_label, level_5_label = None, None
        elif stripped_line.startswith("###"):
            level_2_label = stripped_line.replace("###", "").strip()
            level_3_label, level_4_label, level_5_label = None, None, None
        elif stripped_line.startswith("##"):
            level_1_label = stripped_line.replace("##", "").strip()
            level_2_label, level_3_label, level_4_label, level_5_label = None, None, None, None
        elif stripped_line.startswith("#"):
            level_0_label = stripped_line.replace("#", "").strip()
            level_1_label, level_2_label, level_3_label, level_4_label, level_5_label = None, None, None, None, None
        else:
            body_buffer.append(stripped_line)

        # Empty line or end of special paragraph indicates end of a paragraph
        if (not stripped_line or special) and body_buffer:
            # Exclude blank lines from body buffer
            cleaned_body_buffer = [line for line in body_buffer if line]
            if cleaned_body_buffer:
                data.append([level_0_label, level_1_label, level_2_label, level_3_label, level_4_label, ' '.join(body_buffer), special])
            body_buffer = []
            special = False

    # Add the last buffered body text if present
    if body_buffer:
        data.append([level_0_label, level_1_label, level_2_label, level_3_label, level_4_label, ' '.join(body_buffer), special])

    # Convert data to Pandas DataFrame
    df = pd.DataFrame(data, columns=['Level_0_Label', 'Level_1_Label', 'Level_2_Label', 'Level_3_Label', 'Level_4_Label', 'Body', 'Special'])

    def starts_with_integer(s):
        return 1 if re.match(r'^\d+\.', s) else 0

    # Count words with 3 or more letters
    df['num_of_words'] = df['Body'].apply(lambda x: sum(1 for word in x.split() if len(word) >= 3))
    
    # Add a new column based on the condition
    df['In scope'] = df['Body'].apply(starts_with_integer)
    
    # Check if the string ends with a .
    df['Body'] = df['Body'].str.strip()
    df['ends_with_period'] = df['Body'].str.endswith('.')

    return df


In [7]:
def clean_text_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Cleans text columns based on specified patterns.
    
    Parameters:
    - df: DataFrame containing the columns to clean.
    - columns: List of column names to apply the cleaning on.
    
    Returns:
    - DataFrame with cleaned columns.
    """
    for col in columns:
        df[col] = df[col].str.replace('  ', ' ', regex=True) \
                         .str.replace(' -', '-', regex=True) \
                         .str.replace('regulato ry', 'regulatory', regex=False) \
                         .str.replace('B anking', 'Banking', regex=False) \
                         .str.replace('t he', 'the', regex=False) \
                         .str.replace('fo llow', 'follow', regex=False) \
                         .str.replace('\(  ', '(', regex=True) 
    return df

In [8]:
with open('trim_guide_pypdf_mod.txt', 'r', encoding='utf-8') as file:
    text = file.read()

df = extract_data(text)
columns_to_clean = ['Body', 'Level_0_Label', 'Level_1_Label', 'Level_2_Label']
df = clean_text_columns(df, columns_to_clean)

df.to_excel('trim_guide_v2.xlsx', index=False)

df.head(30)

Unnamed: 0,Level_0_Label,Level_1_Label,Level_2_Label,Level_3_Label,Level_4_Label,Body,Special,num_of_words,In scope,ends_with_period
0,Counterparty credit risk 215,,,,,"ECB guide to internal models June 2023 Document releases Release Date of issue Release number Sections/Chapters modified Rationale underlying the release Guide for the Targeted Review of Internal Models (TRIM) – First version 17/02/2017 1.0- Public Consultation – ECB guide to internal models – general topics chapter 28/03/2018 2.0a General topics Incorporation of institutions’ feedback on version 1.0, outcomes of the supervisory review on general topics, and related horizontal analyses Publi...",False,379,0,False
1,Counterparty credit risk 215,,,,,Foreword,False,1,0,False
2,Counterparty credit risk 215,,,,,"1. Articles 143, 283 and 363 of Regulation (EU) No 575/2013 (CRR)1 require the European Central Bank (ECB) to grant permission to use internal models for credit risk, counterparty credit risk and market risk where the requirements set out in the corresponding chapters of the CRR are met by the institution(s) concerned. Based on the current applicable European Union (EU) and national law, the ECB guide to internal models provides transparency on how the ECB understands those rules and how it ...",False,79,1,True
3,Counterparty credit risk 215,,,,,"2. The guide is also intended as a document for the internal use of the different supervisory teams, with the aim of ensuring a common and consistent approach to matters related to internal models. When applying the relevant regulatory framework in specific cases, the ECB will take into due consideration the particular circumstances of the institution concerned.",False,46,1,True
4,Counterparty credit risk 215,,,,,"3. This guide should not be construed as going beyond the current existing applicable EU law including, am ong others, adopted regulatory technical standards (RTS), and national law and therefore is not intended to replace or overrule applicable EU and national law. In accordance with the requirements set out in the CRR, the European Banking Authority (EBA) has drafted RTS. These include the Final draft RTS on the specification of the assessment meth odology for competent authorities regardi...",False,153,1,True
5,Counterparty credit risk 215,,,,,"1 Regulation (EU) No 575/2013 of the European Parliament and of the Council of 26 June 2013 on prudential requirements for credit institutions and investment firms and amending Regulation (EU) No 648/2012 (OJ L 176, 27.6.2013 , p. 1), referred to in this guide as the “CRR” . 2 Final draft Regulatory Technical Standards on the specification of the assessment methodology for competent authorities regarding compliance of an institution with the requirements to use internal models for market ris...",False,246,0,True
6,General topics,,,,,"3 Commission Delegated Regulation (EU) 2021/930 of 1 March 2021 supplementing the CR R with regard to regulatory technical standards specifying the nature, severity and duration of an economic downturn referred to in Article 181(1), point (b), and Article 182(1), point (b), of that Regulation (OJ L 204, 10.6.2021, p. 1). 4 Commission Dele gated Regulation (EU) 2022/439 of 20 October 2021 supplementing Regulation (EU) No 575/2013 of the European Parliament and of the Council with regard to re...",False,150,0,True
7,General topics,1 Overarching principles for internal models,1.1 Relevant regulatory references,,,"Table 1 Date of issue Article Paragraph/Point Legal background Commission Delegated Regulation (EU) No 2022/439 20/10/2021 3, 9, 10, 12, 13, 14, 16, 17, 30, 31, 32 CRD IV,1 as implemented in the relevant national law 26/06/2013 3, 76, 85 Paragraph 1, sub- paragraphs 7, 9 and 11 CRR 26/06/2013 175, 179, 185, 189, 190, 191, 287, 288, 292, 293, 368 Other references Final draft RTS on assessment methodology for IMA and significant share2 22/11/2016 7-34 EBA Guidelines on SREP3 18/03/2022 235 EBA...",False,72,0,False
8,General topics,1 Overarching principles for internal models,1.1 Relevant regulatory references,,,"Currently the RTS on assessment methodology for IMA and significant share only exist in the final draft version. Once adopted, these RTS will become an additional relevant regulatory reference for this guide.",False,29,0,True
9,General topics,1 Overarching principles for internal models,1.1 Relevant regulatory references,,,"1. The principles listed in this section relate to internal models that are subject to supervisory approval for the calculation of own funds requirements for credit, market and counterparty credit risk (Pillar 1 models), unless stated otherwi se.",False,32,1,True


1. **Rule 1:** For lines that have a format of "x.y [text]" (like "1.1 Relevant regulatory references"):
   - Insert an empty line before and after this line.
   - Prefix the line with "###".
   
2. **Rule 2:** For lines that have a format of "x [text]" (like "1 Overarching principles for internal models") AND the immediate next line follows the format of Rule 1:
   - Insert an empty line before and after this line.
   - Prefix the line with "##".
   
3. **Rule 3:** For lines that have a format of "x.x.x [text]" (like "2.1.1 Some other text"):
   - Insert an empty line before and after this line.
   - Prefix the line with "####".

4. **Rule 4:** For lines that have a format of "x. [text]" (like "5. Add an empty line in front of it"):
   - Insert an empty line before this line.

In [9]:
df['Level_1_Label'].unique()

array([None, '1 Overarching principles for internal models',
       '2 Roll-out and permanent partial use', '3 Internal governance',
       '4 Internal validation', '5 Internal audit', '6 Model use',
       '7 Management of changes to the IRB approach',
       '8 Third-party involvement', '1 Scope of the credit risk chapter',
       '2 Data maintenance for the IRB approach', '3 Use of data',
       '4 Definition of default', '5 Probability of default',
       '6 Loss given default', '7 Conversion factors',
       '8 Model-related MoC', '9 Review of estimates',
       '10 Calculation of m aturity for non-retail exposures',
       '1 Scope of the market risk chapter',
       '2 Scope of the internal model approach',
       '3 Regulatory back-testing of VaR models',
       '4 Aspects of internal validation of market risk models',
       '5 Methodology for VaR and stressed VaR',
       '6 Methodology for IRC models focusing on default risk',
       '7 Risks-not-in-the-model engines',
     

In [10]:
df['Level_2_Label'].unique()

array([None, '1.1 Relevant regulatory references',
       '1.2 Guidelines at consolidated and subsidiary levels',
       '1.3 Documentation of internal models',
       '1.4 Implementation of a model risk management framework',
       '1.5 Identification of management body and senior management',
       '1.6 General principles for internal validation',
       '1.7 General principles for internal audit',
       '1.8 General principles on climate-related and environmental risks',
       '1.9 General principles for the implementation of a changed or',
       '2.1 Relevant regulatory references',
       '2.2 Application of the IRB approach',
       '2.3 Governance of the roll-out plan for the IRB approach',
       '2.4 Changes to the roll-out plan for the IRB approach',
       '2.5 Monitoring of compliance with permanent partial use provisions',
       '2.6 Reversion to a less sophisticated app roach',
       '2.7 Internal models in the context of consolidations',
       '3.1 Relevant regul

In [11]:
df['Level_0_Label'].unique()

array(['Counterparty credit risk 215', 'General topics', 'Credit risk',
       'Market risk', 'Counterparty credit risk'], dtype=object)