In [1]:
import PyPDF2
import pandas as pd
from pypdf import PdfReader
import re

pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
def extract_all_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text()
    return full_text

pdf_path = "ssm.pubcon230622_guide.en.pdf"
text = extract_all_text_from_pdf(pdf_path)

# If you want to save this to a .txt file
with open('trim_guide_pypdf.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [3]:
def process_text_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Add a sentinel line at the end to make sure we process the last line correctly
    lines.append('Sentinel line')

    processed_lines = []
    i = 0
    while i < len(lines) - 1:
        line = lines[i].strip()
        
        # Check for the next line to account for Rule 2
        next_line = lines[i + 1].strip()

        # Rule 1
        if re.match(r'^\d+\.\d+ ', line):
            processed_lines.append('\n')
            processed_lines.append('### ' + line + '\n')
            processed_lines.append('\n')
        # Rule 3
        elif re.match(r'^\d+\.\d+\.\d+ ', line):
            processed_lines.append('\n')
            processed_lines.append('#### ' + line + '\n')
            processed_lines.append('\n')
        # Rule 2
        elif re.match(r'^\d+ ', line) and re.match(r'^\d+\.\d+ ', next_line):
            processed_lines.append('\n')
            processed_lines.append('## ' + line + '\n')
            processed_lines.append('\n')
        # Rule 4
        elif re.match(r'^\d+\.', line):
            processed_lines.append('\n')
            processed_lines.append(line + '\n')
        else:
            processed_lines.append(line + '\n')

        i += 1

    # Write to the modified file
    with open('trim_guide_pypdf_mod.txt', 'w', encoding='utf-8') as f:
        f.writelines(processed_lines)

# Call the function
filename = 'trim_guide_pypdf.txt'
process_text_file(filename)


In [4]:
# Extract headings and body
def extract_data(text):
    lines = text.split('\n')
    data = []
    level_1_label, level_2_label, level_3_label, level_4_label = None, None, None, None
    body_buffer = []
    special = False
    in_special_mode = False

    for i, line in enumerate(lines):
        stripped_line = line.strip()

        # Detect start of special paragraph
        if stripped_line.startswith("&&"):
            in_special_mode = True
            stripped_line = stripped_line[2:].strip()  # remove the "&&"

        # Detect end of special paragraph
        if stripped_line.endswith("^^") and in_special_mode:
            in_special_mode = False
            stripped_line = stripped_line[:-2].strip()  # remove the "^^"
            special = True

        # If in special mode, just append to body buffer and continue
        if in_special_mode:
            body_buffer.append(stripped_line)
            continue

        # Detect headings based on our markers
        if stripped_line.startswith("#####"):
            level_4_label = stripped_line.replace("#####", "").strip()
        elif stripped_line.startswith("####"):
            level_3_label = stripped_line.replace("####", "").strip()
            level_4_label = None
        elif stripped_line.startswith("###"):
            level_2_label = stripped_line.replace("###", "").strip()
            level_3_label, level_4_label = None, None
        elif stripped_line.startswith("##"):
            level_1_label = stripped_line.replace("##", "").strip()
            level_2_label, level_3_label, level_4_label = None, None, None
        else:
            body_buffer.append(stripped_line)

        # Empty line or end of special paragraph indicates end of a paragraph
        if (not stripped_line or special) and body_buffer:
            data.append([level_1_label, level_2_label, level_3_label, level_4_label, ' '.join(body_buffer), special])
            body_buffer = []
            special = False

    # Add the last buffered body text if present
    if body_buffer:
        data.append([level_1_label, level_2_label, level_3_label, level_4_label, ' '.join(body_buffer), special])

    return data

In [5]:
with open('trim_guide_pypdf_mod.txt', 'r', encoding='utf-8') as file:
    text = file.read()

data = extract_data(text)

# Convert data to Pandas DataFrame
df = pd.DataFrame(data, columns=['Level_1_Label', 'Level_2_Label', 'Level_3_Label', 'Level_4_Label', 'Body', 'Special'])

df.head(30)

Unnamed: 0,Level_1_Label,Level_2_Label,Level_3_Label,Level_4_Label,Body,Special
0,,,,,"ECB guide to internal models June 2023 Document releases Release Date of issue Release number Sections/Chapters modified Rationale underlying the release Guide for the Targeted Review of Internal Models (TRIM) – First version 17/02/2017 1.0 - Public Consultation – ECB guide to internal models – general topics chapter 28/03/2018 2.0a General topics Incorporation of institutions’ feedback on version 1.0, outcomes of the supervisory review on general topics, and related horizontal an...",False
1,,,,,ECB guide to internal models – Foreword 3 Foreword,False
2,,,,,"1. Articles 143, 283 and 363 of Regulation (EU) No 575/2013 (CRR)1 require the European Central Bank (ECB) to grant permission to use internal models for credit risk, counterparty credit risk and market risk where the requirements set out in the corresponding chapters of the CRR are met by the institution(s) concerned. Based on the current applicable European Union (EU) and national law, the ECB guide to internal models provides transparency on how the ECB understands those rules and how it ...",False
3,,,,,"2. The guide is also intended as a document for the internal use of the different supervisory teams, with the aim of ensuring a common and consistent approach to matters related to internal models. When applying the relevant regulatory framework in specific cases, the ECB will take into due consideration the particular circumstances of the institution concerned.",False
4,,,,,"3. This guide should not be construed as going beyond the current existing applicable EU law including, am ong others, adopted regulatory technical standards (RTS), and national law and therefore is not intended to replace or overrule applicable EU and national law. In accordance with the requirements set out in the CRR, the European Banking Authority (EBA) has drafted RTS. These include the Final draft RTS on the specification of the assessment meth odology for competent authorities regar...",False
5,,,,,"1 Regulation (EU) No 575/2013 of the European Parliament and of the Council of 26 June 2013 on prudential requirements for credit institutions and investment firms and amending Regulation (EU) No 648/2012 (OJ L 176, 27.6.2013 , p. 1), referred to in this guide as the “CRR” . 2 Final draft Regulatory Technical Standards on the specification of the assessment methodology for competent authorities regarding compliance of an institution with the requirements to use internal models for market ...",False
6,,,,,,False
7,,,,,"3 Commission Delegated Regulation (EU) 2021/930 of 1 March 2021 supplementing the CR R with regard to regulatory technical standards specifying the nature, severity and duration of an economic downturn referred to in Article 181(1), point (b), and Article 182(1), point (b), of that Regulation (OJ L 204, 10.6.2021, p. 1). 4 Commission Dele gated Regulation (EU) 2022/439 of 20 October 2021 supplementing Regulation (EU) No 575/2013 of the European Parliament and of the Council with regard to ...",False
8,1 Overarching principles for internal models,,,,,False
9,1 Overarching principles for internal models,,,,,False


In [6]:
xxx

NameError: name 'xxx' is not defined

1. **Rule 1:** For lines that have a format of "x.y [text]" (like "1.1 Relevant regulatory references"):
   - Insert an empty line before and after this line.
   - Prefix the line with "###".
   
2. **Rule 2:** For lines that have a format of "x [text]" (like "1 Overarching principles for internal models") AND the immediate next line follows the format of Rule 1:
   - Insert an empty line before and after this line.
   - Prefix the line with "##".
   
3. **Rule 3:** For lines that have a format of "x.x.x [text]" (like "2.1.1 Some other text"):
   - Insert an empty line before and after this line.
   - Prefix the line with "####".

4. **Rule 4:** For lines that have a format of "x. [text]" (like "5. Add an empty line in front of it"):
   - Insert an empty line before this line.

In [None]:
this is not