<a href="https://colab.research.google.com/github/alexandrastna/AI-for-ESG/blob/main/Notebooks/3_Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Thesis 3 - Sentence Extraction from PDF Reports
This notebook extracts clean and meaningful sentences from PDF reports (Annual Reports, Sustainability Reports, etc.) using PyMuPDF and spaCy.
Each document is processed page by page, removing repetitive headers/footers and excluding index pages.
The final output is a CSV file containing all valid sentences, ready for NLP classification.

In [None]:
# Install PyMuPDF and spaCy
!pip install pymupdf
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Res

In [None]:
# Step 0 – Mount Google Drive (Colab only).
# The metadata CSV read below and the exported sentence files all live under
# /content/drive, so this cell has to run before any of them are accessed.
# Dependencies are installed in the previous cell, not here.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 1 – Import libraries
import pandas as pd
import os
import fitz  # PyMuPDF
import spacy
import re
from tqdm import tqdm

In [None]:
# Step 2 – Load SpaCy and the document metadata CSV
# The pipeline is only used for sentence segmentation (`doc.sents`), which is
# driven by the dependency parser. Named-entity recognition and lemmatization
# are never read, so they are disabled to avoid paying for them on every page.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
csv_path = "/content/drive/MyDrive/Thèse Master/Data/df_merged_clean.csv"
df_merged = pd.read_csv(csv_path)

# Fail fast if the metadata file does not have the columns the extraction loop
# reads; otherwise the problem would only show up as a KeyError inside the loop.
_required_columns = {"Company", "Year", "Document Type", "Path"}
_missing_columns = _required_columns - set(df_merged.columns)
if _missing_columns:
    raise KeyError(
        f"{csv_path} is missing required column(s): {sorted(_missing_columns)}"
    )

# Step 3 – Dynamic header/footer cleaning function
def clean_redundant_headers(text, company, doc_type, year):
    """Strip repetitive header/footer fragments from a page of text.

    The following fragments are removed wherever they occur (case-insensitive):
    the company name, "<doc_type> <year>", "<doc_type> Report <year>",
    "<doc_type> <any 4 digits>", the year on its own, and page markers such as
    "Page 3" or "Page 3 of 10".

    Args:
        text: Raw text of one page.
        company: Company name; ``None``/NaN is treated as "nothing to remove".
        doc_type: Document type label (e.g. "Annual Report"); ``None``/NaN is
            treated as "nothing to remove".
        year: Reporting year. May be an int, a string, or a float such as
            ``2022.0`` (what pandas yields for an integer column containing
            missing values); integral floats are normalised to "2022".

    Returns:
        The text with the fragments removed. Surrounding whitespace is left
        untouched, so removed fragments may leave double spaces behind.
    """
    def _to_text(value):
        # Missing metadata (None or a float NaN) contributes no pattern at all.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    company_escaped = re.escape(_to_text(company))
    doc_type_escaped = re.escape(_to_text(doc_type))

    # Normalise the year: "2022.0" must become "2022", otherwise the pattern
    # would never match the year as it is printed in the document.
    year_text = _to_text(year).strip()
    try:
        year_number = float(year_text)
    except ValueError:
        year_number = None
    if year_number is not None and year_number.is_integer():
        year_text = str(int(year_number))

    # The digit look-arounds stop the year from being cut out of the middle of
    # a longer number (e.g. "20221" or "120223").
    year_pattern = fr"(?<!\d){re.escape(year_text)}(?!\d)" if year_text else ""

    # Patterns are only built from non-empty pieces: an empty doc_type would
    # otherwise turn "<doc_type> \d{4}" into " \d{4}" and delete every
    # four-digit number on the page.
    patterns = []
    if company_escaped:
        patterns.append(company_escaped)
    if doc_type_escaped:
        if year_pattern:
            patterns.append(fr"{doc_type_escaped} {year_pattern}")
            patterns.append(fr"{doc_type_escaped} Report {year_pattern}")
        patterns.append(fr"{doc_type_escaped} \d{{4}}")
    if year_pattern:
        patterns.append(year_pattern)
    patterns.append(r"Page\s\d+(\s+of\s+\d+)?")

    for pattern in patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    return text

# Step 4 – Clean sentence extraction function
def extract_sentences_from_pdf_clean(path, company, doc_type, year, index_threshold=10):
    """Extract cleaned sentences from a PDF, page by page.

    For each page the plain text is extracted, pages that look like an index /
    table of contents are skipped, repetitive header fragments are removed with
    ``clean_redundant_headers``, and the remainder is split into sentences with
    the module-level spaCy pipeline ``nlp``.

    Args:
        path: Path of the PDF file to open.
        company: Company name, forwarded to ``clean_redundant_headers``.
        doc_type: Document type, forwarded to ``clean_redundant_headers``.
        year: Reporting year, forwarded to ``clean_redundant_headers``.
        index_threshold: A page is skipped when it contains more than this
            many "word, number" occurrences.

    Returns:
        A list of sentence strings. If anything fails while reading or parsing
        the file, the error is printed and an empty list is returned (sentences
        collected before the failure are discarded).
    """
    try:
        sentences = []

        # The context manager closes the document even when a page fails, so
        # file handles do not accumulate over a run of several hundred PDFs.
        with fitz.open(path) as doc:
            for page in doc:
                text = page.get_text("text")

                # Skip index-heavy pages (table of contents)
                pattern_count = len(re.findall(r'\b\w+,\s*\d+', text))
                if pattern_count > index_threshold:
                    continue

                # Clean up repetitive content
                text = clean_redundant_headers(text, company, doc_type, year)

                # Sentence segmentation
                doc_spacy = nlp(text)
                for sent in doc_spacy.sents:
                    # Line breaks inside a sentence are layout artefacts of the PDF.
                    s = sent.text.strip().replace("\n", " ")

                    # Noise filters: too short, all caps (headings), no letters
                    # at all, or not starting with a letter. The length check
                    # comes first, so `s[0]` is never evaluated on an empty string.
                    if (
                        len(s) < 30 or
                        s.isupper() or
                        re.fullmatch(r"[\W\d\s]+", s) or
                        not s[0].isalpha()
                    ):
                        continue

                    sentences.append(s)

        return sentences

    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch.
        print(f"❌ Error parsing {path}: {e}")
        return []

# Step 5 – Apply extraction to all documents
rows = []

for idx, row in tqdm(df_merged.iterrows(), total=len(df_merged)):
    company = row['Company']
    year = row['Year']
    doc_type = row['Document Type']
    path = row['Path']

    # A missing path (e.g. an empty CSV cell loaded as NaN) is not a string and
    # would make os.path.isfile raise TypeError, aborting the whole run.
    # It is reported and skipped like any other file that cannot be found.
    if not isinstance(path, str) or not os.path.isfile(path):
        print(f"⚠️ Fichier non trouvé : {path}")
        continue

    sents = extract_sentences_from_pdf_clean(path, company, doc_type, year)
    for sent in sents:
        rows.append({
            "company": company,
            "year": year,
            "document_type": doc_type,
            "sentence": sent
        })

# Step 6 – Export results to Drive
# Columns are given explicitly so the CSV still has its header row (and the
# frame its expected columns) when no sentence at all was extracted.
df_sentences = pd.DataFrame(rows, columns=["company", "year", "document_type", "sentence"])
output_csv = "/content/drive/MyDrive/Thèse Master/Exports2/parsed_sentences.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_sentences.to_csv(output_csv, index=False)

print(f"✅ Export terminé vers : {output_csv}")



 96%|█████████▌| 193/202 [28:32<00:21,  2.37s/it]

# 🧹 Post-processing cleanup after manual checks

In [None]:
import os

import pandas as pd

# Load the extracted sentences.
# The manually checked Excel workbook is used when it exists. Nothing in this
# notebook creates that workbook, so on a fresh run fall back to the CSV
# written by the extraction step instead of failing with FileNotFoundError.
sentences_xlsx = "/content/drive/MyDrive/Thèse Master/Exports2/parsed_sentences.xlsx"
sentences_csv = "/content/drive/MyDrive/Thèse Master/Exports2/parsed_sentences.csv"
if os.path.isfile(sentences_xlsx):
    df_sentences = pd.read_excel(sentences_xlsx)
else:
    df_sentences = pd.read_csv(sentences_csv)

# 1. Keep only sentences between 10 and 1000 characters (both bounds included).
#    Rows whose sentence is missing have no length and are dropped here too.
df_sentences = df_sentences[df_sentences["sentence"].str.len().between(10, 1000)]

# 2. Remove sentences made only of non-alphabetic characters
df_sentences = df_sentences[~df_sentences["sentence"].str.contains(r'^[^A-Za-z]*$', na=False)]

# 3. Remove numeric-only sentences
#    (already implied by step 2; kept as an explicit safeguard)
df_sentences = df_sentences[~df_sentences["sentence"].str.match(r"^\d+$", na=False)]

# 4. Remove sentences with more than 15 special characters
symbols = set("!@#$%^&*()[]{}:;,.?~`+=|\\/<>-")
def count_symbols(text):
    """Return how many characters of `text` belong to the `symbols` set."""
    return sum(1 for char in str(text) if char in symbols)

df_sentences = df_sentences[df_sentences["sentence"].apply(count_symbols) <= 15]

# Reset index
df_sentences = df_sentences.reset_index(drop=True)

# Save cleaned CSV (this overwrites the raw CSV produced by the extraction step)
df_sentences.to_csv(sentences_csv, index=False)
print(f"✅ Nettoyage terminé : {len(df_sentences)} phrases sauvegardées.")


✅ Nettoyage terminé : 201247 phrases sauvegardées.
