In [None]:
# Download the small (sm) and medium (md) English spaCy pipelines.
# NOTE(review): these commands need spaCy itself to be installed already, but the
# pip install cell comes after this one - confirm the intended run order.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [115]:
!pip install PyPDF2 pdfplumber spacy nltk transformers torch

Successfully installed filelock-3.16.1 fsspec-2024.9.0 huggingface-hub-0.25.1 mpmath-1.3.0 networkx-3.3 safetensors-0.4.5 sympy-1.13.3 tokenizers-0.20.0 torch-2.4.1 transformers-4.45.1


In [139]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from IPython.display import IFrame
import pdfplumber
import spacy
import PyPDF2
import re
import torch

from collections import Counter
from transformers import pipeline

# Extract text from PDF using pdfplumber
def pdf_to_text_pdfplumber(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with no separator. A page for which
        ``extract_text()`` returns ``None`` contributes nothing.
    """
    with pdfplumber.open(file_path) as pdf:
        # `or ""` keeps a page without extractable text from raising
        # TypeError in the string concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Extract text from PDF using PyPDF2
def deprecated_pdf_to_text_pypdf2(file_path):
    """Return the concatenated text of all pages of a PDF, read with PyPDF2.

    Args:
        file_path: Path of the PDF file; opened in binary mode.

    Returns:
        str: The page texts joined in page order with no separator.
    """
    with open(file_path, 'rb') as pdf_file:
        all_pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction happens while the file is still open.
        return "".join(
            all_pages[index].extract_text() for index in range(len(all_pages))
        )

def pdf_to_text_pypdf2(pdf_path):
    """Read a PDF with PyPDF2 and return its text, one page per line block.

    Args:
        pdf_path: Path of the PDF file; opened in binary mode.

    Returns:
        str: Each page's extracted text followed by a newline, in page order.
    """
    with open(pdf_path, 'rb') as handle:
        page_chunks = [
            page.extract_text() + '\n'
            for page in PyPDF2.PdfReader(handle).pages
        ]
    return ''.join(page_chunks)

In [117]:
# Loaded spaCy pipelines keyed by model name, so each is read from disk only once.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline `name`, loading it on first use only."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def find_with_spacy_label(text):
    """Return the text of every ORG or PRODUCT entity spaCy finds in `text`.

    Args:
        text: The string to run through the ``en_core_web_sm`` pipeline.

    Returns:
        list[str]: Entity texts in document order (duplicates kept).
    """
    # The pipeline is cached because this function is called once per line
    # by other helpers, and reloading the model each time is slow.
    doc = _get_spacy_model()(text)
    return [ent.text for ent in doc.ents if ent.label_ in ("ORG", "PRODUCT")]

In [132]:
def find_submission_vicinity(text, limit=10):
    """Find an organisation named on an early line that mentions "submission".

    Args:
        text: Full document text; split on newlines.
        limit: How many leading lines to inspect.

    Returns:
        The first entity that ``find_with_spacy_label`` reports on the first
        qualifying line that yields any entity, or None if there is none.
    """
    for candidate_line in text.split('\n')[:limit]:
        if "submission" not in candidate_line.lower():
            continue
        entities = find_with_spacy_label(candidate_line)
        if entities:
            return entities[0]
    return None

In [133]:
def find_in_footer(text):
    """Extract a capitalised multi-word name that follows a copyright marker.

    The marker is either the © sign or its ASCII form ``(c)`` / ``(C)``.

    Args:
        text: The text to search.

    Returns:
        The first run of two or more capitalised words (joined by whitespace
        and/or ``&``) after the marker, or None when nothing matches.
    """
    # [cC] accepts both spellings of the ASCII marker. IGNORECASE is not used
    # because the name part relies on case to recognise capitalised words.
    footer_pattern = r'(?:©|\([cC]\)).*?([A-Z][a-z]+(?:[\s&]+[A-Z][a-z]+)+)'
    match = re.search(footer_pattern, text)
    return match.group(1) if match else None

In [134]:
def find_acronym(text):
    """Return the first all-caps word (two or more letters) in the first 15 lines.

    Args:
        text: Full document text; split on newlines.

    Returns:
        The matched acronym, or None if none of the inspected lines has one.
    """
    acronym_re = re.compile(r'\b([A-Z]{2,})\b')
    hits = (acronym_re.search(row) for row in text.split('\n')[:15])
    first_hit = next((hit for hit in hits if hit), None)
    return first_hit.group(1) if first_hit else None

In [135]:
def find_in_signature(text):
    """Extract the name that follows a closing salutation near the end of `text`.

    Only the last 20 lines are searched. Each line is examined together with
    the line after it, so a name on the line below "Sincerely," is found.

    Args:
        text: Full document text; split on newlines.

    Returns:
        The first run of two or more capitalised words following "Sincerely",
        "Yours truly" or "Regards", or None when there is no such match.
    """
    last_page = text.split('\n')[-20:]  # Assume signature is in the last 20 lines
    signature_pattern = r'(?:Sincerely|Yours truly|Regards),?(?:\s*\n)*\s*([A-Z][a-z]+(?:[\s&]+[A-Z][a-z]+)+)'
    # max(..., 1) makes sure a single-line text is still inspected; without
    # it the range would be empty and that line would never be searched.
    for i in range(max(len(last_page) - 1, 1)):
        two_lines = ' '.join(last_page[i:i+2])
        match = re.search(signature_pattern, two_lines)
        if match:
            return match.group(1)
    return None

In [136]:
def find_participant(pdf_path):
    """Guess which organisation a PDF belongs to by majority vote.

    Candidates are collected from four text heuristics (an organisation on a
    line mentioning "submission", a copyright footer, an acronym, a signature
    block), from every ORG/PRODUCT entity spaCy finds in the whole text, and
    from every ORG entity the ``dslim/bert-base-NER`` model finds in the first
    1000 characters. The candidate seen most often wins.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        The most frequent candidate string, or None when no candidate was found.
    """
    text = pdf_to_text_pypdf2(pdf_path)

    potential_matches = []

    # Methods 1-4: "submission" vicinity, footer, acronym, signature.
    # Each returns one candidate or None.
    for finder in (find_submission_vicinity, find_in_footer, find_acronym, find_in_signature):
        candidate = finder(text)
        if candidate:
            potential_matches.append(candidate)

    # Method 5: Using NER to find all potential organizations
    potential_matches.extend(find_with_spacy_label(text))

    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

    # Use a pre-trained model for organization entity recognition as a final check
    snippet = text[:1000]  # Limit to first 1000 characters for efficiency
    if snippet.strip():
        # aggregation_strategy="simple" merges word-pieces and B-/I- tokens into
        # whole entities. Without it each token (e.g. "##BC") is its own result
        # and would be counted as a separate candidate.
        ner_pipeline = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            device=device,
            aggregation_strategy="simple",
        )
        ner_results = ner_pipeline(snippet)
        potential_matches.extend(
            result['word'] for result in ner_results if result['entity_group'] == 'ORG'
        )

    # Count occurrences and select the most common organization
    match_counts = Counter(potential_matches)
    return match_counts.most_common(1)[0][0] if match_counts else None

In [147]:
# Worked
#file_path = "./pdfs/Tenox-Consulting-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 3.pdf"
#file_path = "./pdfs/South-African-Youth-Council-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 1.pdf"

# Partially worked: returned "SARU"
#file_path = "./pdfs/SARU-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 30.pdf"
#
# Failed
#file_path = "./pdfs/UCT-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 5.pdf" 
#file_path = "./pdfs/University-of-Pretoria-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 4.pdf"
#file_path = "./pdfs/BMI-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 71.pdf"

#file_path = "your_pdf_file_path"
# IFrame(file_path, width=800, height=600)

In [148]:
# Run the extraction on the PDF selected via `file_path` and report the result.
# NOTE(review): every `file_path` assignment in the cell above is commented out,
# so one must be uncommented first or this raises NameError on a fresh kernel.
participant = find_participant(file_path)
print(f"The extracted participant is: {participant}")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The extracted participant is: SABC


In [None]:
#print(nlp.get_pipe("ner").labels)

In [None]:
# Load the raw text of the selected PDF; the exploratory cells below read `text`.
#text = pdf_to_text_pdfplumber(file_path)
text = pdf_to_text_pypdf2(file_path)
# print(text)

In [None]:
# Preprocess text: remove stop words, convert to lowercase
# `nlp` is not defined by any earlier cell, so it is loaded here, where it is
# first used. The medium model is chosen because the similarity ranking in
# step 6 relies on word vectors.
nlp = spacy.load("en_core_web_md")
stop_words = set(stopwords.words("english"))
tokens = word_tokenize(text.lower())
tokens = [t for t in tokens if t not in stop_words]
doc = nlp(" ".join(tokens))

In [None]:
# Process the text with SpaCy
# This rebinds `doc`, replacing the stop-word-filtered document built in the
# previous cell with one parsed from the raw text.
doc = nlp(text)

In [None]:
# 1. Rule-based Extraction: Look for patterns like "Submitted by:", "Prepared by:"
# For each trigger word directly followed by "by", keep the alphabetic tokens
# among the next four tokens as a candidate name.
possible_names_1 = []
for i, token in enumerate(doc):
    if token.text.lower() not in ("submitted", "prepared", "response", "input"):
        continue
    # Bounds check: a trigger word that is the very last token has no successor,
    # and indexing doc[i + 1] there would raise IndexError.
    if i + 1 < len(doc) and doc[i + 1].text.lower() == "by":
        possible_names_1.append(" ".join([t.text for t in doc[i+2:i+6] if t.text.isalpha()]))

print(possible_names_1)

In [None]:
# 2. Extract from Header/Footer
# Collect ORG entities line by line: the first 5 lines (header) are scanned
# first, then the last 5 lines (footer).
possible_names_2 = []
for line in text.split("\n")[:5] + text.split("\n")[-5:]:
    possible_names_2.extend(ent.text for ent in nlp(line).ents if ent.label_ == "ORG")

print(possible_names_2)

In [None]:
# 3. Extract from First Few Lines
# The first 10 lines are joined into one string and parsed as a single document.
possible_names_3 = [
    ent.text
    for ent in nlp(" ".join(text.split("\n")[:10])).ents
    if ent.label_ == "ORG"
]

print(possible_names_3)

In [None]:
# 4. Extract from Signature Section (New - Assumes signature is in last 10 lines)
# The last 10 lines are joined into one string and parsed as a single document.
possible_names_4 = [
    ent.text
    for ent in nlp(" ".join(text.split("\n")[-10:])).ents
    if ent.label_ == "ORG"
]

print(possible_names_4)

In [None]:
# 5. If no names found using above methods, extract all organizations
# The "if no names found" guard is disabled, so this always collects every ORG
# entity of the whole document.
possible_names_5 = []
#if not possible_names:
possible_names_5.extend(ent.text for ent in doc.ents if ent.label_ == "ORG")

print(possible_names_5)

In [None]:
# 6. Find the most similar name to the last few lines, first few lines, and the entire document (Modified)
# NOTE(review): `possible_names` was never assigned anywhere; it is assumed to be
# the de-duplicated union of the candidates from steps 1-5 - confirm that this
# is the intended candidate pool.
possible_names = [
    name
    for name in dict.fromkeys(
        possible_names_1 + possible_names_2 + possible_names_3
        + possible_names_4 + possible_names_5
    )
    if name.strip()  # step 1 can produce empty strings; skip blank candidates
]

most_similar_name = None  # stays None when there are no candidates to rank
if possible_names:
    last_lines = " ".join(text.split("\n")[-5:])
    first_lines = " ".join(text.split("\n")[:5])
    # Parse the two reference snippets once instead of once per candidate.
    last_lines_doc = nlp(last_lines)
    first_lines_doc = nlp(first_lines)

    def _edge_similarity(name):
        """Score a candidate by its similarity to the first and last lines."""
        name_doc = nlp(name)
        return name_doc.similarity(last_lines_doc) + name_doc.similarity(first_lines_doc)
        # Disabled term: + nlp(name).similarity(nlp(text))

    most_similar_name = max(possible_names, key=_edge_similarity)
print(most_similar_name)