In [26]:
!pip install PyPDF2 transformers torch spacy



In [28]:
import PyPDF2
from transformers import pipeline
import torch
import re
import spacy
from collections import Counter

# Extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text() + '\n'
    return text

In [29]:
import PyPDF2
from transformers import pipeline
import torch
import re
import spacy
from collections import Counter

def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ''
            for page in reader.pages:
                text += page.extract_text() + '\n'
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def extract_organizations_transformer(text, ner_pipeline):
    chunks = [text[i:i+500] for i in range(0, len(text), 400)]  # 100 character overlap
    
    all_entities = []
    for chunk in chunks:
        entities = ner_pipeline(chunk)
        all_entities.extend(entities)
    
    org_names = []
    current_org = ""
    for entity in all_entities:
        if entity['entity'].startswith('B-ORG'):
            if current_org:
                org_names.append(current_org.strip())
            current_org = entity['word']
        elif entity['entity'].startswith('I-ORG'):
            current_org += " " + entity['word']
        else:
            if current_org:
                org_names.append(current_org.strip())
                current_org = ""
    
    if current_org:
        org_names.append(current_org.strip())
    
    return org_names

def extract_organizations_spacy(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "ORG"]

def extract_organizations_regex(text):
    # This pattern looks for capitalized words that might represent an organization
    org_pattern = r'\b(?:[A-Z][a-z]+ )?(?:[A-Z][a-z]+ )*(?:Inc\.|LLC|Ltd\.|Company|Corporation|Corp\.|Organization|Association)\b'
    return re.findall(org_pattern, text)

def clean_org_name(name):
    # Remove special characters and extra whitespace
    cleaned = ' '.join(name.split())
    return cleaned

def extract_participant_org(pdf_path):
    # Determine the appropriate device
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    
    # Load NER models
    ner_pipeline_roberta = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", device=device)
    ner_pipeline_bert = pipeline("ner", model="dslim/bert-base-NER", device=device)
    
    text = extract_text_from_pdf(pdf_path)
    
    if not text:
        print("Failed to extract text from PDF. Please check the file and permissions.")
        return []
    
    print(f"Extracted text length: {len(text)} characters")
    print("First 500 characters of extracted text:")
    print(text[:500])
    
    # Extract organizations using multiple methods
    orgs_roberta = extract_organizations_transformer(text, ner_pipeline_roberta)
    orgs_bert = extract_organizations_transformer(text, ner_pipeline_bert)
    orgs_spacy = extract_organizations_spacy(text)
    orgs_regex = extract_organizations_regex(text)
    
    # Combine and clean all extracted organizations
    all_orgs = orgs_roberta + orgs_bert + orgs_spacy + orgs_regex
    cleaned_orgs = [clean_org_name(org) for org in all_orgs]
    
    # Count occurrences
    org_counts = Counter(cleaned_orgs)
    
    # Get the top 5 most common organizations
    top_orgs = org_counts.most_common(5)
    
    return top_orgs

In [30]:
# Worked
file_path = "./pdfs/Tenox-Consulting-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 3.pdf"
#file_path = "./pdfs/South-African-Youth-Council-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 1.pdf"

# Partially Worked
# Gave SARU
#file_path = "./pdfs/SARU-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 30.pdf"
#
# Failed
#file_path = "./pdfs/UCT-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 5.pdf" 
#file_path = "./pdfs/University-of-Pretoria-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 4.pdf"
#file_path = "./pdfs/BMI-submission-on-draft-sports-broadcasting-services-amendment-regulations-2018 - 71.pdf"

#file_path = "your_pdf_file_path"
# IFrame(file_path, width=800, height=600)

In [31]:

top_participant_orgs = extract_participant_org(file_path)

print("\nTop 5 potential participant organizations:")
for org, count in top_participant_orgs:
    print(f"{org}: {count} occurrences")

print(f"\nUsing device: {torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')}")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted text length: 7757 characters
First 500 characters of extracted text:
 Birchwood Court, West Wing  
43 Montrose Street  
Vorna Valley, 1686  
Tel (011) 655 7257  
Fax (011) 655  7011  
E-mail:  thami@tenox.co.za  
Cell:  083 222 5227  
Contact: Thami Xulu  Suite 10  
232 Boom Street  
Pietermaritzburg, 3201  
Tel (033) 342 0 765 
Fax 086  669 1758  
E-mail: njabulo@tenox.co.za  
Cell: 072  607 6050  
Contact: Njabulo Ndlovu  
Reg. No. 2014/072994/21 ;   VAT No. 4270260690 ;   SAIPA A ccredited Training Centre (ATC) No. 2526  
 
• Accounting        • Independent Re

Top 5 potential participant organizations:
SA ##IP ##A: 4 occurrences
Ten ##ox Management Consul ##tan ##cy Inc: 4 occurrences
Tenox Management Consultancy Inc.: 4 occurrences
ICASA: 4 occurrences
AT ##C: 3 occurrences

Using device: mps
