In [19]:
!pip install PyPDF2 transformers torch



In [20]:
import PyPDF2
import torch
from collections import Counter
from transformers import pipeline

# Extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text() + '\n'
    return text

In [21]:
def extract_organizations(text, ner_pipeline):
    # Split the text into chunks of 500 characters with 100 character overlap
    chunk_size = 500
    overlap = 100
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size-overlap)]
    
    all_entities = []
    for chunk in chunks:
        entities = ner_pipeline(chunk)
        all_entities.extend(entities)
    
    # Extract organization names
    org_names = []
    current_org = ""
    for entity in all_entities:
        if entity['entity'].startswith('B-ORG'):
            if current_org:
                org_names.append(current_org.strip())
            current_org = entity['word']
        elif entity['entity'].startswith('I-ORG'):
            current_org += " " + entity['word']
        else:
            if current_org:
                org_names.append(current_org.strip())
                current_org = ""
    
    if current_org:
        org_names.append(current_org.strip())
    
    return org_names

def clean_org_name(name):
    # Remove special characters and extra whitespace
    cleaned = ' '.join(name.split())
    return cleaned

def extract_participant_org(pdf_path):
    # Determine the appropriate device
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    
    # Load a more accurate NER model
    ner_pipeline = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", device=device)
    
    text = extract_text_from_pdf(pdf_path)
    org_names = extract_organizations(text, ner_pipeline)
    
    # Clean and count organization names
    cleaned_org_names = [clean_org_name(name) for name in org_names]
    org_counts = Counter(cleaned_org_names)
    
    # Get the top 3 most common organizations
    top_orgs = org_counts.most_common(3)
    
    return top_orgs

In [22]:

#file_path = "your_pdf_file_path"
# IFrame(file_path, width=800, height=600)

In [24]:
top_participant_orgs = extract_participant_org(file_path)
print("Top 3 potential participant organizations:")
for org, count in top_participant_orgs:
    print(f"{org}: {count} occurrences")

print(f"\nUsing device: {torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')}")

Top 3 potential participant organizations:

Using device: mps
