In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [None]:
!pip install PyPDF2 pdfplumber spacy nltk transformers torch
!pip install os
!pip install python-dotenv

In [80]:
from IPython.display import IFrame
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import PyPDF2
import re
import torch
import os
from dotenv import load_dotenv

from collections import Counter
from transformers import pipeline

In [81]:

def pdf_to_text_pypdf2(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text() + '\n'
    return text

def find_with_spacy_label(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT"]]

def find_submission_vicinity(text, limit=10):
    lines = text.split('\n')[:limit]
    for line in lines:
        if "submission" in line.lower():
            potential_orgs = find_with_spacy_label(line)
            if potential_orgs:
                return potential_orgs[0]
    return None

def find_in_footer(text):
    footer_pattern = r'(?:©|\(c\)|Copyright).*?([A-Z][a-z]+(?:[\s&]+[A-Z][a-z]+)+)'
    matches = re.findall(footer_pattern, text, re.IGNORECASE)
    return matches

def find_acronym(text):
    lines = text.split('\n')[:15]
    acronym_pattern = r'\b([A-Z]{2,})\b'
    acronyms = []
    for line in lines:
        matches = re.findall(acronym_pattern, line)
        acronyms.extend(matches)
    return acronyms

def find_in_signature(text):
    last_page = text.split('\n')[-30:]  # Last 30 lines
    signature_pattern = r'(?:Sincerely|Yours truly|Regards|Submitted by|On behalf of),?(?:\s*\n)*\s*([A-Z][a-z]+(?:[\s&]+[A-Z][a-z]+)+)'
    matches = []
    for i in range(len(last_page) - 1):
        three_lines = ' '.join(last_page[i:i+3])
        found_matches = re.findall(signature_pattern, three_lines, re.IGNORECASE)
        matches.extend(found_matches)
    return matches

def find_participant(pdf_path):
    text = pdf_to_text_pypdf2(pdf_path)
    
    potential_matches = []
    
    # Method 1: Organization with "submission" (high priority)
    with_submission = find_submission_vicinity(text)
    if with_submission:
        potential_matches.append((with_submission, 3))
    
    # Method 2: Organization in footer
    in_footer = find_in_footer(text)
    potential_matches.extend((match, 2) for match in in_footer)
    
    # Method 3: Organization acronym
    in_acronym = find_acronym(text)
    potential_matches.extend((match, 1) for match in in_acronym)
    
    # Method 4: Organization in signature
    in_signature = find_in_signature(text)
    potential_matches.extend((match, 2) for match in in_signature)
    
    # Method 5: Using NER to find all potential organizations
    with_spacy_label = find_with_spacy_label(text)
    potential_matches.extend((match, 1) for match in with_spacy_label)

    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

    # Use a pre-trained model for organization entity recognition as a final check
    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", device=device)
    ner_results = ner_pipeline(text[:2000])  # First 2000 characters
    ner_orgs = [result['word'] for result in ner_results if result['entity'] in ['B-ORG', 'I-ORG']]
    potential_matches.extend((match, 1) for match in ner_orgs)
    
    # Count occurrences, considering priorities
    match_counts = Counter()
    for match, priority in potential_matches:
        match_counts[match] += priority
    
    # Select the most likely participant
    if match_counts:
        most_common_match = max(match_counts.items(), key=lambda x: x[1])
        return most_common_match[0] if most_common_match[1] > 1 else None
    return None

In [82]:
load_dotenv()


files = {
         "tennox" : os.getenv("tennox"), 
         "sayc": os.getenv("sayc"), 
         "saru" : os.getenv("saru"), 
         "uct" : os.getenv("uct"), 
         "up":  os.getenv("up"), 
         "bmi" : os.getenv("bmi")
        }
# print(file_path_tennox)
# IFrame(file_path, width=800, height=600)

In [83]:
result = {}
for key, value in files.items():
    if value:  # Only call if value is not None
        result[key] = find_participant(value)

print(result)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. ini

{'tennox': 'Tenox Management Consultancy Inc', 'sayc': 'the SAYC by', 'saru': 'SARU', 'uct': 'SA', 'up': 'ICASA', 'bmi': 'SABC'}


In [None]:
#print(nlp.get_pipe("ner").labels)