In [1]:
import re

import PyPDF2
import pandas as pd
import spacy

In [2]:
# Load the spaCy "en_core_web_lg" pipeline once; the same object is passed to
# every entity-recognition call in the cells below.
nlp = spacy.load("en_core_web_lg")

### Mask names in the given text with asterisks.

* text (str): The input text containing named entities to mask.

* nlp (spacy.language.Language): A spaCy NLP pipeline for entity recognition.

* Returns (str): The input text with names replaced by masked versions.

In [3]:
def mask_names(text, nlp):
    """Mask person names in ``text`` with asterisks.

    Every entity that ``nlp`` labels ``PERSON`` is masked wherever it occurs
    in ``text`` as a whole word (i.e. not directly preceded or followed by a
    word character). Names longer than two characters keep their first and
    last character (``"Akshay Kumar"`` -> ``"A**********r"``); names of one
    or two characters are replaced entirely by asterisks so nothing of them
    leaks. A masked name always has the same length as the original name.

    Args:
        text (str): The input text containing named entities to mask.
        nlp (spacy.language.Language): A spaCy NLP pipeline for entity
            recognition. Any callable returning an object whose ``.ents``
            items expose ``.text`` and ``.label_`` works.

    Returns:
        str: The input text with names replaced by masked versions. The text
        is returned unchanged when no ``PERSON`` entity is found.
    """
    doc = nlp(text)
    person_names = {
        ent.text for ent in doc.ents if ent.label_ == "PERSON" and ent.text
    }
    if not person_names:
        return text

    def mask(match):
        name = match.group(0)
        # Keeping the first/last character of a 1-2 character name would
        # reveal the whole name, so short names are masked completely.
        if len(name) <= 2:
            return '*' * len(name)
        return name[0] + '*' * (len(name) - 2) + name[-1]

    # Longest names first: the alternation tries candidates in order, so a
    # full name ("Akshay Kumar") wins over a contained shorter one ("Kumar")
    # instead of being left half-masked.
    ordered = sorted(person_names, key=lambda n: (-len(n), n))
    # The lookarounds restrict matches to whole words, so a name is never
    # rewritten inside an unrelated word ("Ann" inside "Annual").
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(n) for n in ordered) + r')(?!\w)'
    )
    # Single pass over the original text: already-masked output is never
    # re-scanned, so one name's mask cannot be altered by another name.
    return pattern.sub(mask, text)

In [4]:
# Column-oriented accumulator: each output column starts as its own empty
# list and is appended to once per processed page.
data = {
    column: []
    for column in ('page number', 'page content', 'masked content', 'Extracted Names')
}

In [5]:
# Upper bound on the number of pages read from the PDF.
MAX_PAGES = 100

# Start from empty columns so that re-running this cell rebuilds the table
# instead of appending a second copy of every page.
for column in data.values():
    column.clear()

with open('training_data.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Process at most the first MAX_PAGES pages; bounding by the real page
    # count avoids an IndexError on PDFs shorter than MAX_PAGES.
    page_count = min(MAX_PAGES, len(pdf_reader.pages))
    for page_num in range(page_count):
        page = pdf_reader.pages[page_num]
        page_text = page.extract_text()

        # Mask human names using spaCy NER
        masked_text = mask_names(page_text, nlp)

        # Page numbers are stored 1-based.
        data['page number'].append(page_num + 1)
        data['page content'].append(page_text)
        data['masked content'].append(masked_text)
        # NOTE(review): this runs the pipeline on the page a second time
        # (mask_names already parsed it); kept because mask_names only
        # returns the masked string, not the entities.
        data['Extracted Names'].append([ent.text for ent in nlp(page_text).ents if ent.label_ == "PERSON"])

In [6]:
# Build the results table from the accumulated columns: one row per processed
# page, one column per key of `data`.
df = pd.DataFrame(data)

In [7]:
# Preview the first rows to compare original and masked page content.
df.head()

Unnamed: 0,page number,page content,masked content,Extracted Names
0,1,"Akshay Kumar, Fair Superstar: Why criticism ov...","A**********r, Fair Superstar: Why criticism ov...","[Akshay Kumar, Akshay Kumar]"
1,2,"Remembering Shammi Kapoor, India's answer to E...","Remembering S***********r, India's answer to E...","[Shammi Kapoor, Elvis Presley]"
2,3,"After Article 370, new style: This can be a fr...","After Article 370, new style: This can be a fr...",[]
3,4,The original female superstar: How Vyjanthimal...,The original female superstar: How V**********...,"[Vyjanthimala, Bollywood]"
4,5,13 YO Kabhi Alvida Naa Kehna: A progressive fi...,13 Y******i A**************a: A progressive fi...,"[YO Kabhi, Alvida Naa Kehna]"
