In [19]:
import re
import spacy

'''
Loads a pre-trained SpaCy NLP model named en_core_web_sm,
which is optimized for English.
This model is capable of performing various NLP tasks,
including Named Entity Recognition (NER) used in this code.
'''

def apply_model(input_file, output_file, model="en_core_web_sm"):
    """Extract PERSON entities from a text file and write them one per line.

    The whole input file is read, run through a spaCy pipeline, and every
    entity labelled ``PERSON`` is collected. Duplicates are removed and the
    remaining names are written to ``output_file`` in sorted order.

    Args:
        input_file: Path of the text file to analyse.
        output_file: Path of the file to create or overwrite with the names.
        model: Name of the installed spaCy pipeline to load. Defaults to
            ``en_core_web_sm``, the pipeline this function used before the
            parameter existed.
    """
    with open(input_file, "r") as file:
        text = file.read()

    # Load the pre-trained NER model
    nlp = spacy.load(model)

    # Process the text with SpaCy
    doc = nlp(text)

    # An entity's text may contain line breaks or runs of spaces taken from
    # the source document. Collapsing whitespace keeps every name on exactly
    # one output line and lets otherwise identical names de-duplicate.
    names = {
        " ".join(ent.text.split())
        for ent in doc.ents
        if ent.label_ == "PERSON"
    }
    # A span consisting only of whitespace would collapse to an empty string.
    names.discard("")

    # Sorted so that repeated runs over the same input give the same file.
    with open(output_file, "w") as file:
        file.writelines(name + "\n" for name in sorted(names))

def clean_names(input_file, output_file):
    """Normalise a one-name-per-line file and write the unique results sorted.

    For every input line:

    * everything from the first occurrence of ``View`` onwards is dropped;
    * characters other than word characters, whitespace and hyphens are
      removed;
    * lines left with fewer than two words are discarded;
    * a three-word name loses its middle word;
    * the remaining words are joined with single spaces, so names that
      differ only in spacing count as the same name.

    ``input_file`` is read completely before ``output_file`` is opened, so
    both arguments may refer to the same path.

    Args:
        input_file: Path of the file holding one raw name per line.
        output_file: Path of the file to create or overwrite.
    """
    # Compiled once rather than looked up again for every line.
    trailing_view = re.compile(r'View.*')
    # Matches any character that is not a letter, number, underscore,
    # whitespace, or hyphen.
    disallowed = re.compile(r'[^\w\s-]')

    with open(input_file, "r") as file:
        names = file.readlines()

    cleaned_names = set()
    for name in names:
        without_view = trailing_view.sub('', name)
        name_parts = disallowed.sub('', without_view).split()

        # Nothing left, or only a single (first) name: skip the line.
        if len(name_parts) < 2:
            continue

        # If a potential middle name is present, remove it
        if len(name_parts) == 3:
            name_parts = [name_parts[0], name_parts[2]]

        # Re-joining the parts normalises internal whitespace; keeping the
        # raw string would let "John  Smith" and "John Smith" both survive.
        cleaned_names.add(' '.join(name_parts))

    with open(output_file, "w") as file:
        # Sort names before writing
        file.writelines(name + "\n" for name in sorted(cleaned_names))


# Run the two-stage pipeline: extract PERSON entities from the raw text,
# then clean the resulting name list in place.
raw_text_path = "Icims.txt"
names_path = "Names.txt"

apply_model(raw_text_path, names_path)
clean_names(names_path, names_path)

print("Names have been finalized and saved to Names.txt.")


# for i in range(5):
  # apply_model("Names.txt", "Names.txt")
  # clean_names("Names.txt","Names.txt")
  # print("Applied model & cleaned names - Iteration:", i+1)



Names have been finalized and saved to Names.txt.



# What is SpaCy?
SpaCy is a popular open-source Python library for advanced natural language processing. It is designed for practical, real-world use and is widely applied to tasks such as tokenization, part-of-speech tagging, named entity recognition, and more. SpaCy ships with pre-trained models for various languages, which can perform these tasks out of the box.

# Named Entity Recognition (NER)
Named Entity Recognition (NER) is an NLP task in which a model identifies named entities (such as person names, locations, companies, and quantities) in a text and classifies them into pre-defined categories. NER is crucial for extracting information from text and is widely used in information retrieval, question answering systems, content classification, and more.

# How Does SpaCy Perform NER?
SpaCy's NER model is a part of its larger language processing pipeline. When you load a SpaCy model (like en_core_web_sm) and process a text with it, the text goes through several processing steps, one of which is named entity recognition. The NER model in SpaCy uses a deep learning approach, typically a convolutional neural network (CNN) or a transformer-based model, trained on a large annotated corpus where entities have been labeled according to their types.