### Method 1: Model stacking for the entity extraction process

In [3]:
# Sample passage shared by every entity-extraction experiment in this notebook.
# It is written as adjacent literals (one per sentence) that Python joins into a
# single string, so the character offsets reported by the models are unaffected.
text = (
    "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. "
    "“I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, now the co-founder and CEO of online higher education startup Udacity, in an interview with Recode earlier this week. "
    "A little less than a decade later, dozens of self-driving startups have cropped up while automakers around the world clamor, wallet in hand, to secure their place in the fast-moving world of fully automated transportation."
)

In [6]:
# Model 1: a token-classification checkpoint loaded from the Hugging Face Hub.
from transformers import pipeline

model_checkpoint_m1 = "balamurugan1603/bert-finetuned-ner"

# The pipeline object takes care of the pre-processing and post-processing
# steps around the model; with aggregation_strategy="simple" the results are
# reported per entity group (see the `entity_group` keys in the output below).
namedEntityRecogniser_m1 = pipeline(
    task="token-classification",
    model=model_checkpoint_m1,
    aggregation_strategy="simple",
)

In [7]:
# Run model 1 on a one-element batch. The result is a list holding one list of
# entity dicts per input text (keys: entity_group, score, word, start, end).
m1_results=namedEntityRecogniser_m1([text])
print(m1_results)

[[{'entity_group': 'PER', 'score': 0.9975978, 'word': 'Sebastian Thrun', 'start': 5, 'end': 20}, {'entity_group': 'ORG', 'score': 0.98961675, 'word': 'Google', 'start': 61, 'end': 67}, {'entity_group': 'MISC', 'score': 0.9944423, 'word': 'American', 'start': 173, 'end': 181}, {'entity_group': 'PER', 'score': 0.9945845, 'word': 'Thrun', 'start': 271, 'end': 276}, {'entity_group': 'ORG', 'score': 0.97429544, 'word': 'Udacity', 'start': 340, 'end': 347}, {'entity_group': 'ORG', 'score': 0.9724045, 'word': 'Recode', 'start': 370, 'end': 376}]]


In [8]:
# Model 2: skill recognition. Built exactly like model 1, only the checkpoint
# differs; `pipeline` is the factory imported in the model-1 cell.
model_checkpoint_m2 = "algiraldohe/lm-ner-linkedin-skills-recognition"

namedEntityRecogniser_m2 = pipeline(
    task="token-classification",
    model=model_checkpoint_m2,
    aggregation_strategy="simple",
)

Labels in this model: ['B-BUS', 'B-SOFT', 'B-TECHNICAL', 'B-TECHNOLOGY', 'I-BUS', 'I-SOFT', 'I-TECHNICAL', 'I-TECHNOLOGY', 'O']

In [9]:
# Run model 2 on the same one-element batch as model 1, so both result lists
# are index-aligned: one inner list of entity dicts per input text.
m2_results = namedEntityRecogniser_m2([text])
print(m2_results)

[[{'entity_group': 'BUS', 'score': 0.9896613, 'word': 'higher education', 'start': 315, 'end': 331}, {'entity_group': 'TECHNOLOGY', 'score': 0.9974293, 'word': 'less', 'start': 405, 'end': 409}, {'entity_group': 'BUS', 'score': 0.99113655, 'word': 'startups', 'start': 454, 'end': 462}]]


Entities have now been extracted by both models.

##### Merging results

In [10]:
# Stack the two models' outputs as [model-1 lists..., model-2 lists...].
# Both pipelines were run on the same batch, so they return the same number of
# per-text lists; slicing to that length keeps only the model-1 part. A new list
# is built instead of extending in place, so re-running this cell gives the same
# result rather than appending yet another copy of m2_results.
m1_results = m1_results[:len(m2_results)] + m2_results

In [11]:
def _merge_entities(primary, secondary):
    """Combine two entity lists, resolving conflicts on the start offset.

    Args:
        primary: entity dicts (each with at least "start" and "score") that
            form the base of the merge.
        secondary: entity dicts from another model to fold into the base.

    Returns:
        A new list. An entity from ``secondary`` is added when no entity in the
        merge shares its "start"; when one does, the higher-scoring entity is
        kept and the other is dropped. Neither input list is modified.

    Note:
        Only identical start offsets count as a conflict; spans that merely
        overlap are both kept.
    """
    merged = list(primary)  # copy, so the caller's list is never mutated

    # First position seen for every start offset, for O(1) conflict lookup.
    position_by_start = {}
    for position, entity in enumerate(merged):
        position_by_start.setdefault(entity["start"], position)

    for candidate in secondary:
        position = position_by_start.get(candidate["start"])
        if position is None:
            position_by_start[candidate["start"]] = len(merged)
            merged.append(candidate)
        elif candidate["score"] > merged[position]["score"]:
            # Replace the weaker prediction instead of keeping both spans.
            merged[position] = candidate
    return merged


# Merge the two models' entity lists for the (single) input text.
merged_results = _merge_entities(m1_results[0], m1_results[1])

# Sort the merged list based on 'start' index in ascending order
sorted_results = sorted(merged_results, key=lambda entity: entity["start"])

In [12]:
print(sorted_results)

[{'entity_group': 'PER', 'score': 0.9975978, 'word': 'Sebastian Thrun', 'start': 5, 'end': 20}, {'entity_group': 'ORG', 'score': 0.98961675, 'word': 'Google', 'start': 61, 'end': 67}, {'entity_group': 'MISC', 'score': 0.9944423, 'word': 'American', 'start': 173, 'end': 181}, {'entity_group': 'PER', 'score': 0.9945845, 'word': 'Thrun', 'start': 271, 'end': 276}, {'entity_group': 'BUS', 'score': 0.9896613, 'word': 'higher education', 'start': 315, 'end': 331}, {'entity_group': 'ORG', 'score': 0.97429544, 'word': 'Udacity', 'start': 340, 'end': 347}, {'entity_group': 'ORG', 'score': 0.9724045, 'word': 'Recode', 'start': 370, 'end': 376}, {'entity_group': 'TECHNOLOGY', 'score': 0.9974293, 'word': 'less', 'start': 405, 'end': 409}, {'entity_group': 'BUS', 'score': 0.99113655, 'word': 'startups', 'start': 454, 'end': 462}]


In [13]:
# Wrap the merged entities in an outer list so the structure is again
# "one list of entity dicts per input text", as the visualisation step indexes it.
final_result = [sorted_results]

##### Visualization

In [14]:
from spacy import displacy

In [15]:
def visualize(pipeline_output, texts):
    """Render entity predictions for each text with displaCy's manual mode.

    Args:
        pipeline_output: a sequence with one list of entity dicts per text.
            Each dict must provide "start", "end" and "entity_group".
        texts: the source strings, index-aligned with ``pipeline_output``.

    Raises:
        IndexError: if ``texts`` has fewer items than ``pipeline_output``.
    """
    # Iterate over the argument, not over a notebook-level global, so the
    # function renders whatever the caller actually passes in.
    for index, text_entities in enumerate(pipeline_output):
        entities = [
            {"end": ent["end"], "label": ent["entity_group"], "start": ent["start"]}
            for ent in text_entities
        ]
        displacy.render({
            "ents": entities,
            "text": texts[index]
        }, style="ent", manual=True)
# Render the merged entities of both models over the sample passage.
visualize(pipeline_output=final_result, texts=[text])

### Method 2: Using a spaCy transformer-based model to extract entities

In [19]:
pip install --upgrade spacy[cuda111,transformers]

Collecting cupy-cuda111<13.0.0,>=5.0.0b4 (from spacy[cuda111,transformers])
  Downloading cupy_cuda111-12.1.0-cp311-cp311-win_amd64.whl (83.0 MB)
                                              0.0/83.0 MB ? eta -:--:--
                                              0.0/83.0 MB ? eta -:--:--
                                              0.2/83.0 MB 1.8 MB/s eta 0:00:46
                                              0.3/83.0 MB 2.0 MB/s eta 0:00:43
                                              0.4/83.0 MB 2.0 MB/s eta 0:00:41
                                              0.5/83.0 MB 2.2 MB/s eta 0:00:39
                                              0.6/83.0 MB 2.2 MB/s eta 0:00:38
                                              0.7/83.0 MB 2.3 MB/s eta 0:00:36
                                              0.9/83.0 MB 2.3 MB/s eta 0:00:36
                                              1.0/83.0 MB 2.4 MB/s eta 0:00:34
                                              1.0/83.0 MB 2.5 MB/s eta 0:00:34

In [4]:
import spacy
from spacy import displacy
# Load the transformer-based English spaCy pipeline used for Method 2.
nlp_trf = spacy.load("en_core_web_trf")

# Process the sample passage and render the entities found by the pipeline.
# NOTE(review): relies on `text` assigned in the first cell of the notebook.
doc = nlp_trf(text)
displacy.render(doc, style="ent")

### Successfully extracted entities with different labels such as Person, Organization, Location, Skills, Miscellaneous, etc. Finalized model: the spaCy model [en transformer], which takes less effort; with the model-stacking concept, however, we can combine the attributes of different models based on the requirement.

# Relationship Extraction Task

In [2]:
import spacy

# Load the SpaCy model for dependency parsing
# NOTE(review): the same model is loaded into `nlp` again in the later
# relationship-extraction cell, so one of the two loads is redundant.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Step 2: Dependency parsing analyzes the grammatical structure of a sentence and represents it as a dependency tree.
# In this tree, each word is a node, and the edges represent the syntactic relationships between the words.
# NOTE(review): `dependency_parsing` is not defined anywhere in the visible notebook;
# this call is what raises the NameError recorded in this cell's output.
dependency_tree = dependency_parsing(text)

# Step 3: Relationship Identification
# Try every ordered pair of distinct items and keep those for which a relationship is reported.
# NOTE(review): if `final_result` is the `[sorted_results]` wrapper built earlier, this loops over
# the per-text lists rather than over individual entity dicts; presumably `final_result[0]` was
# intended. TODO confirm.
# NOTE(review): `identify_relationship` is not defined in the visible notebook either.
relationships = []
for entity1 in final_result:
    for entity2 in final_result:
        if entity1 != entity2:
            relationship = identify_relationship(dependency_tree, entity1, entity2)
            if relationship:
                relationships.append((entity1, relationship, entity2))

# Step 4: Classify Relationships (Optional)
# NOTE(review): `classify_relationships` is not defined in the visible notebook; the loop below
# assumes it returns (entity1, relationship, entity2) triples. TODO confirm.
classified_relationships = classify_relationships(relationships)

# Print the final relationships
for entity1, relationship, entity2 in classified_relationships:
    print(f"{entity1} has a {relationship} relationship with {entity2}.")


NameError: name 'dependency_parsing' is not defined

In [4]:
import spacy

# Load the SpaCy model for dependency parsing
nlp = spacy.load("en_core_web_sm")

# Sample text
# NOTE(review): this rebinds `text`; entity offsets computed earlier in the notebook
# refer to the previous passage, not to this sentence.
text = "Myself Akshat, worked as ML intern in Carelon Global Solutions from May to June in Bangalore."

# Step 2: Dependency Parsing using SpaCy
# Each token is recorded as a (token text, dependency label, head token text) tuple.
doc = nlp(text)
dependency_tree = [(token.text, token.dep_, token.head.text) for token in doc]

# Step 3: Relationship Identification
# Try every ordered pair of distinct entities and keep those for which a relationship is reported.
# NOTE(review): `final_list` is not defined anywhere in the visible notebook; it is what raises the
# NameError recorded in this cell's output. `identify_relationship` is not defined in view either.
relationships = []
for entity1 in final_list:
    for entity2 in final_list:
        if entity1 != entity2:
            relationship = identify_relationship(dependency_tree, entity1, entity2)
            if relationship:
                relationships.append((entity1, relationship, entity2))

# Step 4: Classify Relationships (Optional)
# NOTE(review): `classify_relationships` is not defined in the visible notebook; the loop below
# assumes it returns (entity1, relationship, entity2) triples. TODO confirm.
classified_relationships = classify_relationships(relationships)

# Print the final relationships
for entity1, relationship, entity2 in classified_relationships:
    print(f"{entity1} has a {relationship} relationship with {entity2}.")


NameError: name 'final_list' is not defined