### Method 1: Model stacking for entities extraction process

In [3]:
# Sample news passage used as the input for the entity-extraction cells below.
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, now the co-founder and CEO of online higher education startup Udacity, in an interview with Recode earlier this week. A little less than a decade later, dozens of self-driving startups have cropped up while automakers around the world clamor, wallet in hand, to secure their place in the fast-moving world of fully automated transportation."

In [4]:
# Loading the pipeline from hub
from transformers import pipeline
# Pipeline handles the preprocessing and post processing steps
# Model 1: token-classification checkpoint used as the first model of the stack.
model_checkpoint_m1 = "balamurugan1603/bert-finetuned-ner"
# With aggregation_strategy="simple" the results come back as 'entity_group'
# spans with character offsets rather than one entry per word piece.
namedEntityRecogniser_m1 = pipeline(
    "token-classification", model=model_checkpoint_m1, aggregation_strategy="simple"
)

In [5]:
# The input is a one-element list, so the result is a list holding one list of
# entity dicts (entity_group, score, word, start, end) for that text.
m1_results=namedEntityRecogniser_m1([text])
print(m1_results)

[[{'entity_group': 'PER', 'score': 0.9975978, 'word': 'Sebastian Thrun', 'start': 5, 'end': 20}, {'entity_group': 'ORG', 'score': 0.98961675, 'word': 'Google', 'start': 61, 'end': 67}, {'entity_group': 'MISC', 'score': 0.9944423, 'word': 'American', 'start': 173, 'end': 181}, {'entity_group': 'PER', 'score': 0.9945845, 'word': 'Thrun', 'start': 271, 'end': 276}, {'entity_group': 'ORG', 'score': 0.97429544, 'word': 'Udacity', 'start': 340, 'end': 347}, {'entity_group': 'ORG', 'score': 0.9724045, 'word': 'Recode', 'start': 370, 'end': 376}]]


In [6]:
# Model 2: for skill recognition
# Second model of the stack; it is built with the same task and aggregation
# settings as model 1 so both outputs share one dict layout.
model_checkpoint_m2 = "algiraldohe/lm-ner-linkedin-skills-recognition"
namedEntityRecogniser_m2 = pipeline(
    "token-classification", model=model_checkpoint_m2, aggregation_strategy="simple"
)

Labels in this model: ['B-BUS', 'B-SOFT', 'B-TECHNICAL', 'B-TECHNOLOGY', 'I-BUS', 'I-SOFT', 'I-TECHNICAL', 'I-TECHNOLOGY', 'O']

In [7]:
# Run the skill-recognition model on the same single-text batch as model 1.
m2_results = namedEntityRecogniser_m2([text])
print(m2_results)

[[{'entity_group': 'BUS', 'score': 0.9896613, 'word': 'higher education', 'start': 315, 'end': 331}, {'entity_group': 'TECHNOLOGY', 'score': 0.9974293, 'word': 'less', 'start': 405, 'end': 409}, {'entity_group': 'BUS', 'score': 0.99113655, 'word': 'startups', 'start': 454, 'end': 462}]]


Results have now been extracted from both models.

##### Merging results

In [8]:
# Append model 2's per-text entity list to m1_results, giving
# [model-1 entities, model-2 entities] for the single input text.
# NOTE(review): this mutates m1_results in place, so re-running the cell
# appends the model-2 results again.
m1_results.extend(m2_results)

In [9]:
# Merge the two lists into a single list.
# m1_results[0] holds model 1's entities and m1_results[1] holds model 2's.
# Work on a copy so the raw model-1 output stored in m1_results stays untouched.
merged_results = list(m1_results[0])
for entity in m1_results[1]:
    # Index of an already merged entity that starts at the same character, if any.
    clash_index = next(
        (idx for idx, ent in enumerate(merged_results) if ent['start'] == entity['start']),
        None,
    )
    if clash_index is None:
        merged_results.append(entity)
    elif entity['score'] > merged_results[clash_index]['score']:
        # The more confident prediction replaces the weaker one; appending it
        # would leave two labels for the same span in the merged output.
        merged_results[clash_index] = entity

# Sort the merged list based on 'start' index in ascending order
sorted_results = sorted(merged_results, key=lambda x: x['start'])

In [10]:
# Show the merged entities of both models, ordered by their 'start' offset.
print(sorted_results)

[{'entity_group': 'PER', 'score': 0.9975978, 'word': 'Sebastian Thrun', 'start': 5, 'end': 20}, {'entity_group': 'ORG', 'score': 0.98961675, 'word': 'Google', 'start': 61, 'end': 67}, {'entity_group': 'MISC', 'score': 0.9944423, 'word': 'American', 'start': 173, 'end': 181}, {'entity_group': 'PER', 'score': 0.9945845, 'word': 'Thrun', 'start': 271, 'end': 276}, {'entity_group': 'BUS', 'score': 0.9896613, 'word': 'higher education', 'start': 315, 'end': 331}, {'entity_group': 'ORG', 'score': 0.97429544, 'word': 'Udacity', 'start': 340, 'end': 347}, {'entity_group': 'ORG', 'score': 0.9724045, 'word': 'Recode', 'start': 370, 'end': 376}, {'entity_group': 'TECHNOLOGY', 'score': 0.9974293, 'word': 'less', 'start': 405, 'end': 409}, {'entity_group': 'BUS', 'score': 0.99113655, 'word': 'startups', 'start': 454, 'end': 462}]


In [11]:
# Wrap the merged entities in an outer list: one inner list per input text,
# which is the layout visualize() iterates over.
final_result=[sorted_results]

##### Visualization

In [12]:
from spacy import displacy



In [13]:
def visualize(pipeline_output, texts):
    """Render highlighted entity spans for every text with displaCy.

    Args:
        pipeline_output: One list of entity dicts per text. Each dict must
            provide the keys "start", "end" and "entity_group".
        texts: The source strings, indexed in parallel with
            ``pipeline_output``.

    Raises:
        IndexError: If ``texts`` has fewer items than ``pipeline_output``.
    """
    # Iterate over the argument itself; the previous version read the global
    # `final_result` and silently ignored whatever the caller passed in.
    for i, text_entities in enumerate(pipeline_output):
        entities = [
            {"end": ent["end"], "label": ent["entity_group"], "start": ent["start"]}
            for ent in text_entities
        ]
        # Keep the spans in text order, which is how displaCy walks them in manual mode.
        entities.sort(key=lambda span: (span["start"], span["end"]))
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)
# Draw the merged entities of both models over the sample text.
visualize(final_result,[text])

### Method 2: Using Spacy transformer based model to extract entities

In [14]:
pip install --upgrade spacy[cuda111,transformers]

Note: you may need to restart the kernel to use updated packages.


In [15]:
import spacy
from spacy import displacy
# Load spaCy's transformer-based English pipeline.
nlp_trf = spacy.load("en_core_web_trf")

# Process the sample text and highlight the entities spaCy found in it.
doc = nlp_trf(text)
displacy.render(doc, style="ent")

Info about the spaCy English transformer-based model: https://spacy.io/models/en#en_core_web_trf

### Successfully extracted entities with different labels such as Person, Organization, Location, Skills, Miscellaneous, etc. Finalized model: spaCy English transformer model (`en_core_web_trf`), which takes less effort; with the model-stacking concept, however, we can combine the label sets of different models based on the requirement.

In [16]:
# List every entity span spaCy detected together with its label.
for ent in doc.ents:
    print(f"Entity: {ent.text} | Label: {ent.label_}")

Entity: Sebastian Thrun | Label: PERSON
Entity: Google | Label: ORG
Entity: 2007 | Label: DATE
Entity: American | Label: NORP
Entity: Thrun | Label: PERSON
Entity: Udacity | Label: ORG
Entity: Recode | Label: ORG
Entity: earlier this week | Label: DATE
Entity: A little less than a decade later | Label: DATE
Entity: dozens | Label: CARDINAL


In [17]:
# Reuse the en_core_web_trf pipeline that is already loaded as `nlp_trf`
# instead of loading the transformer weights into memory a second time.
nlp = nlp_trf  # transformer based model : model used is RoBERTa
unique_labels = list(nlp.get_pipe("ner").labels)

# Print the unique labels
print(unique_labels)

['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


The nlp.pipe() function efficiently processes the texts in batches and yields processed Doc objects one by one.

# Relationship Extraction Task

For explicit relationships, we can identify the relational information between entities directly by mapping them to each other, but for implicit relationships we need to understand the grammatical structure between the entities. We therefore use spaCy's dependency parsing here.

#### following this notebook : https://colab.research.google.com/drive/1lS3B7v_BHzRcynDEaPSz-aJ4wiA_n1Q-#scrollTo=CzWFRpeQX5z6 

In [None]:
# Info about all the tokens and the labels assigned to them:
# each token's text paired with "<IOB code>-<entity type>".
[(token.text, f"{token.ent_iob_}-{token.ent_type_}") for token in doc]

We will be using these tokens in dependency parsing to understand contextual information.

Once we've embedded our tokens, we want to encode them in a way that incorporates their context in the sentence and their role in the downstream task (e.g., "security" has a very different meaning if it's preceded by "national" or "social"). BERT takes care of this part, as it captures the context of a word bidirectionally.

In [None]:
#dependency visualizer
doc = nlp(text)
# every element of this list is one sentence carrying the grammatical info of that sentence
sentences = list(doc.sents)
# Sentence to draw. The index is clamped to the last sentence so that a text
# with fewer than five sentences does not raise IndexError.
sentence_index = min(4, len(sentences) - 1)
sent = sentences[sentence_index]
displacy.render(sent, style="dep")

In [None]:
# One (token text, dependency label, head token text) triple per token in doc.
dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
print(dependencies)

Labels available in the model: 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'

In [None]:
# Run the spaCy transformer pipeline over the text
doc = nlp_trf(text)

# Collect one {'text', 'label'} record per entity span found in the doc
entities_list = [{'text': span.text, 'label': span.label_} for span in doc.ents]

# Show the collected records
print(entities_list)

In [None]:
def person_to_verb(tok):
    """Return the first governing verb of ``tok`` with its direct-object phrase.

    Walks ``tok.ancestors`` until the first token whose ``pos_`` is ``"VERB"``,
    collects that verb plus its ``dobj`` children, and then adds every ``amod``
    child reachable from the collected tokens.

    Args:
        tok: Token-like object exposing ``ancestors``. The related tokens must
            expose ``pos_``, ``dep_``, ``children``, ``i`` and ``text_with_ws``.

    Returns:
        The collected tokens joined in sentence order (by ``i``) with their
        original whitespace, stripped. An empty string is returned when no
        ancestor is a verb.
    """
    # Only the first verb is wanted: it carries the action of the entity.
    verb = next((anc for anc in tok.ancestors if anc.pos_ == "VERB"), None)
    if verb is None:
        return ""

    # The verb is an ancestor of `tok`, so `tok` is always inside the verb's
    # subtree; the former per-child `tok in i.subtree` test was always true
    # and re-walked the subtree for every child, so it is dropped.
    verb_phrase = [verb]
    verb_phrase.extend(child for child in verb.children if child.dep_ == "dobj")

    # Expand with "amod" modifiers using an explicit work list instead of
    # appending to the list that is being iterated over.
    pending = list(verb_phrase)
    while pending:
        current = pending.pop()
        for child in current.children:
            if child.dep_ == "amod":
                verb_phrase.append(child)
                pending.append(child)

    # Sort the tokens by their position in the original sentence
    ordered = sorted(verb_phrase, key=lambda t: t.i)
    # Join them together with the correct whitespace and return
    return ''.join(t.text_with_ws for t in ordered).strip()

In [None]:
# List to store the relationships
relationships_list = []

# Entity spans of `doc` that have not been paired with an entities_list entry yet.
# Pairing against spans (instead of comparing single tokens with the entity text)
# lets multi-token entities such as "Sebastian Thrun" be found, and consuming
# each span once keeps repeated mentions from all mapping to the first one.
unmatched_spans = list(doc.ents)

# Loop through the entities and find relationships for each entity
for entity in entities_list:
    entity_text = entity['text']
    # Find the entity span in the doc that corresponds to the current entry
    span = next((s for s in unmatched_spans if s.text == entity_text), None)
    if span is None:
        # No span for this entry in the current doc; nothing to relate.
        continue
    unmatched_spans.remove(span)

    # Use the span's syntactic root token to find the relationship with person_to_verb
    relationship = person_to_verb(span.root)
    # Append the relationship to the relationships_list
    relationships_list.append({'entity': entity_text, 'relationship': relationship})

# Print the relationships
for relationship in relationships_list:
    print(relationship)

In [None]:
# Second sample passage (a product-launch announcement).
# NOTE(review): this rebinds `text`, the same name the cells above use for the
# first passage, so re-running earlier cells afterwards processes this passage.
text="Last week, Akshat Engineering Services announced the launch of its new product, AkshatGPE. The phone comes with advanced features and cutting-edge technology. The launch event took place in San Francisco and was attended by tech enthusiasts from around the world. company's CEO, Sundar Pichai, delivered a keynote address, highlighting the phone's capabilities. The AkshatGPE is expected to be a game-changer in the smartphone market."

In [None]:
# Display the passage currently bound to `text`.
text