In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import transformers

In [2]:
import torch as torch

In [3]:
pip show transformers

Name: transformers
Version: 4.31.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\AKSHAT RAI LADDHA\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
from transformers import BertModel, BertForMaskedLM

In [5]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Sample text for NER
text = "John works at Microsoft Corp as a software engineer in New York City."

# Tokenize the input text
tokens = tokenizer(text, return_tensors='pt')

# Make predictions
with torch.no_grad():
    outputs = model(**tokens)

# Get the predicted label ids
predicted_label_ids = torch.argmax(outputs.logits, dim=2).squeeze()

# Convert label ids back to entity labels
labels = [tokenizer.decode(label_id) for label_id in predicted_label_ids]

# Post-process to extract entities
entities = []
current_entity = ""
for token, label in zip(tokens['input_ids'][0], labels):
    token_str = tokenizer.decode(token)
    if "##" in token_str:  # Handling subwords
        current_entity += token_str.replace("##", "")
    else:
        if current_entity:
            entities.append((current_entity, label))
            current_entity = ""
        if label != 'O':  # 'O' represents no entity
            entities.append((token_str, label))

# Print the extracted entities
print(entities)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[('[ C L S ]', '[ P A D ]'), ('J o h n', '[ P A D ]'), ('w o r k s', '[ P A D ]'), ('a t', '[ P A D ]'), ('M i c r o s o f t', '[ P A D ]'), ('C o r p', '[ P A D ]'), ('a s', '[ P A D ]'), ('a', '[ P A D ]'), ('s o f t w a r e', '[ P A D ]'), ('e n g i n e e r', '[ P A D ]'), ('i n', '[ P A D ]'), ('N e w', '[ P A D ]'), ('Y o r k', '[ P A D ]'), ('C i t y', '[ P A D ]'), ('.', '[ P A D ]'), ('[ S E P ]', '[ P A D ]')]


### Model 1 :BERT NER model 

In [6]:
from transformers import pipeline

# Load the NER pipeline
ner_pipeline = pipeline("ner") #using ner trained model by hugging face to extract entites 

# Example text
text = "Myself Akshat, worked as ML intern in Carelon Global Solutions from May to June in Bangalore"

# Perform Named Entity Recognition on the text
entities = ner_pipeline(text)

# Print the recognized entities
for entity in entities:
    print(f"Entity: {entity['word']} | Type: {entity['entity']}")


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entity: A | Type: I-PER
Entity: ##ks | Type: I-PER
Entity: ##hat | Type: I-PER
Entity: Care | Type: I-ORG
Entity: ##lon | Type: I-ORG
Entity: Global | Type: I-ORG
Entity: Solutions | Type: I-ORG
Entity: Bangalore | Type: I-LOC


Extracted entities with particular information using NER model from hugging face. Generation of subwords from words such as Akshat to A, ##ks, ##hat this is because the model's tokenizer is based on WordPiece tokenization, where words are split into subword units to handle out-of-vocabulary (OOV) words.

### trying out luck with other fine tuned models 

### Model 2: Fine tuned model by Balamurugan 

In [5]:
from transformers import pipeline

# Loading the pipeline from hub
# Pipeline handles the preprocessing and post processing steps
model_checkpoint = "balamurugan1603/bert-finetuned-ner"
namedEntityRecogniser = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [6]:
text = "Myself Akshat Rai Laddha, working as Machine Learning intern at Carelon global Solution in Bangalore"

In [7]:
sample_output = namedEntityRecogniser([text])

In [8]:
print(sample_output)

[[{'entity_group': 'PER', 'score': 0.9970353, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'ORG', 'score': 0.82005215, 'word': 'Machine Learning', 'start': 37, 'end': 53}, {'entity_group': 'ORG', 'score': 0.96633375, 'word': 'Carelon', 'start': 64, 'end': 71}, {'entity_group': 'ORG', 'score': 0.84763527, 'word': 'Solution', 'start': 79, 'end': 87}, {'entity_group': 'LOC', 'score': 0.995404, 'word': 'Bangalore', 'start': 91, 'end': 100}]]


In [11]:
from spacy import displacy

In [12]:
def visualize(pipeline_output, texts):
    
    """ Visualizes text and their Named entities.
    
    Args:
        pipeline_output (list): Output of the pipeline.
        texts (list): List containing original text.
    
    Returns:
        Nothing
        
    """
    
    for i in range(len(sample_output)):
        entities = []
        for ents in sample_output[i]:
            entities.append({"end": ents["end"], "label": ents["entity_group"], "start": ents["start"]})
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)
        

In [13]:
# results visualization
visualize(sample_output, [text])

## # Model 3: Fine tuned Model by Rashid (additional tags : building, art, misc)

Model details: Deberta-v3-base and finetuned it on Few-NERD, NER based dataset with 180k+ examples 

In [112]:
from transformers import pipeline

# Loading the pipeline from hub
# Pipeline handles the preprocessing and post processing steps
model_checkpoint = "RashidNLP/NER-Deberta"
namedEntityRecogniser = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [113]:
text = "Myself Akshat Rai Laddha, working as ML intern at Carelon global Solution's Bagmane infotech park in Hyderabad"

In [114]:
sample_output = namedEntityRecogniser([text])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [115]:
print(sample_output)

[[{'entity_group': 'person', 'score': 0.98134595, 'word': 'Akshat Rai Laddha', 'start': 6, 'end': 24}, {'entity_group': 'organization', 'score': 0.72849536, 'word': "Carelon global Solution's", 'start': 49, 'end': 75}, {'entity_group': 'location', 'score': 0.50732577, 'word': 'Bagmane', 'start': 75, 'end': 83}, {'entity_group': 'organization', 'score': 0.4707223, 'word': 'infotech park', 'start': 83, 'end': 97}]]


In [116]:
from spacy import displacy

In [117]:
def visualize(pipeline_output, texts):
    
    """ Visualizes text and their Named entities.
    
    Args:
        pipeline_output (list): Output of the pipeline.
        texts (list): List containing original text.
    
    Returns:
        Nothing
        
    """
    
    for i in range(len(sample_output)):
        entities = []
        for ents in sample_output[i]:
            entities.append({"end": ents["end"], "label": ents["entity_group"], "start": ents["start"]})
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)
        

In [118]:
visualize(sample_output, [text])

In [119]:
print(sample_output)

[[{'entity_group': 'person', 'score': 0.98134595, 'word': 'Akshat Rai Laddha', 'start': 6, 'end': 24}, {'entity_group': 'organization', 'score': 0.72849536, 'word': "Carelon global Solution's", 'start': 49, 'end': 75}, {'entity_group': 'location', 'score': 0.50732577, 'word': 'Bagmane', 'start': 75, 'end': 83}, {'entity_group': 'organization', 'score': 0.4707223, 'word': 'infotech park', 'start': 83, 'end': 97}]]


feedback: not capturing hyderabad as our output location label, rejecting this fine tuned model

### Model Stacking concept or pipelineing models 

In [97]:
from transformers import pipeline
# Loading the pipeline from hub
# Pipeline handles the preprocessing and post processing steps
model_checkpoint = "algiraldohe/lm-ner-linkedin-skills-recognition"
namedEntityRecogniser_m2 = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [98]:
text = "Myself Akshat Rai Laddha, working as Machine Learning engineer at Carelon Global Solutions in Bangalore. Proficient in public speaking and coding"

In [99]:
m2_results = namedEntityRecogniser_m2([text])

In [100]:
print(m2_results)

[[{'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}, {'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}]]


In [101]:
# Loading the pipeline from hub
# Pipeline handles the preprocessing and post processing steps
model_checkpoint = "balamurugan1603/bert-finetuned-ner"
namedEntityRecogniser_m1 = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [102]:
m1_results = namedEntityRecogniser_m1([text])

In [103]:
print(m1_results)

[[{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}, {'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}]]


In [104]:
m1_results.extend(m2_results)

In [105]:
print(m1_results)

[[{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}, {'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}], [{'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}, {'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}]]


In [106]:
merged_data=m1_results[0]
merged_data.extend(m1_results[1])

In [107]:
print(merged_data)

[{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}, {'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}, {'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}, {'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}]


In [80]:
# Sort the list based on 'start' index in ascending order
sorted_data = sorted(merged_data, key=lambda x: x['start'])

# Print the sorted entities
for entity in sorted_data:
    print(entity)


{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}
{'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}
{'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}
{'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}
{'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}


In [82]:
print(sorted_data)

[{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}, {'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}, {'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}, {'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}]


In [84]:
text

'Myself Akshat Rai Laddha, working as Machine Learning engineer at Carelon Global Solutions in Bangalore. Proficient in public speaking and coding'

In [88]:
def visualize(pipeline_output, texts):
    
    """ Visualizes text and their Named entities.
    
    Args:
        pipeline_output (list): Output of the pipeline.
        texts (list): List containing original text.
    
    Returns:
        Nothing
        
    """
    
    for i in range(len(final_list)):
        entities = []
        for ents in final_list[i]:
            entities.append({"end": ents["end"], "label": ents["entity_group"], "start": ents["start"]})
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)
        

In [89]:
visualize(sorted_data, [text])

IndexError: list index out of range

In [94]:
def visualize_modified(pipeline_output, texts):
    """
    Visualizes text and their Named entities.

    Args:
        pipeline_output (list): Output of the pipeline.
        texts (list): List containing original text.

    Returns:
        Nothing

    """

    for entities, text in zip(pipeline_output, texts):
        entities_to_render = []
        for entity in entities:
            entities_to_render.append({
                "start": entity["start"],
                "end": entity["end"],
                "label": entity["entity_group"]
            })
        displacy.render({
            "text": text,
            "ents": entities_to_render,
            "title": "Named Entities"
        }, style="ent", manual=True)

# Call the visualize function
visualize(sorted_data,[text])


IndexError: list index out of range

In [96]:
data = [[{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24},
         {'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90},
         {'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}],
        [{'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53},
         {'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}]]

# Merge the two lists into a single list
merged_data = data[0]
for entity in data[1]:
    existing_entity = next((ent for ent in merged_data if ent['start'] == entity['start']), None)
    if existing_entity is None or entity['score'] > existing_entity['score']:
        merged_data.append(entity)

# Sort the merged list based on 'start' index in ascending order
sorted_data = sorted(merged_data, key=lambda x: x['start'])

# Print the sorted and merged entities
for entity in sorted_data:
    print(entity)


{'entity_group': 'PER', 'score': 0.99715745, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}
{'entity_group': 'TECHNICAL', 'score': 0.99462366, 'word': 'machine learning', 'start': 37, 'end': 53}
{'entity_group': 'ORG', 'score': 0.99314034, 'word': 'Carelon Global Solutions', 'start': 66, 'end': 90}
{'entity_group': 'LOC', 'score': 0.9948565, 'word': 'Bangalore', 'start': 94, 'end': 103}
{'entity_group': 'SOFT', 'score': 0.7944664, 'word': 'public speaking', 'start': 119, 'end': 134}
