In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import transformers

In [2]:
import torch as torch

In [4]:
pip show transformers

Name: transformers
Version: 4.31.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\AKSHAT RAI LADDHA\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import BertModel, BertForMaskedLM

In [8]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Sample text for NER
text = "John works at Microsoft Corp as a software engineer in New York City."

# Tokenize the input text
tokens = tokenizer(text, return_tensors='pt')

# Make predictions
with torch.no_grad():
    outputs = model(**tokens)

# Get the predicted label ids
predicted_label_ids = torch.argmax(outputs.logits, dim=2).squeeze()

# Convert label ids back to entity labels
labels = [tokenizer.decode(label_id) for label_id in predicted_label_ids]

# Post-process to extract entities
entities = []
current_entity = ""
for token, label in zip(tokens['input_ids'][0], labels):
    token_str = tokenizer.decode(token)
    if "##" in token_str:  # Handling subwords
        current_entity += token_str.replace("##", "")
    else:
        if current_entity:
            entities.append((current_entity, label))
            current_entity = ""
        if label != 'O':  # 'O' represents no entity
            entities.append((token_str, label))

# Print the extracted entities
print(entities)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[('[ C L S ]', '[ P A D ]'), ('J o h n', '[ u n u s e d 1 ]'), ('w o r k s', '[ u n u s e d 1 ]'), ('a t', '[ P A D ]'), ('M i c r o s o f t', '[ P A D ]'), ('C o r p', '[ u n u s e d 1 ]'), ('a s', '[ u n u s e d 1 ]'), ('a', '[ u n u s e d 1 ]'), ('s o f t w a r e', '[ P A D ]'), ('e n g i n e e r', '[ P A D ]'), ('i n', '[ u n u s e d 1 ]'), ('N e w', '[ u n u s e d 1 ]'), ('Y o r k', '[ u n u s e d 1 ]'), ('C i t y', '[ u n u s e d 1 ]'), ('.', '[ u n u s e d 1 ]'), ('[ S E P ]', '[ u n u s e d 1 ]')]


### Model 1 :BERT NER model 

In [1]:
from transformers import pipeline

# Load the NER pipeline
ner_pipeline = pipeline("ner") #using ner trained model by hugging face to extract entites 

# Example text
text = "Myself Akshat, worked as ML intern in Carelon Global Solutions from May to June in Bangalore"

# Perform Named Entity Recognition on the text
entities = ner_pipeline(text)

# Print the recognized entities
for entity in entities:
    print(f"Entity: {entity['word']} | Type: {entity['entity']}")


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entity: A | Type: I-PER
Entity: ##ks | Type: I-PER
Entity: ##hat | Type: I-PER
Entity: Care | Type: I-ORG
Entity: ##lon | Type: I-ORG
Entity: Global | Type: I-ORG
Entity: Solutions | Type: I-ORG
Entity: Bangalore | Type: I-LOC


Extracted entities with particular information using NER model from hugging face. Generation of subwords from words such as Akshat to A, ##ks, ##hat this is because the model's tokenizer is based on WordPiece tokenization, where words are split into subword units to handle out-of-vocabulary (OOV) words.

### trying out luck with other fine tuned models 

### Model 2: Fine Tuned BERT model for entity extraction task

In [1]:
from transformers import pipeline

# Loading the pipeline from hub
# Pipeline handles the preprocessing and post processing steps
model_checkpoint = "balamurugan1603/bert-finetuned-ner"
namedEntityRecogniser = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
text = "Myself Akshat Rai Laddha, working as ML intern at Carelon global Solution in Bangalore"

In [9]:
sample_output = namedEntityRecogniser([text])

In [10]:
print(sample_output)

[[{'entity_group': 'PER', 'score': 0.99700034, 'word': 'Akshat Rai Laddha', 'start': 7, 'end': 24}, {'entity_group': 'ORG', 'score': 0.90261436, 'word': 'ML', 'start': 37, 'end': 39}, {'entity_group': 'ORG', 'score': 0.8897029, 'word': 'Carelon global Solution', 'start': 50, 'end': 73}, {'entity_group': 'LOC', 'score': 0.9953557, 'word': 'Bangalore', 'start': 77, 'end': 86}]]


In [11]:
from spacy import displacy

In [12]:
def visualize(pipeline_output, texts):
    
    """ Visualizes text and their Named entities.
    
    Args:
        pipeline_output (list): Output of the pipeline.
        texts (list): List containing original text.
    
    Returns:
        Nothing
        
    """
    
    for i in range(len(sample_output)):
        entities = []
        for ents in sample_output[i]:
            entities.append({"end": ents["end"], "label": ents["entity_group"], "start": ents["start"]})
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)
        

In [13]:
visualize(sample_output, [text])