In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Name Entity Recognition (ner)

In [26]:
text = """Microsoft Corporation announced a $10 billion investment in OpenAI on Monday.
CEO Satya Nadella stated that this partnership will accelerate AI innovation worldwide.
The announcement was made at the company’s headquarters in Redmond, Washington.
"""

In [3]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer , grouped_entities = True)

Device set to use cuda:0


In [28]:
ner_results = nlp(text)
print(ner_results)

[{'entity_group': 'ORG', 'score': np.float32(0.9992714), 'word': 'Microsoft Corporation', 'start': 0, 'end': 21}, {'entity_group': 'ORG', 'score': np.float32(0.90710396), 'word': 'OpenAI', 'start': 60, 'end': 66}, {'entity_group': 'PER', 'score': np.float32(0.9620562), 'word': 'Satya Nadella', 'start': 83, 'end': 96}, {'entity_group': 'MISC', 'score': np.float32(0.99135613), 'word': 'AI', 'start': 142, 'end': 144}, {'entity_group': 'LOC', 'score': np.float32(0.95440245), 'word': 'Redmond', 'start': 227, 'end': 234}, {'entity_group': 'LOC', 'score': np.float32(0.99818677), 'word': 'Washington', 'start': 236, 'end': 246}]


In [29]:
for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}) - Score: {entity['score']:.2f}")

Microsoft Corporation (ORG) - Score: 1.00
OpenAI (ORG) - Score: 0.91
Satya Nadella (PER) - Score: 0.96
AI (MISC) - Score: 0.99
Redmond (LOC) - Score: 0.95
Washington (LOC) - Score: 1.00


In [36]:
text_2 = """Dr. Sarah Lee from Johns Hopkins Hospital presented a study on the use of Remdesivir for COVID-19 patients.
The findings were published in The Lancet on March 3, 2023.
"""

In [37]:
ner_results_2 = nlp(text_2)
print(ner_results_2)

[{'entity_group': 'PER', 'score': np.float32(0.99948883), 'word': 'Sarah Lee', 'start': 4, 'end': 13}, {'entity_group': 'LOC', 'score': np.float32(0.9991717), 'word': 'Johns Hopkins Hospital', 'start': 19, 'end': 41}, {'entity_group': 'MISC', 'score': np.float32(0.6587718), 'word': 'Re', 'start': 74, 'end': 76}, {'entity_group': 'ORG', 'score': np.float32(0.50892603), 'word': '##desiv', 'start': 77, 'end': 82}, {'entity_group': 'MISC', 'score': np.float32(0.6624677), 'word': 'CO', 'start': 89, 'end': 91}, {'entity_group': 'ORG', 'score': np.float32(0.99575424), 'word': 'The Lancet', 'start': 140, 'end': 150}]


In [38]:
for entity in ner_results_2:
    print(f"{entity['word']} ({entity['entity_group']}) - Score: {entity['score']:.2f}")

Sarah Lee (PER) - Score: 1.00
Johns Hopkins Hospital (LOC) - Score: 1.00
Re (MISC) - Score: 0.66
##desiv (ORG) - Score: 0.51
CO (MISC) - Score: 0.66
The Lancet (ORG) - Score: 1.00


# Text Summerization

In [40]:
Article = """BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text.
BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering).
This particular checkpoint has been fine-tuned on CNN Daily Mail, a large collection of text-summary pairs."""

In [39]:
model = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=model)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [41]:
print(summarizer(Article, max_length=130, min_length=30, do_sample=False))

[{'summary_text': 'BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder. BART is pre-trained by corrupting text with an arbitrary noising function, and learning a model to reconstruct the original text.'}]


In [43]:
summary = summarizer(Article, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
print(summary)

BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder. BART is pre-trained by corrupting text with an arbitrary noising function, and learning a model to reconstruct the original text.


In [51]:
model_2 = "google/pegasus-xsum"
summarizer_2 = pipeline("summarization", model=model_2)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cuda:0


In [57]:
print(summarizer_2(text, max_length=32, min_length=30, do_sample=False))

[{'summary_text': 'The world’s largest software company has teamed up with a leading artificial intelligence (AI) firm, OpenAI, to create the world’s most'}]


In [59]:
summary_2 = summarizer_2(text, max_length=32, min_length=30, do_sample=False)[0]['summary_text']
print(summary_2)

The world’s largest software company has teamed up with a leading artificial intelligence (AI) firm, OpenAI, to create the world’s most


# Sentiment Analysis

In [60]:
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [61]:
print(sentiment_analysis(text))

[{'label': 'POSITIVE', 'score': 0.9985271692276001}]


In [62]:
print(sentiment_analysis(summary_2))

[{'label': 'POSITIVE', 'score': 0.9985944628715515}]
