In [1]:
pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# LLMs for different use cases

## Named Entity Recognition with BERT using Hugging Face

In [3]:
from transformers import BertForTokenClassification, BertTokenizer, pipeline

# Single checkpoint name shared by the tokenizer and the token-classification model.
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Load the pretrained tokenizer and model from that checkpoint.
tokenizer = BertTokenizer.from_pretrained(NER_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Build the NER pipeline from the objects loaded above.
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer)



Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from transformers.models.luke.tokenization_luke import Entity
# Sample sentence to run through the NER pipeline.
text = "James is meeting his friend Jenny in London tomorrow to hand over $5000 to her at Microsoft Inc soon."

# Run the pipeline once, then print every returned prediction on its own line.
entities = nlp_ner(text)
for prediction in entities:
    print(prediction)

{'entity': 'I-PER', 'score': 0.9992924, 'index': 1, 'word': 'James', 'start': None, 'end': None}
{'entity': 'I-PER', 'score': 0.99951935, 'index': 6, 'word': 'Jenny', 'start': None, 'end': None}
{'entity': 'I-LOC', 'score': 0.9995435, 'index': 8, 'word': 'London', 'start': None, 'end': None}
{'entity': 'I-ORG', 'score': 0.9997497, 'index': 18, 'word': 'Microsoft', 'start': None, 'end': None}
{'entity': 'I-ORG', 'score': 0.9991792, 'index': 19, 'word': 'Inc', 'start': None, 'end': None}


## Language Translation with MarianMT

In [11]:
!pip install sentencepiece --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint used for the translation demo; tokenizer and model come from the same name.
TRANSLATION_CHECKPOINT = "Helsinki-NLP/opus-mt-en-id"

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_CHECKPOINT)

In [19]:
# Input sentence for the translation example.
text = "Hello how are you, thank you"

In [20]:
# Encode the sentence as PyTorch tensors, truncating/padding to at most 512 tokens.
tokens = tokenizer(
    text,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=512,
)

# Generate with 4-beam search, passing the attention mask alongside the input ids.
translated_ids = model.generate(
    tokens['input_ids'],
    attention_mask=tokens['attention_mask'],
    num_beams=4,
    max_length=512,
)

# Decode the first generated sequence without special tokens; the bare name
# on the last line makes the notebook display the result.
first_sequence = translated_ids[0]
ttext = tokenizer.decode(first_sequence, skip_special_tokens=True)
ttext

'Halo bagaimana kabarmu, terima kasih'

## Text Summarization with LLMs

In [None]:
from transformers import BertTokenizer, EncoderDecoderModel

# Checkpoint for the summarization demo; both objects below are loaded from it.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
model_name = "patrickvonplaten/bert2bert-cnn_dailymail-fp16"
model = EncoderDecoderModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [27]:
# News article used as the input text for the summarization example below.
article = """
NASA on Thursday (September 14) concluded its highly anticipated media briefing, revealing the results of a year-long, $100,000 study into Unidentified Aerial Phenomena (UAPs), also known as UFOs. The primary goal of this report was to shed light on these puzzling phenomena and establish a scientific framework for understanding them.
The briefing took an unexpected turn when questions arose regarding two purportedly "non-human" corpses that had been displayed in glass cases during an official unveiling at Mexico's Congress, sparking excitement within the UFO enthusiast community.
The mummified specimens were said to have been discovered in the city of Cusco, Peru, and were believed to be approximately 1,000 years old.
David Spergel, chair of the NASA UAP study, weighed in on the matter, stating that he had only seen reports about the specimens on social media and did not possess detailed information about their nature.
"We don't know the nature of those samples," he said.
The revelation left members of the Mexican Congress with mixed feelings. Some expressed "thoughts" and "concerns" about the discovery, indicating a desire to "continue talking about this."
The presence of Ryan Graves, a former U.S. Navy pilot who had previously claimed that the number of UFOs or UAPs (unidentified anomalous phenomena) was being "grossly underreported," further added to the intrigue surrounding NASA's big event on Thursday.
"""

# Encode the prompt as a PyTorch tensor of token ids.
# The encoder of this BERT2BERT checkpoint is a BERT model, whose position
# embeddings cover at most 512 tokens, so truncate there: a larger limit
# (previously 1024) would let long articles through and fail inside the encoder.
# NOTE(review): the "Summarize: " prefix is kept as-is; it looks like a T5-style
# convention — confirm whether this checkpoint benefits from it.
tokens = tokenizer.encode("Summarize: "+article,return_tensors='pt',max_length=512,truncation=True)

# Beam search (4 beams) producing a summary of 40 to 150 tokens.
summary_ids = model.generate(tokens,length_penalty=2.0, early_stopping=True,
                               num_beams=4,max_length=150,min_length=40)

# Decode the first generated sequence without special tokens and display it.
ttext = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
ttext

'nasa concludes its findings on a year - long, $ 100, 000 study into unidentified aerial phenomena. a former u. s. navy pilot claimed that the number of " non - human " aerial phenomena was being " grossly underreported "'

## Multiple LLMs

In [31]:
import torch

In [30]:
from transformers import BertModel, GPT2Model, T5Model

# Load one pretrained checkpoint per architecture: BERT, GPT-2 and T5.
bert = BertModel.from_pretrained("bert-base-uncased")
gpt2 = GPT2Model.from_pretrained("gpt2-medium")
t5 = T5Model.from_pretrained("t5-small")



Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [33]:
# Run the same hard-coded batch (one sequence of six token ids) through BERT and GPT-2.
# NOTE(review): identical ids are fed to both models although they are loaded from
# different checkpoints — confirm the ids are meaningful for GPT-2's vocabulary.
token_ids = [[101,2001,2005,2300,1998,3400]]
bert_em = bert(torch.tensor(token_ids))
gpt2_em = gpt2(torch.tensor(token_ids))
# The T5 forward pass is left disabled.
#t5_em = t5(torch.tensor(token_ids))

In [34]:
# Display the first field of the BERT output (the per-token hidden states).
bert_em.last_hidden_state

tensor([[[-0.2147, -0.1318, -0.2130,  ..., -0.0333,  0.4217,  0.3951],
         [-0.0014, -0.1292, -0.1355,  ...,  0.0399,  0.7244,  0.5515],
         [ 0.3411,  0.1310, -0.1381,  ..., -0.0497,  0.3147,  0.1988],
         [ 0.2533,  0.0854, -0.0248,  ..., -0.0486,  0.3431,  0.6208],
         [-0.4771,  0.0414, -0.1351,  ...,  0.1538,  0.1319,  0.4882],
         [-0.1300,  0.0299, -0.0933,  ...,  0.0370,  0.3555,  0.3050]]],
       grad_fn=<NativeLayerNormBackward0>)

In [35]:
# Display the first field of the GPT-2 output (the per-token hidden states).
gpt2_em.last_hidden_state

tensor([[[-0.4024, -0.5273,  0.1081,  ...,  0.6880, -0.1369,  0.2440],
         [-0.4062, -0.5417,  0.1133,  ...,  0.2075,  0.1925, -0.2087],
         [-0.2172,  0.0138, -0.3432,  ...,  0.2045, -0.0571,  0.1323],
         [-0.1143,  0.1413,  0.2520,  ...,  0.1474, -0.3158, -0.1237],
         [-0.0581,  0.3411,  0.1175,  ..., -0.1701,  0.1268,  0.2409],
         [ 0.3521,  0.0471, -0.3279,  ...,  0.2865, -0.1753, -0.0048]]],
       grad_fn=<ViewBackward0>)