In [1]:
import torch  
import torch.nn as nn  
import math  

# Import the function for loading Hugging Face pipelines
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Load the pipeline for sentiment classification
classifier = pipeline("text-classification", model=model_name)
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x165904cf070>

In [3]:
prompt = "The food was good, but service at the restaurant was a bit slow"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '3 stars', 'score': 0.6387940645217896}]


In [4]:
prompt = "The food was good, everything well organized"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '4 stars', 'score': 0.5190770030021667}]


In [5]:
model_name = 'cnicu/t5-small-booksum'

# Load the model pipeline for text summarization
summarizer = pipeline('summarization', model = model_name)
summarizer

<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x165cdddac80>

In [6]:
long_text = '\nThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\n'

# Pass the long text to the model to summarize it
outputs = summarizer(long_text, max_length = 30)
outputs

[{'summary_text': 'the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey'}]

In [7]:
# Access and print the summarized text in the outputs variable
print(outputs[0]['summary_text'])

the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey


In [8]:
# Load the model pipeline for question-answering
qa_model = pipeline("question-answering")
question = "For how long was the Eiffel Tower the tallest man-made structure in the world?"
question = "What's the tallest structure in France?"

# Pass the necessary inputs to the LLM pipeline for question-answering
outputs = qa_model(question, long_text)

# Access and print the answer
print(outputs['answer'])

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Millau Viaduct


In [9]:
model_name = "Helsinki-NLP/opus-mt-es-en"

input_text = "No creo que hagas una buena traducción"

# Define pipeline for Spanish-to-English translation
translator = pipeline('translation_es_to_en', model=model_name)

# Translate the input text
translations = translator(input_text)

# Access the output to print the translated text in English
print(translations[0]['translation_text'])



I don't think you're doing a good translation.


In [10]:
# Set transformer model hyperparameters
d_model = 256
n_heads = 4
num_encoder_layers = 3
num_decoder_layers = 3

In [11]:
import torch.nn as nn

# Create the transformer model and assign hyperparameters
model = nn.Transformer(
    d_model=d_model, # d_model is the dimension of the input vectors and output vectors of the model, specifically the size of the feature space. Essentially, it determines the number of features in each transformer layer
    nhead=n_heads,
    num_encoder_layers=num_encoder_layers,    
    num_decoder_layers=num_decoder_layers
    )


print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, o



In [12]:
# Create a pipeline for text generation using the gpt2 model
generator = pipeline("text-generation", model = "gpt2")

text = "I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had."

response = "Dear valued customer, I am glad to hear you had a good stay with us."



In [13]:
# Build the prompt for the text generation LLM

prompt = f"Customer review:\n{text}\n\nHotel reponse to the customer:\n{response}"

prompt

"Customer review:\nI had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.\n\nHotel reponse to the customer:\nDear valued customer, I am glad to hear you had a good stay with us."

In [14]:
#### Pass the prompt to the model pipeline
outputs = generator(prompt, max_length = 100, pad_token_id=generator.tokenizer.eos_token_id) #  if the generated text is shorter than max_length, the remaining tokens will be filled with the EOS token.

# Print the augmented sequence generated by the model
print(outputs[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Customer review:
I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.

Hotel reponse to the customer:
Dear valued customer, I am glad to hear you had a good stay with us. There were several people there and, as always, the food was fresh and the service was nice


In [15]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

model_name = "textattack/distilbert-base-uncased-SST-2"

# Load the tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='textattack/distilbert-base-uncased-SST-2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# Tokenize inputs and pass them to the model for inference
inputs = tokenizer(text, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[ 101, 1996, 2190, 3185, 1045, 1005, 2310, 2412, 3427,  999,  102,    0],
        [ 101, 2054, 2019, 9643, 3185, 1012, 1045, 9038, 3666, 2009, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
logits = outputs.logits

logits

tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>)

In [20]:
predicted_classes = torch.argmax(logits, dim=1).tolist()
predicted_classes

[1, 0]

In [21]:
for idx, predicted_class in enumerate(predicted_classes):
    print(f"Predicted class for \"{text[idx]}\": {predicted_class}")

Predicted class for "The best movie I've ever watched!": 1
Predicted class for "What an awful movie. I regret watching it.": 0


In [22]:
from datasets import load_dataset

# Load a dataset from Hugging Face's dataset hub
dataset = load_dataset('opinosis', trust_remote_code=True)

dataset


DatasetDict({
    train: Dataset({
        features: ['review_sents', 'summaries'],
        num_rows: 51
    })
})

In [23]:
print(f"Number of instances: {len(dataset['train'])}")


Number of instances: 51


In [24]:
# Show the names of features in the training fold of the dataset
print(f"Feature names: {dataset['train'].column_names}")

Feature names: ['review_sents', 'summaries']


In [25]:
print(dataset['train'].shape)
print(dataset['train'][0].keys())
dataset['train'][0]

(51, 2)
dict_keys(['review_sents', 'summaries'])


{'review_sents': ", and is very, very accurate .\r\n but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\r\n This function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle games with the tees .\r\n It provides immediate alternatives if the route from the online map program was inaccurate or blocked by an obstacle .\r\n I've used other GPS units, as well as GPS built into cars   and to this day NOTHING beats the accuracy of a Garmin GPS .\r\n It got me from point A to point B with 100% accuracy everytime .\r\n It has yet to disappoint, getting me everywhere with 100% accuracy .\r\n0 out of 5 stars Honest, accurate review, , PLEASE READ !\r\n Aside from that, every destination I've thrown at has been 100% accurate .\r\nIn closing, this is a fantastic GPS with some very nice features and is very accurate in directions .\r\n Plus, I've alwa

In [26]:
# Encode the input example, obtain the summary, and decode it
example = dataset['train'][0]['review_sents']

In [27]:
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [28]:
input_ids = tokenizer.encode("summarize: " + example, return_tensors="pt", max_length=512, truncation=True)
input_ids

tensor([[21603,    10,     3,     6,    11,    19,   182,     6,   182,  4034,
             3,     5,    68,    21,     8,   167,   294,     6,    62,   253,
            24,     8,  3121,  1109,   889,   795,  4034,  7943,     6,   213,
          3258,    62,  8286,    12,   281,     3,     5,   100,  1681,    19,
            59,  4034,     3,    99,    25,   278,    31,    17,  1175,    34,
            16,  3322,  2175,   497,     6,   116,    25,  1190,    44,     8,
         23291,    49, 13384,  3820,    21,  3074,    11,    12,   577,    80,
            13,   273,     3, 11665,  3537,  1031,    28,     8,     3,    17,
            15,    15,     7,     3,     5,    94,   795,  5299, 10336,     3,
            99,     8,  2981,    45,     8,   367,  2828,   478,    47, 27801,
            42, 12468,    57,    46, 22820,     3,     5,    27,    31,   162,
           261,   119,  9679,  3173,     6,    38,   168,    38,  9679,  1192,
           139,  2948,    11,    12,    48,   239,  

In [29]:
summary_ids = model.generate(input_ids, max_length=150)
summary_ids

tensor([[    0,    48,  1681,    19,    59,  4034,     3,    99,    25,   278,
            31,    17,  1175,    34,    16,  3322,  2175,   497,     6,   116,
            25,  1190,    44,     8, 23291,    49, 13384,  3820,    21,  3074,
            11,    12,   577,    80,    13,   273,     3, 11665,  3537,  1031,
            28,     8,     3,    17,    15,    15,     7,     3,     5,    34,
           795,  5299, 10336,     3,    99,     8,  2981,    45,     8,   367,
          2828,   478,    47, 27801,    42, 12468,    57,    46, 22820,     3,
             5,     1]])

In [30]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nOriginal Text (first 400 characters): \n", example[:400])
print("\nGenerated Summary: \n", summary)


Original Text (first 400 characters): 
 , and is very, very accurate .
 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .
 This function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle games with the tees .
 It provides immediate alternatives if the route from the online map progra

Generated Summary: 
 this function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle games with the tees. it provides immediate alternatives if the route from the online map program was inaccurate or blocked by an obstacle.


In [31]:
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer and the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [32]:
english_inputs = ["Hello", "Thank you", "How are you?", "Sorry", "Goodbye"]

# Encode the inputs, generate translations, decode, and print them
for english_input in english_inputs:
    print('english_input', english_input)
    input_ids = tokenizer.encode(english_input, return_tensors="pt")
    print('input_ids', input_ids)
    translated_ids = model.generate(input_ids)
    print('translated_ids', translated_ids)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"English: {english_input} | Spanish: {translated_text}")
    print('---------------------------------')
    
# the additional tokens generated in the input_ids and translated_ids are start/end of sequences or special characters (like punctuation)

english_input Hello
input_ids tensor([[3923,    0]])
translated_ids tensor([[65000,  2119,     3,     0]])
English: Hello | Spanish: Hola.
---------------------------------
english_input Thank you
input_ids tensor([[1825,   40,    0]])
translated_ids tensor([[65000,  1124,     3,     0]])
English: Thank you | Spanish: Gracias.
---------------------------------
english_input How are you?
input_ids tensor([[594,  53,  40,  21,   0]])
translated_ids tensor([[65000,    50,  1102,  1221,    21,     0]])
English: How are you? | Spanish: ¿Cómo estás?
---------------------------------
english_input Sorry
input_ids tensor([[5099,    0]])
translated_ids tensor([[65000,   350,  1669,     3,     0]])
English: Sorry | Spanish: Lo siento.
---------------------------------
english_input Goodbye
input_ids tensor([[22191,     0]])
translated_ids tensor([[65000,  8631,     3,     0]])
English: Goodbye | Spanish: Adiós.
---------------------------------


In [33]:
# Load a specific subset of the dataset 
mlqa = load_dataset("xtreme", name="MLQA.en.en")

mlqa

DatasetDict({
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11590
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1148
    })
})

In [34]:
question = mlqa["test"]["question"][1]
context = mlqa["test"]["context"][1]
print("Question: ", question)
print("Context: ", context)

Question:  who represented robert frost and walter kasza in their suit?
Context:  In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to e

In [35]:
model_ckp = "deepset/minilm-uncased-squad2"

# Initialize the tokenizer using the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckp)

tokenizer

BertTokenizerFast(name_or_path='deepset/minilm-uncased-squad2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [36]:
# Tokenize the inputs returning the result as tensors
inputs = tokenizer(question, context, return_tensors="pt")

inputs

{'input_ids': tensor([[  101,  2040,  3421,  2728, 10097,  1998,  4787, 10556, 17112,  2050,
          1999,  2037,  4848,  1029,   102,  1999,  2807,  1010,  2274, 13294,
          6831, 16728,  1998,  1996, 24835,  1997, 16728,  4787, 10556, 17112,
          2050,  1998,  2728, 10097, 12923,  1996, 18531,  1998,  1996,  2142,
          2163,  4483,  3860,  4034,  1012,  2037,  4848,  1010,  1999,  2029,
          2027,  2020,  3421,  2011,  2577,  2899,  2118,  2375,  2934,  5655,
         10722, 12866,  1010,  6884,  2027,  2018,  2042,  2556,  2043,  2312,
         12450,  1997,  4242, 12141,  2018,  2042,  5296,  1999,  2330, 14496,
          1998, 19874,  2012, 18087,  1012, 16012,  4523,  3111,  2579,  2013,
          1996, 17612, 11390,  2020, 16578,  2011, 18607,  2118, 16012, 24229,
          2015,  1010,  2040,  2179,  2152,  3798,  1997,  4487, 11636,  2378,
          1010,  4487, 10609,  6844, 27942,  2319,  1010,  1998, 13012,  2818,
         10626,  8913, 16921, 11474,  

In [37]:
from transformers import AutoModelForQuestionAnswering

# Initialize the LLM upon the model checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

model

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, 

In [38]:
import torch

with torch.no_grad():
  # Forward-pass the input through the model
  outputs = model(**inputs)

outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.4820, -5.9404, -6.2120, -6.1143, -6.5224, -6.4951, -6.2917, -6.3820,
         -6.5194, -6.7537, -6.2008, -6.3057, -6.4222, -6.4810, -0.4820, -5.5818,
         -5.7127, -6.4838, -5.9010, -6.0856, -6.0827, -6.3097, -6.4831, -6.1578,
         -6.2008, -6.4419, -5.9325, -5.5235, -5.9732, -6.4012, -6.6175, -6.5242,
         -6.0292, -6.4287, -6.0237, -5.4262, -4.9336, -6.4591, -5.5458, -4.5965,
         -6.0965, -5.3848, -6.1986, -5.7813, -5.3709, -0.0967, -3.6305, -5.5801,
         -2.8863, -5.5696,  0.0861, -5.0735, -2.8041, -3.2378,  6.5888, -0.5763,
         -1.1910,  2.4895,  0.1463,  4.1906, -1.8216, -0.0463, -5.4365, -5.1538,
         -5.5951, -6.0921, -6.2618, -6.3710, -6.2747, -5.7789, -6.4970, -6.4495,
         -6.2545, -6.6000, -6.4817, -6.4268, -6.4884, -6.3635, -6.2562, -6.5128,
         -6.4415, -6.6112, -6.3148, -6.3446, -6.8477, -5.9043, -6.4874, -6.6654,
         -6.4378, -6.4005, -6.3099, -6.2191, -6.7656, -6

In [39]:
# Get the most likely start and end answer position from the raw LLM outputs
start_idx = torch.argmax(outputs.start_logits)
end_idx = torch.argmax(outputs.end_logits) + 1

start_idx, end_idx

(tensor(54), tensor(62))

In [40]:
# Access the tokenized inputs tensor to get the answer span
answer_span = inputs["input_ids"][0][start_idx:end_idx]

answer_span

tensor([ 2577,  2899,  2118,  2375,  2934,  5655, 10722, 12866])

In [41]:
# Decode the answer span to get the extracted answer text
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)

Answer:  george washington university law professor jonathan turley


In [43]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available! You have", torch.cuda.device_count(), "GPU(s).")
    print("GPU Device Name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")


CUDA is available! You have 1 GPU(s).
GPU Device Name: NVIDIA GeForce RTX 2070 with Max-Q Design


In [59]:
import time

# Generate a random matrix
size = 15000 # recommended to set max 15000 to test on CPU, max 35000 on GPU (matrix multiplication computation = size^3)
a = torch.randn(size, size)
b = torch.randn(size, size)

# Test on CPU 
start_time = time.time()
c_cpu = torch.matmul(a, b)
cpu_time = time.time() - start_time
print(f"CPU Time: {cpu_time:.4f} seconds")

# Generate a random matrix
size = 35000 # recommended to set max 15000 to test on CPU, max 35000 on GPU (matrix multiplication computation = size^3)
a = torch.randn(size, size)
b = torch.randn(size, size)

# Test on GPU
if torch.cuda.is_available():
    a_gpu = a.to('cuda')
    b_gpu = b.to('cuda')
    start_time = time.time()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    gpu_time = time.time() - start_time
    print(f"GPU Time: {gpu_time:.4f} seconds")
else:
    print("CUDA is not available.")

CPU Time: 20.3074 seconds
GPU Time: 0.4351 seconds


In [42]:
model_name = "distilbert-base-uncased"

# Load a pre-trained LLM, specifying its use for binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [60]:
from transformers import TrainingArguments

# Set up training arguments with a batch size of 8 per GPU and 5 epochs
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8, # batch size for training on each device (e.g., GPU).
    num_train_epochs=5,
)

training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_c

In [61]:
from transformers import Trainer

tokenized_datasets = []

# Set up trainer, assigning previously set up training arguments
# When using the Trainer class from Hugging Face Transformers with TrainingArguments, the model, data, and training steps are handled for you. If CUDA is available and detected, the Trainer will automatically move the model and data to the GPU. You don't need to manually set .to(device) on the model.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer

<transformers.trainer.Trainer at 0x165dec32290>

In [62]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [63]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [126]:
# Load your dataset
dataset = load_dataset('emotion', trust_remote_code=True)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [153]:
from datasets import load_dataset, DatasetDict

# Function to split each subset into 10
def get_subset(dataset, split_name, fraction=0.1): # it means 0.1 (10) used for the train
    return dataset[split_name].train_test_split(train_size=fraction, seed=42)['train'] # so we use the 'train' subset (10%) of the dataset

# Create smaller dataset with 10% of each original subset
small_dataset = DatasetDict({
    'train': get_subset(dataset, 'train'),
    'validation': get_subset(dataset, 'validation'),
    'test': get_subset(dataset, 'test')
})

print(small_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
})


In [154]:
# Get the unique labels
unique_labels = small_dataset['train'].features['label'].names

# Print the number of unique labels and their names
print(f"Number of unique labels: {len(unique_labels)}")
print(f"Labels: {unique_labels}")

Number of unique labels: 6
Labels: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [155]:
# Print the first text string in the training set
print(small_dataset['train'][0]['text'], small_dataset['train'][0]['label'])

i feel like im the only one whos caring about whats good for me right now 2


In [156]:
# Print a few examples with their text and label
for i in range(5):
    text = small_dataset['train'][i]['text']
    label_index = small_dataset['train'][i]['label']
    label_text = unique_labels[label_index]
    print(f"Example {i+1}: Text: {text} | Lavel_index: {label_index} Label: {label_text}")

Example 1: Text: i feel like im the only one whos caring about whats good for me right now | Lavel_index: 2 Label: love
Example 2: Text: im feeling determined now to push through any hiccups and reach my ultimate goal of being within the healthy weight range kg for my height | Lavel_index: 1 Label: joy
Example 3: Text: i just feel more dazed and alone in the end | Lavel_index: 5 Label: surprise
Example 4: Text: i feel there is also a difference between loving someone and being in love with someone | Lavel_index: 2 Label: love
Example 5: Text: im home alone with my son and im feeling sad | Lavel_index: 0 Label: sadness


In [157]:
# Encode your dataset
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

emotions_encoded = small_dataset.map(encode, batched=True) # by using the .map function, you're applying the encode function (tokenization) to every sample in your dataset.

emotions_encoded


Map: 100%|██████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 2361.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [158]:
# Initialize the trainer and assign a training and validation set to it
trainer = Trainer(model=model, args=training_args,
    			compute_metrics=compute_metrics,
    			train_dataset=emotions_encoded["train"],
    			eval_dataset=emotions_encoded["validation"],
    			tokenizer=tokenizer
)

trainer

<transformers.trainer.Trainer at 0x165f9f785b0>

In [159]:
# Print the keys of the first example in the training dataset
print(emotions_encoded["train"][0].keys())

# Tokenizing single sequences doesn't require segment differentiation, hence no 'token_type_ids'.

dict_keys(['text', 'label', 'input_ids', 'attention_mask'])


In [102]:
# unique_labels = set()
# for example in emotions_encoded["train"]:
#     unique_labels.add(example["label"])
# print(f"Unique labels: {sorted(list(unique_labels))}")

In [160]:
# Training loop to fine-tune the model

trainer.train()

Step,Training Loss
500,0.2709
1000,0.0314


2024/10/12 15:54:30 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 1b816ed5dc594b8aab83cb89da609b77: Failed to log run data: Exception: Changing param values is not allowed. Param with key='problem_type' was already logged with value='None' for run ID='1b816ed5dc594b8aab83cb89da609b77'. Attempted logging new value 'single_label_classification'.


TrainOutput(global_step=1000, training_loss=0.15114902210235595, metrics={'train_runtime': 539.185, 'train_samples_per_second': 14.837, 'train_steps_per_second': 1.855, 'total_flos': 1059814785024000.0, 'train_loss': 0.15114902210235595, 'epoch': 5.0})

In [161]:
input_texts = ["It's dark and rainy outside", "I love penguins!", "I'm scared to enter there", "It's really unexpected", "That makes me feel good"]

# Tokenize the input sequences and pass them to the model
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()} # move tokenized tensors to device

inputs

{'input_ids': tensor([[  101,  2009,  1005,  1055,  2601,  1998, 16373,  2648,   102],
         [  101,  1045,  2293, 18134,   999,   102,     0,     0,     0],
         [  101,  1045,  1005,  1049,  6015,  2000,  4607,  2045,   102],
         [  101,  2009,  1005,  1055,  2428,  9223,   102,     0,     0],
         [  101,  2008,  3084,  2033,  2514,  2204,   102,     0,     0]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0]], device='cuda:0')}

In [162]:
model.to(device) # Move model to the device (cuda)

with torch.no_grad():
    outputs = model(**inputs)

outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.3327, -1.0188, -3.3314,  3.3021, -1.0555, -5.1176],
        [-2.0071, -0.0838,  5.6982, -1.9928, -3.1754, -2.0236],
        [-2.0161, -1.7759, -3.0611, -1.4618,  6.8023, -2.1666],
        [-2.6330, -1.8602, -2.7156, -2.1896,  1.2098,  4.5404],
        [-2.6366,  7.8942, -1.6927, -2.4569, -2.4308, -2.4874]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [163]:
# Obtain class labels from raw predictions
predicted_label_index = torch.argmax(outputs.logits, dim=1).tolist()

predicted_label_index

[3, 2, 4, 5, 1]

In [164]:
for i, predicted_label in enumerate(predicted_label_index):
    print(f"\n Input Text {i + 1}: {input_texts[i]}")
    print(f"Predicted Label Index: {predicted_label_index[i]}")
    print(f"Predicted Label: {unique_labels[predicted_label_index[i]]}")



 Input Text 1: It's dark and rainy outside
Predicted Label Index: 3
Predicted Label: anger

 Input Text 2: I love penguins!
Predicted Label Index: 2
Predicted Label: love

 Input Text 3: I'm scared to enter there
Predicted Label Index: 4
Predicted Label: fear

 Input Text 4: It's really unexpected
Predicted Label Index: 5
Predicted Label: surprise

 Input Text 5: That makes me feel good
Predicted Label Index: 1
Predicted Label: joy


In [167]:
sentiment_analysis = pipeline("sentiment-analysis")

test_examples = [{'text': 'I love this product!', 'label': 1},
                 {'text': 'The service was terrible.', 'label': 0},
                 {'text': 'This movie is amazing.', 'label': 1},
                 {'text': "I'm disappointed with the quality.", 'label': 0},
                ]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [168]:
# Pass the four input texts (without labels) to the pipeline
predictions = sentiment_analysis([example["text"] for example in test_examples])

predictions

[{'label': 'POSITIVE', 'score': 0.9998855590820312},
 {'label': 'NEGATIVE', 'score': 0.9996507167816162},
 {'label': 'POSITIVE', 'score': 0.9998838901519775},
 {'label': 'NEGATIVE', 'score': 0.9997726082801819}]

In [174]:
true_labels = [example["label"] for example in test_examples]
true_labels

[1, 0, 1, 0]

In [175]:
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]
predicted_labels

[1, 0, 1, 0]

In [176]:
from sklearn.metrics import accuracy_score

# assuming true_labels and predicted_labels are defined
result = accuracy_score(true_labels, predicted_labels)
print(result)

1.0


In [177]:
import evaluate
# Load the accuracy metric
accuracy = evaluate.load("accuracy")

result = accuracy.compute(references=true_labels, predictions=predicted_labels)
print(result)

{'accuracy': 1.0}


In [178]:
# Load the accuracy, precision, recall and F1 score metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Obtain a description of each metric
print(accuracy.description)
print(precision.description)
print(recall.description)
print(f1.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative


Precision is the fraction of correctly labeled positive examples out of all of the examples that were labeled as positive. It is computed via the equation:
Precision = TP / (TP + FP)
where TP is the True positives (i.e. the examples correctly labeled as positive) and FP is the False positive examples (i.e. the examples incorrectly labeled as positive).


Recall is the fraction of the positive examples that were correctly labeled by the model as positive. It can be computed with the equation:
Recall = TP / (TP + FN)
Where TP is the true positives and FN is the false negatives.


The F1 score is the harmonic mean of the precision and recall. It can be computed with the equation:
F1 = 2 * (precision * recall) / (precision + recall)



In [180]:
test_examples = [
    "Fantastic hotel, exceeded expectations!",
    "Quiet despite central location, great stay.",
    "Friendly staff, welcoming atmosphere.",
    "Spacious, comfy room—a perfect retreat.",
    "Cleanliness could improve, overall decent stay.",
    "Disappointing stay, noisy and unclean room.",
    "Terrible service, unfriendly staff, won't return."
]

test_labels = [1, 1, 1, 1, 0, 0, 0]

In [181]:
predictions = sentiment_analysis([example for example in test_examples])
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

predicted_labels

[1, 1, 1, 1, 1, 0, 0]

In [182]:
# Compute the metrics by comparing real and predicted labels
print(f1.compute(references=test_labels, predictions=predicted_labels))
print(precision.compute(references=test_labels, predictions=predicted_labels))
print(recall.compute(references=test_labels, predictions=predicted_labels))

{'f1': 0.888888888888889}
{'precision': 0.8}
{'recall': 1.0}


In [183]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [199]:
prompt = "Current stock data show that by 2030 "

# Encode the prompt, generate text and decode it
prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

prompt_ids

tensor([[11297,  4283,  1366,   905,   326,   416, 25054,   220]])

In [200]:
output = model.generate(prompt_ids, max_length=50)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[11297,  4283,  1366,   905,   326,   416, 25054,   220,  1849,  1169,
          2811,  1605,   481,   423,   257,  2010,  2861,   286,   720,    16,
            13,    18, 12989,    11,   393,   720,    16,    13,    18, 12989,
           517,   621,   262,  2811,  1605,   287,   262,   938,  5707,    13,
           198,   464,  2811,  1605,   481,   423,   257,  2010,  2861,   286]])

In [201]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)

Generated Text:  Current stock data show that by 2030  the average American will have a net worth of $1.3 trillion, or $1.3 trillion more than the average American in the last decade.
The average American will have a net worth of


In [214]:
import numpy as np
# Load and compute the perplexity score
perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(model_id=model_name,
                             predictions=generated_text)
print("Perplexity: ", results['mean_perplexity'])
print(len(results['perplexities']))
print(len(generated_text))
results['perplexities'][:10]

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 40.29it/s]

Perplexity:  3308.342305777208
212
212





[139.0388641357422,
 2412.967041015625,
 2127.130126953125,
 2127.130126953125,
 1623.478515625,
 1117.8956298828125,
 1159.115234375,
 12869.310546875,
 503.7154846191406,
 1159.115234375]

In [215]:
# if using squared brackets around 'generated_text' to compute the perplexity

results = perplexity.compute(model_id=model_name, 
                             predictions=[generated_text])

# Print results
print("Perplexity: ", results['mean_perplexity'])
print(len(results['perplexities']))
print(len(generated_text))
results['perplexities']

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.40it/s]

Perplexity:  8.184481620788574
1
212





[8.184481620788574]

In [216]:
# ! pip install rouge_score

# Load the rouge metric
rouge = evaluate.load("rouge")

predictions = ["""Pluto is a dwarf planet in our solar system, located in the Kuiper Belt beyond Neptune, and was formerly considered the ninth planet until its reclassification in 2006."""]
references = ["""Pluto is a dwarf planet in the solar system, located in the Kuiper Belt beyond Neptune, and was previously deemed as a planet until it was reclassified in 2006."""]

# Calculate the rouge scores between the predicted and reference summaries
results = rouge.compute(predictions=predictions,references=references)
print("ROUGE results: ", results)

ROUGE results:  {'rouge1': 0.7719298245614034, 'rouge2': 0.6181818181818182, 'rougeL': 0.736842105263158, 'rougeLsum': 0.736842105263158}


In [217]:
meteor = evaluate.load("meteor")

llm_outputs = ["He thought it right and necessary to become a knight-errant, roaming the world in armor, seeking adventures and practicing the deeds he had read about in chivalric tales."]
references = ["He believed it was proper and essential to transform into a knight-errant, traveling the world in armor, pursuing adventures, and enacting the heroic deeds he had encountered in tales of chivalry."]

# Compute and print the METEOR score
results = meteor.compute(predictions=llm_outputs, references=references)
print("Meteor: ", results)

Downloading builder script: 100%|█████████████████████████████████████████████████████████| 7.02k/7.02k [00:00<?, ?B/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Meteor:  {'meteor': 0.5350702240481536}


In [218]:
exact_match = evaluate.load("exact_match")

predictions = ["The cat sat on the mat.", "Theaters are great.", "It's like comparing oranges and apples."]
references = ["The cat sat on the mat?", "Theaters are great.", "It's like comparing apples and oranges."]

# Compute the exact match and print the results
results = exact_match.compute(references=references, predictions=predictions)
print("EM results: ", results)

EM results:  {'exact_match': 0.3333333333333333}


In [219]:
input_sentence_1 = "Hola, ¿cómo estás?"

reference_1 = [
     ["Hello, how are you?", "Hi, how are you?"]
     ]

input_sentences_2 = ["Hola, ¿cómo estás?", "Estoy genial, gracias."]

references_2 = [
     ["Hello, how are you?", "Hi, how are you?"],
     ["I'm great, thanks.", "I'm great, thank you."]
     ]

In [None]:
# The reason why there are multiple reference sentences for each input sentence is because of the inherent ambiguity and variability in translation. There can be several equally correct translations for a given sentence, depending on factors like context, tone, and style. By providing multiple reference translations, we can capture some of this variability and get a more robust estimate of the model’s performance.

# In the code you posted, the BLEU score is being calculated for the translations. The BLEU score is a metric that measures the quality of a translation by comparing it to one or more reference translations. It does this by counting the number of n-gram matches between the translation and the reference(s), and then normalizing by the total number of n-grams in the translation. The more the translation resembles the reference(s), the higher the BLEU score will be.

# In your example, the first input sentence “Hola, ¿cómo estás?” is translated and then the translation is compared to two reference translations: “Hello, how are you?” and “Hi, how are you?”. The BLEU score is then computed for this translation.

# The same process is repeated for the second set of input sentences and references. The final BLEU score is a measure of how well the translations match the reference translations

In [220]:
import evaluate
bleu = evaluate.load("bleu")

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

# Translate the first input sentence
translated_output = translator(input_sentence_1)

translated_sentence = translated_output[0]['translation_text']

print("Translated:", translated_sentence)

# Calculate BLEU metric for translation quality
results = bleu.compute(predictions=[translated_sentence], references=reference_1)
print(results)



Translated: Hey, how are you?
{'bleu': 0.7598356856515925, 'precisions': [0.8333333333333334, 0.8, 0.75, 0.6666666666666666], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 6, 'reference_length': 6}


In [221]:
# Translate the input sentences, extract the translated text, and compute BLEU score
translated_outputs = translator(input_sentences_2)

translated_outputs

[{'translation_text': 'Hey, how are you?'},
 {'translation_text': "I'm great, thanks."}]

In [222]:
predictions = [translated_output['translation_text'] for translated_output in translated_outputs]

predictions

['Hey, how are you?', "I'm great, thanks."]

In [223]:
results = bleu.compute(predictions=predictions, references=references_2)
print(results)

{'bleu': 0.8627788640890415, 'precisions': [0.9090909090909091, 0.8888888888888888, 0.8571428571428571, 0.8], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 11, 'reference_length': 11}


In [224]:
# !pip install trl # (Transformer Reinforcement Learning) library
from trl import PPOTrainer, PPOConfig, create_reference_model, AutoModelForCausalLMWithValueHead

# PPOTrainer and PPOConfig are part of the trl (Transformer Reinforcement Learning) library. They facilitate training models using the Proximal Policy Optimization (PPO) algorithm, which is a reinforcement learning technique. PPO can fine-tune transformer models to optimize for specific objectives, such as generating more human-like text.

# PPOTrainer:
# Purpose: Manages the training process using PPO.
# Functionality: Handles the policy updates and interactions between the agent (model) and environment (data).
# Usage: Trains models to improve performance by optimizing rewards through a structured approach.

# PPOConfig:
# Purpose: Configures the PPO training parameters.
# Components: Includes settings like learning rate, batch size, number of epochs, and more.
# Usage: Customizes the PPO training process to suit specific needs and goals.

model = AutoModelForCausalLMWithValueHead.from_pretrained('sshleifer/tiny-gpt2')

model

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 2)
      (wpe): Embedding(1024, 2)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-1): 2 x GPT2Block(
          (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=2, out_features=50257, bias=False)
  )
  (v_head): Value

In [228]:
# Instantiate a reference model

# When you call create_reference_model(model), it creates a copy of the model and freezes its parameters. This means that the weights of the reference model will not be updated during training.
# This reference model is then used to compare with the updated model at each step of the training process. The idea is to ensure that the policy (i.e., the behavior of the model) does not change too drastically from one update to the next

model_ref = create_reference_model(model)

model_ref

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 2)
      (wpe): Embedding(1024, 2)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-1): 2 x GPT2Block(
          (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=2, out_features=50257, bias=False)
  )
  (v_head): Value

In [225]:
# To check if the parameters of a model are frozen, you can iterate over the parameters and check their requires_grad attribute. Here’s a small function that can do this:

def check_if_frozen(model):
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name} is not frozen")
        else:
            print(f"{name} is frozen")

In [226]:
check_if_frozen(model)

pretrained_model.transformer.wte.weight is not frozen
pretrained_model.transformer.wpe.weight is not frozen
pretrained_model.transformer.h.0.ln_1.weight is not frozen
pretrained_model.transformer.h.0.ln_1.bias is not frozen
pretrained_model.transformer.h.0.attn.c_attn.weight is not frozen
pretrained_model.transformer.h.0.attn.c_attn.bias is not frozen
pretrained_model.transformer.h.0.attn.c_proj.weight is not frozen
pretrained_model.transformer.h.0.attn.c_proj.bias is not frozen
pretrained_model.transformer.h.0.ln_2.weight is not frozen
pretrained_model.transformer.h.0.ln_2.bias is not frozen
pretrained_model.transformer.h.0.mlp.c_fc.weight is not frozen
pretrained_model.transformer.h.0.mlp.c_fc.bias is not frozen
pretrained_model.transformer.h.0.mlp.c_proj.weight is not frozen
pretrained_model.transformer.h.0.mlp.c_proj.bias is not frozen
pretrained_model.transformer.h.1.ln_1.weight is not frozen
pretrained_model.transformer.h.1.ln_1.bias is not frozen
pretrained_model.transformer.h.1

In [229]:
check_if_frozen(model_ref)

pretrained_model.transformer.wte.weight is frozen
pretrained_model.transformer.wpe.weight is frozen
pretrained_model.transformer.h.0.ln_1.weight is frozen
pretrained_model.transformer.h.0.ln_1.bias is frozen
pretrained_model.transformer.h.0.attn.c_attn.weight is frozen
pretrained_model.transformer.h.0.attn.c_attn.bias is frozen
pretrained_model.transformer.h.0.attn.c_proj.weight is frozen
pretrained_model.transformer.h.0.attn.c_proj.bias is frozen
pretrained_model.transformer.h.0.ln_2.weight is frozen
pretrained_model.transformer.h.0.ln_2.bias is frozen
pretrained_model.transformer.h.0.mlp.c_fc.weight is frozen
pretrained_model.transformer.h.0.mlp.c_fc.bias is frozen
pretrained_model.transformer.h.0.mlp.c_proj.weight is frozen
pretrained_model.transformer.h.0.mlp.c_proj.bias is frozen
pretrained_model.transformer.h.1.ln_1.weight is frozen
pretrained_model.transformer.h.1.ln_1.bias is frozen
pretrained_model.transformer.h.1.attn.c_attn.weight is frozen
pretrained_model.transformer.h.1.a

In [230]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sshleifer/tiny-gpt2')

if tokenizer._pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer

GPT2TokenizerFast(name_or_path='sshleifer/tiny-gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [231]:
# Initialize trainer configuration
# this code is setting up a configuration for a PPO trainer with specific batch and mini-batch sizes. This configuration would be used when training a model using the PPO algorithm. 

ppo_config = PPOConfig(batch_size=1, mini_batch_size=1)

ppo_config

PPOConfig(exp_name='ipykernel_launcher', seed=0, log_with=None, task_name=None, model_name='gpt2', query_dataset='imdb', reward_model='sentiment-analysis:lvwerra/distilbert-imdb', remove_unused_columns=True, tracker_kwargs={}, accelerator_kwargs={}, project_kwargs={}, tracker_project_name='trl', push_to_hub_if_best_kwargs={}, steps=20000, learning_rate=1.41e-05, adap_kl_ctrl=True, init_kl_coef=0.2, kl_penalty='kl', target=6, horizon=10000, gamma=1, lam=0.95, cliprange=0.2, cliprange_value=0.2, vf_coef=0.1, batch_size=1, forward_batch_size=None, mini_batch_size=1, gradient_accumulation_steps=1, world_size=None, ppo_epochs=4, max_grad_norm=None, optimize_cuda_cache=None, optimize_device_cache=False, early_stopping=False, target_kl=1, compare_steps=1, ratio_threshold=10.0, use_score_scaling=False, use_score_norm=False, score_clip=None, whiten_rewards=False, gradient_checkpointing=False, is_encoder_decoder=None, is_peft_model=None, backward_batch_size=1, global_backward_batch_size=None, gl

In [232]:
# Create a PPOTrainer instance
# this line of code is setting up a PPO trainer with a specific configuration, model, reference model, and tokenizer. The trainer can then be used to train the model using the PPO algorithm.Typically, the trainer would have a method like train() that you can call to start the training process. The training process involves repeatedly sampling data, using the data to update the model, and then evaluating the performance of the model. The goal is to improve the model’s performance on some task, such as generating text. The PPO algorithm is particularly well-suited to tasks where the data is sequential or temporal in nature. It’s also known for its stability and efficiency, which makes it a popular choice for many reinforcement learning tasks.
# N.B. Even though PPO is on-policy (it can learn only from actions taken by current policy), the reference model is not used to gather new data but to stabilize and guide the policy updates during training.

ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

ppo_trainer



<trl.trainer.ppo_trainer.PPOTrainer at 0x165ce525690>

In [233]:
prompt = "Next year, I "

input = tokenizer.encode(prompt, return_tensors="pt")

input

tensor([[10019,   614,    11,   314,   220]])

In [236]:
from trl.core import respond_to_batch

response  = respond_to_batch(model.to(device), input.to(device)) # function to generate a response from the model. The function takes the model and the encoded input as arguments.

# this code is using a pretrained language model to generate a response to a given prompt. The response is calculated by feeding the encoded input into the model and then decoding the model’s output back into text. The response represents what the model thinks is the most likely continuation of the input prompt. The exact details of how the response is calculated depend on the specifics of the model and the respond_to_batch function. 

response

tensor([[15197, 24174, 12929, 42507,  4255, 19083,  7718, 30414, 30593, 48668,
         46900, 32275, 24018, 46815, 18304, 28796,  3941,  8724, 31644, 19266]],
       device='cuda:0')

In [237]:
# In a more complex scenario, you might want to design a reward function that gives higher rewards for better responses and lower rewards for worse ones. This would require a way to evaluate the quality of the responses, which could be based on various factors such as the relevance of the response to the input, the grammatical correctness of the response, etc. This is typically the challenging part in reinforcement learning - designing a good reward function.

import torch
reward = [torch.tensor(1.0)]
reward

[tensor(1.)]

In [244]:
# Train LLM for one step with PPO

# The below function ppo_trainer.step([input[0]], [response[0]], reward) is used to update the PPO model based on the inputs, generated responses, and the received rewards. By iteratively calling this function with various inputs, responses, and rewards, the PPOTrainer refines the model to generate better responses over time. 

# while step() is used for a single step of training, train() is used for full-scale training over multiple epochs. The code is likely a simplified example or a debugging scenario where only a single step of training is being performed. For training a model to completion, you would generally use a train() function or similar.

train_stats = ppo_trainer.step([input[0]], [response[0]], reward) # The step function is used to perform one step of training, where the model’s parameters are updated to maximize the expected reward.

train_stats

# The train_stats dictionary contains various statistics and metrics that are calculated during the training step. Here’s a brief explanation of some of the key metrics:
# ‘objective/kl’: This is the Kullback-Leibler (KL) divergence, which measures how one probability distribution diverges from a second, expected probability distribution. In this case, it’s 0.0, indicating no divergence.
# ‘objective/logprobs’: These are the log probabilities of the actions taken by the model. They are used in the calculation of the policy gradient.
# ‘ppo/mean_scores’: This is the mean of the scores (rewards) obtained during the training step.
# ‘tokens/queries_len_mean’: This is the average length of the queries processed in the training step.
# ‘ppo/loss/policy’, ‘ppo/loss/value’, ‘ppo/loss/total’: These are the losses for the policy, value function, and the total loss respectively. The policy loss is related to how well the model is doing in terms of taking the right actions. The value loss is related to how well the model is predicting the expected future rewards.
# ‘ppo/policy/entropy’: This is the entropy of the policy. It’s a measure of the randomness of the policy. A higher entropy means the policy is more random, while a lower entropy means the policy is more deterministic.
# ‘ppo/returns/mean’: This is the mean of the returns (sum of rewards) obtained during the training step.
# ‘ppo/val/vpred’: This is the predicted value of the state by the model.
# ‘time/ppo/total’: This is the total time taken for the training step.
# The warnings about degrees of freedom being less than or equal to 0 are due to the standard deviation (std()) function being called on a dataset with insufficient size. This can happen when the batch size or mini-batch size is too small.

  std_scores = data["scores"].std()
  stats["tokens/queries_len_std"] = torch.std(query_lens).cpu().numpy().item()
  stats["tokens/responses_len_std"] = torch.std(response_lens).cpu().numpy().item()


{'objective/kl': 0.0,
 'objective/kl_dist': 0.0,
 'objective/logprobs': array([[-10.803115, -10.834714, -10.795107, -10.808006, -10.81736 ,
         -10.846797, -10.826721, -10.803664, -10.758843, -10.889929,
         -10.850676, -10.812679, -10.761826, -10.849661, -10.760221,
         -10.799231, -10.85004 , -10.812998, -10.813044, -10.78842 ,
         -10.863429, -10.875851, -10.839982, -10.794012]], dtype=float32),
 'objective/ref_logprobs': array([[-10.803115, -10.834714, -10.795107, -10.808006, -10.81736 ,
         -10.846797, -10.826721, -10.803664, -10.758843, -10.889929,
         -10.850676, -10.812679, -10.761826, -10.849661, -10.760221,
         -10.799231, -10.85004 , -10.812998, -10.813044, -10.78842 ,
         -10.863429, -10.875851, -10.839982, -10.794012]], dtype=float32),
 'objective/kl_coef': 0.2,
 'objective/entropy': 216.41537475585938,
 'ppo/mean_non_score_reward': 0.0,
 'ppo/mean_scores': 1.0,
 'ppo/std_scores': nan,
 'tokens/queries_len_mean': 5.0,
 'tokens/querie

In [245]:
# Number of training steps
num_steps = 100

# The example below is a semplification. In reality, input, response, and reward should be dynamically generated and updated, so that the training process will be much more meaningful and reflective of the model’s performance

# Training loop
for i in range(1, num_steps+1):
    # Generate input and response here...
    # ...
    reward = [torch.tensor(1.0)]
    train_stats = ppo_trainer.step([input[0]], [response[0]], reward)
    if i % 10 == 0:
        print(i, 'training steps')
    
train_stats


10 training steps
20 training steps
30 training steps
40 training steps
50 training steps
60 training steps
70 training steps
80 training steps
90 training steps
100 training steps


{'objective/kl': -0.1894083023071289,
 'objective/kl_dist': -0.1894083023071289,
 'objective/logprobs': array([[-10.804715, -10.814219, -10.793684, -10.821551, -10.827355,
         -10.82836 , -10.841099, -10.872051, -10.734868, -10.863733,
         -10.8736  , -10.789813, -10.740524, -10.825425, -10.780062,
         -10.819388, -10.859491, -10.851975, -10.810377, -10.876074,
         -10.83882 , -10.882045, -10.817121, -10.872608]], dtype=float32),
 'objective/ref_logprobs': array([[-10.803115, -10.834714, -10.795107, -10.808006, -10.81736 ,
         -10.846797, -10.826721, -10.803664, -10.758843, -10.889929,
         -10.850676, -10.812679, -10.761826, -10.849661, -10.760221,
         -10.799231, -10.85004 , -10.812998, -10.813044, -10.78842 ,
         -10.863429, -10.875851, -10.839982, -10.794012]], dtype=float32),
 'objective/kl_coef': 0.1996003957414051,
 'objective/entropy': 216.6047821044922,
 'ppo/mean_non_score_reward': 0.001890298561193049,
 'ppo/mean_scores': 1.0,
 'ppo/std

In [246]:
# Toxicity refers to the presence of harmful or offensive language within a text.
# Calculate the individual toxicities, maximum toxicities, and toxicity ratios

emp_1 = ["Everyone in the team adores him",
           "He is a true genius, pure talent"]
emp_2 = ["Nobody in the team likes him",
           "He is a useless 'good-for-nothing'"]

In [247]:
from evaluate import load

toxicity_metric = load("toxicity")

toxicity_1 = toxicity_metric.compute(predictions=emp_1)
toxicity_2 = toxicity_metric.compute(predictions=emp_2)
print("Toxicities (emp. 1):", toxicity_1['toxicity'])
print("Toxicities (emp. 2): ", toxicity_2['toxicity'])



Toxicities (emp. 1): [0.0001386617950629443, 0.00013368602958507836]
Toxicities (emp. 2):  [0.00014245195779949427, 0.010071290656924248]


In [248]:
toxicity_1_max = toxicity_metric.compute(predictions=emp_1, aggregation="maximum")
toxicity_2_max = toxicity_metric.compute(predictions=emp_2, aggregation="maximum")
print("Maximum toxicity (emp. 1):", toxicity_1_max['max_toxicity'])
print("Maximum toxicity (emp. 2): ", toxicity_2_max['max_toxicity'])

Maximum toxicity (emp. 1): 0.0001386617950629443
Maximum toxicity (emp. 2):  0.010071290656924248


In [249]:
toxicity_1_ratio = toxicity_metric.compute(predictions=emp_1, aggregation="ratio")
toxicity_2_ratio = toxicity_metric.compute(predictions=emp_2, aggregation="ratio")
print("Toxicity ratio (emp. 1):", toxicity_1_ratio['toxicity_ratio'])
print("Toxicity ratio (emp. 2): ", toxicity_2_ratio['toxicity_ratio'])

Toxicity ratio (emp. 1): 0.0
Toxicity ratio (emp. 2):  0.0


In [250]:
group1 = ['abc are described as loyal employees', 
          'abc are ambitious in their career expectations']
group2 = ['abc are known for causing lots of team conflicts',
          'abc are verbally violent']

In [251]:
import evaluate

# Load the regard and regard-comparison metrics
# The code is measuring the "regard" or sentiment (positivity/negativity) of two groups of sentences using a pre-trained sentiment evaluation model.

regard = evaluate.load("regard")
regard_comp = evaluate.load("regard", "compare")
# regard

In [252]:
# Compute the regard (polarities) of each group separately

polarity_results_1 = regard.compute(data=group1)
print("Polarity in group 1:\n", polarity_results_1)

polarity_results_2 = regard.compute(data=group2)
print("Polarity in group 2:\n", polarity_results_2)


Polarity in group 1:
 {'regard': [[{'label': 'positive', 'score': 0.9098387956619263}, {'label': 'neutral', 'score': 0.05939687788486481}, {'label': 'other', 'score': 0.02646809071302414}, {'label': 'negative', 'score': 0.004296251572668552}], [{'label': 'positive', 'score': 0.7809810638427734}, {'label': 'neutral', 'score': 0.18085992336273193}, {'label': 'other', 'score': 0.030492978170514107}, {'label': 'negative', 'score': 0.007666022051125765}]]}
Polarity in group 2:
 {'regard': [[{'label': 'negative', 'score': 0.9658734202384949}, {'label': 'other', 'score': 0.021555889397859573}, {'label': 'neutral', 'score': 0.012026479467749596}, {'label': 'positive', 'score': 0.0005441228277049959}], [{'label': 'negative', 'score': 0.9774737358093262}, {'label': 'other', 'score': 0.012994571588933468}, {'label': 'neutral', 'score': 0.008945493958890438}, {'label': 'positive', 'score': 0.0005862839752808213}]]}


In [253]:
# Compute the relative regard between the two groups for comparison

polarity_results_comp = regard_comp.compute(data=group1, references=group2)
print("Polarity comparison between groups:\n", polarity_results_comp)

Polarity comparison between groups:
 {'regard_difference': {'positive': 0.844844726350857, 'neutral': 0.10964241391047835, 'other': 0.011205303948372602, 'negative': -0.9656924412120134}}
