In [1]:
import torch  
import torch.nn as nn  
import math  

# Import the function for loading Hugging Face pipelines
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Load the pipeline for sentiment classification
classifier = pipeline("text-classification", model=model_name)
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x2750373b250>

In [3]:
prompt = "The food was good, but service at the restaurant was a bit slow"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '3 stars', 'score': 0.6387940645217896}]


In [4]:
prompt = "The food was good, everything well organized"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '4 stars', 'score': 0.5190770030021667}]


In [5]:
model_name = 'cnicu/t5-small-booksum'

# Load the model pipeline for text summarization
summarizer = pipeline('summarization', model = model_name)
summarizer

<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x27523984c10>

In [6]:
long_text = '\nThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\n'

# Pass the long text to the model to summarize it
outputs = summarizer(long_text, max_length = 30)
outputs

[{'summary_text': 'the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey'}]

In [7]:
# Access and print the summarized text in the outputs variable
print(outputs[0]['summary_text'])

the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey


In [8]:
# Load the model pipeline for question-answering.
# The checkpoint and revision are named explicitly (they are the ones the
# pipeline otherwise falls back to) so that the cell does not depend on the
# library's default model, which may change between releases.
qa_model = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="626af31",
)

# Another question to try against the same context:
#   "For how long was the Eiffel Tower the tallest man-made structure in the world?"
question = "What's the tallest structure in France?"

# Pass the question and its context to the pipeline by keyword
outputs = qa_model(question=question, context=long_text)

# Access and print the answer
print(outputs['answer'])

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Millau Viaduct


In [9]:
model_name = "Helsinki-NLP/opus-mt-es-en"

input_text = "No creo que hagas una buena traducción"

# Define pipeline for Spanish-to-English translation
translator = pipeline('translation_es_to_en', model=model_name)

# Translate the input text
translations = translator(input_text)

# Access the output to print the translated text in English
print(translations[0]['translation_text'])



I don't think you're doing a good translation.


In [10]:
# Set transformer model hyperparameters
d_model = 256
n_heads = 4
num_encoder_layers = 3
num_decoder_layers = 3

In [11]:
import torch.nn as nn

# Create the transformer model and assign hyperparameters
model = nn.Transformer(
    d_model=d_model, # d_model is the dimension of the input vectors and output vectors of the model, specifically the size of the feature space. Essentially, it determines the number of features in each transformer layer
    nhead=n_heads,
    num_encoder_layers=num_encoder_layers,    
    num_decoder_layers=num_decoder_layers
    )


print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, o



In [12]:
# Create a pipeline for text generation using the gpt2 model
generator = pipeline("text-generation", model = "gpt2")

text = "I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had."

response = "Dear valued customer, I am glad to hear you had a good stay with us."



In [13]:
# Build the prompt for the text generation LLM

prompt = f"Customer review:\n{text}\n\nHotel reponse to the customer:\n{response}"

prompt

"Customer review:\nI had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.\n\nHotel reponse to the customer:\nDear valued customer, I am glad to hear you had a good stay with us."

In [14]:
#### Pass the prompt to the model pipeline
outputs = generator(prompt, max_length = 100, pad_token_id=generator.tokenizer.eos_token_id) #  if the generated text is shorter than max_length, the remaining tokens will be filled with the EOS token.

# Print the augmented sequence generated by the model
print(outputs[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Customer review:
I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.

Hotel reponse to the customer:
Dear valued customer, I am glad to hear you had a good stay with us. We love this place and have a few others in the area including a nearby boutique with a fantastic


In [15]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

model_name = "textattack/distilbert-base-uncased-SST-2"

# Load the tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='textattack/distilbert-base-uncased-SST-2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# Tokenize inputs and pass them to the model for inference
inputs = tokenizer(text, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[ 101, 1996, 2190, 3185, 1045, 1005, 2310, 2412, 3427,  999,  102,    0],
        [ 101, 2054, 2019, 9643, 3185, 1012, 1045, 9038, 3666, 2009, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
logits = outputs.logits

logits

tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>)

In [20]:
predicted_classes = torch.argmax(logits, dim=1).tolist()
predicted_classes

[1, 0]

In [21]:
for idx, predicted_class in enumerate(predicted_classes):
    print(f"Predicted class for \"{text[idx]}\": {predicted_class}")

Predicted class for "The best movie I've ever watched!": 1
Predicted class for "What an awful movie. I regret watching it.": 0


In [22]:
from datasets import load_dataset

# Load a dataset from Hugging Face's dataset hub
dataset = load_dataset('opinosis', trust_remote_code=True)

dataset


DatasetDict({
    train: Dataset({
        features: ['review_sents', 'summaries'],
        num_rows: 51
    })
})

In [23]:
print(f"Number of instances: {len(dataset['train'])}")


Number of instances: 51


In [24]:
# Show the names of features in the training fold of the dataset
print(f"Feature names: {dataset['train'].column_names}")

Feature names: ['review_sents', 'summaries']


In [46]:
print(dataset['train'].shape)
print(dataset['train'][0].keys())
dataset['train'][0]

(51, 2)
dict_keys(['review_sents', 'summaries'])


{'review_sents': ", and is very, very accurate .\r\n but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\r\n This function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle games with the tees .\r\n It provides immediate alternatives if the route from the online map program was inaccurate or blocked by an obstacle .\r\n I've used other GPS units, as well as GPS built into cars   and to this day NOTHING beats the accuracy of a Garmin GPS .\r\n It got me from point A to point B with 100% accuracy everytime .\r\n It has yet to disappoint, getting me everywhere with 100% accuracy .\r\n0 out of 5 stars Honest, accurate review, , PLEASE READ !\r\n Aside from that, every destination I've thrown at has been 100% accurate .\r\nIn closing, this is a fantastic GPS with some very nice features and is very accurate in directions .\r\n Plus, I've alwa

In [None]:
# Encode the input example, obtain the summary, and decode it
example = dataset['train'][-2]['review_sents']

In [None]:
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer

In [None]:
input_ids = tokenizer.encode("summarize: " + example, return_tensors="pt", max_length=512, truncation=True)
input_ids

In [None]:
summary_ids = model.generate(input_ids, max_length=150)
summary_ids[0]

In [None]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nOriginal Text (first 400 characters): \n", example[:400])
print("\nGenerated Summary: \n", summary)

In [None]:
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer and the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model

In [None]:
# The input and translated ID tensors have more elements than the corresponding inputs have words because of the way the tokenizer works.
# The tokenizer.encode function converts the input text into a sequence of IDs, which represent the tokens in the text. These tokens can be individual words, but they can also be smaller units depending on the tokenizer. For example, a word might be split into multiple subwords, each with its own ID.
# Additionally, special tokens are often added to the sequence, commonly at its beginning and/or end. Here, the 0 at the end of each input_ids and translated_ids tensor is likely such a special token, e.g. an end-of-sequence token.

english_inputs = ["Hello", "Thank you", "How are you?", "Sorry", "Goodbye"]

# Encode the inputs, generate translations, decode, and print them
for english_input in english_inputs:
    print('english_input', english_input)
    input_ids = tokenizer.encode(english_input, return_tensors="pt")
    print('input_ids', input_ids)
    translated_ids = model.generate(input_ids)
    print('translated_ids', translated_ids)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"English: {english_input} | Spanish: {translated_text}")

In [None]:
# Load a specific subset of the dataset 
mlqa = load_dataset("xtreme", name="MLQA.en.en")

mlqa

In [None]:
# Take the first example of the test split. The row is indexed first and the
# column second, so only that one record is read instead of the whole
# "question" / "context" columns.
first_example = mlqa["test"][0]
question = first_example["question"]
context = first_example["context"]
print("Question: ", question)
print("Context: ", context)

In [None]:
model_ckp = "deepset/minilm-uncased-squad2"

# Initialize the tokenizer using the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckp)

tokenizer

In [None]:
# Tokenize the inputs returning the result as tensors
inputs = tokenizer(question, context, return_tensors="pt")

inputs

In [None]:
from transformers import AutoModelForQuestionAnswering

# Initialize the LLM upon the model checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

model

In [None]:
import torch

with torch.no_grad():
  # Forward-pass the input through the model
  outputs = model(**inputs)

outputs

In [None]:
# Get the most likely start and end answer position from the raw LLM outputs
start_idx = torch.argmax(outputs.start_logits)
end_idx = torch.argmax(outputs.end_logits) + 1

start_idx, end_idx

In [None]:
# Access the tokenized inputs tensor to get the answer span
answer_span = inputs["input_ids"][0][start_idx:end_idx]

answer_span

In [None]:
# Decode the answer span to get the extracted answer text
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)

In [None]:
model_name = "distilbert-base-uncased"

# Load a pre-trained LLM with a six-label classification head (num_labels=6)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

model

In [None]:
from transformers import TrainingArguments

# Set up training arguments with a batch size of 8 per GPU and 5 epochs
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=5,
)

training_args

In [None]:

from transformers import Trainer

tokenized_datasets = []

# Set up trainer, assigning previously set up training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a set of evaluation predictions.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores, class axis last).
            NOTE(review): presumably the ``EvalPrediction`` that ``Trainer``
            passes to its ``compute_metrics`` callback -- confirm.

    Returns:
        dict: ``{"accuracy": <accuracy>, "f1": <weighted F1>}``.
    """
    # Imported locally so the function does not rely on an import made in
    # some other cell having been executed first.
    from sklearn.metrics import accuracy_score, f1_score

    labels = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

In [None]:
# Load your dataset
dataset = load_dataset('emotion', trust_remote_code=True)

dataset

In [None]:
# Encode your dataset
def encode(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the notebook-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and
    ``padding='max_length'``; its result is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

emotions_encoded = dataset.map(encode, batched=True)

emotions_encoded


In [None]:
# Initialize the trainer and assign a training and validation set to it
trainer = Trainer(model=model, args=training_args,
    			compute_metrics=compute_metrics,
    			train_dataset=emotions_encoded["train"],
    			eval_dataset=emotions_encoded["validation"],
    			tokenizer=tokenizer
)

trainer

In [None]:
# Print the keys of the first example in the training dataset
print(emotions_encoded["train"][0].keys())


In [None]:
unique_labels = set()
for example in emotions_encoded["train"]:
    unique_labels.add(example["label"])
print(f"Unique labels: {sorted(list(unique_labels))}")

In [None]:
# # Training loop to fine-tune the model

# trainer.train()

In [None]:
input_texts = ["It's dark and rainy outside", "I love penguins!"]

# Tokenize the input sequences and pass them to the model
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)

inputs

In [None]:
with torch.no_grad():
    outputs = model(**inputs)

outputs

In [None]:
# Obtain class labels from raw predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

predicted_labels

In [None]:
for i, predicted_label in enumerate(predicted_labels):
    print(f"\n Input Text {i + 1}: {input_texts[i]}")
    print(f"Predicted Label: {predicted_label}")

In [None]:
sentiment_analysis = pipeline("sentiment-analysis")

test_examples = [{'text': 'I love this product!', 'label': 1},
                 {'text': 'The service was terrible.', 'label': 0},
                 {'text': 'This movie is amazing.', 'label': 1},
                 {'text': "I'm disappointed with the quality.", 'label': 0}]

In [None]:
# Pass the four input texts (without labels) to the pipeline
predictions = sentiment_analysis([example["text"] for example in test_examples])

predictions

In [None]:
true_labels = [example["label"] for example in test_examples]
true_labels

In [None]:
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]
predicted_labels

In [None]:
from sklearn.metrics import accuracy_score

# assuming true_labels and predicted_labels are defined
result = accuracy_score(true_labels, predicted_labels)
print(result)

In [None]:
import evaluate
# Load the accuracy metric
accuracy = evaluate.load("accuracy")

result = accuracy.compute(references=true_labels, predictions=predicted_labels)
print(result)

In [None]:
# Load the accuracy, precision, recall and F1 score metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Obtain a description of each metric
print(accuracy.description)
print(precision.description)
print(recall.description)
print(f1.description)

In [None]:
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Load the sentiment-analysis pipeline that will be used to obtain a list of predicted labels
sentiment_analysis = pipeline("sentiment-analysis")
sentiment_analysis

In [None]:
test_examples = [
    "Fantastic hotel, exceeded expectations!",
    "Quiet despite central location, great stay.",
    "Friendly staff, welcoming atmosphere.",
    "Spacious, comfy room—a perfect retreat.",
    "Cleanliness could improve, overall decent stay.",
      "Disappointing stay, noisy and unclean room.",
    "Terrible service, unfriendly staff, won't return."
]

test_labels = [1, 1, 1, 1, 0, 0, 0]

In [None]:
predictions = sentiment_analysis([example for example in test_examples])
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

predicted_labels

In [None]:
# Compute the metrics by comparing real and predicted labels
print(f1.compute(references=test_labels, predictions=predicted_labels))
print(precision.compute(references=test_labels, predictions=predicted_labels))
print(recall.compute(references=test_labels, predictions=predicted_labels))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
prompt = "Current trends show that by 2030 "

# Encode the prompt, generate text and decode it
prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

prompt_ids

In [None]:
output = model.generate(prompt_ids, max_length=50)
output

In [None]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)

In [None]:
# Load and compute the perplexity score
perplexity = evaluate.load("perplexity", module_type="metric")
# The metric expects a list of input texts, so the single generated string
# is wrapped in a list instead of being passed as a bare string.
results = perplexity.compute(model_id=model_name,
                             predictions=[generated_text])
print("Perplexity: ", results['mean_perplexity'])

In [None]:
# ! pip install rouge_score

# Load the rouge metric
rouge = evaluate.load("rouge")

predictions = ["""Pluto is a dwarf planet in our solar system, located in the Kuiper Belt beyond Neptune, and was formerly considered the ninth planet until its reclassification in 2006."""]
references = ["""Pluto is a dwarf planet in the solar system, located in the Kuiper Belt beyond Neptune, and was previously deemed as a planet until it was reclassified in 2006."""]

# Calculate the rouge scores between the predicted and reference summaries
results = rouge.compute(predictions=predictions,references=references)
print("ROUGE results: ", results)

In [None]:
meteor = evaluate.load("meteor")

llm_outputs = ["He thought it right and necessary to become a knight-errant, roaming the world in armor, seeking adventures and practicing the deeds he had read about in chivalric tales."]
references = ["He believed it was proper and essential to transform into a knight-errant, traveling the world in armor, pursuing adventures, and enacting the heroic deeds he had encountered in tales of chivalry."]

# Compute and print the METEOR score
results = meteor.compute(predictions=llm_outputs, references=references)
print("Meteor: ", results)

In [None]:
exact_match = evaluate.load("exact_match")

predictions = ["The cat sat on the mat.", "Theaters are great.", "It's like comparing oranges and apples."]
references = ["The cat sat on the mat?", "Theaters are great.", "It's like comparing apples and oranges."]

# Compute the exact match and print the results
results = exact_match.compute(references=references, predictions=predictions)
print("EM results: ", results)

In [None]:
input_sentence_1 = "Hola, ¿cómo estás?"

reference_1 = [
     ["Hello, how are you?", "Hi, how are you?"]
     ]

input_sentences_2 = ["Hola, ¿cómo estás?", "Estoy genial, gracias."]

references_2 = [
     ["Hello, how are you?", "Hi, how are you?"],
     ["I'm great, thanks.", "I'm great, thank you."]
     ]

In [None]:
# The reason why there are multiple reference sentences for each input sentence is because of the inherent ambiguity and variability in translation. There can be several equally correct translations for a given sentence, depending on factors like context, tone, and style. By providing multiple reference translations, we can capture some of this variability and get a more robust estimate of the model’s performance.

# The cells below calculate the BLEU score for the translations. The BLEU score is a metric that measures the quality of a translation by comparing it to one or more reference translations. It does this by counting the number of n-gram matches between the translation and the reference(s), and then normalizing by the total number of n-grams in the translation. The more the translation resembles the reference(s), the higher the BLEU score will be.

# In this example, the first input sentence “Hola, ¿cómo estás?” is translated, and the translation is then compared to two reference translations: “Hello, how are you?” and “Hi, how are you?”. The BLEU score is then computed for this translation.

# The same process is repeated for the second set of input sentences and references. The final BLEU score is a measure of how well the translations match the reference translations

In [None]:
import evaluate
bleu = evaluate.load("bleu")

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

# Translate the first input sentence
translated_output = translator(input_sentence_1)

translated_sentence = translated_output[0]['translation_text']

print("Translated:", translated_sentence)

# Calculate BLEU metric for translation quality
results = bleu.compute(predictions=[translated_sentence], references=reference_1)
print(results)

In [None]:
# Translate the input sentences, extract the translated text, and compute BLEU score
translated_outputs = translator(input_sentences_2)

translated_outputs

In [None]:
predictions = [translated_output['translation_text'] for translated_output in translated_outputs]

predictions

In [None]:
results = bleu.compute(predictions=predictions, references=references_2)
print(results)

In [None]:
# !pip install trl
from trl import PPOTrainer, PPOConfig, create_reference_model, AutoModelForCausalLMWithValueHead

model = AutoModelForCausalLMWithValueHead.from_pretrained('sshleifer/tiny-gpt2')

model

In [None]:
# Instantiate a reference model

# When you call create_reference_model(model), it creates a copy of the model and freezes its parameters. This means that the weights of the reference model will not be updated during training.
# This reference model is then used to compare with the updated model at each step of the training process. The idea is to ensure that the policy (i.e., the behavior of the model) does not change too drastically from one update to the next

model_ref = create_reference_model(model)

model_ref

In [None]:
# To check if the parameters of a model are frozen, you can iterate over the parameters and check their requires_grad attribute. Here’s a small function that can do this:

def check_if_frozen(model):
    """Print, for each named parameter of ``model``, whether it is frozen.

    A parameter is reported as frozen when its ``requires_grad`` attribute is
    falsy, and as not frozen otherwise. One line is printed per parameter, in
    the order yielded by ``model.named_parameters()``.
    """
    for param_name, parameter in model.named_parameters():
        status = "is not frozen" if parameter.requires_grad else "is frozen"
        print(f"{param_name} {status}")

In [None]:
check_if_frozen(model)

In [None]:
check_if_frozen(model_ref)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sshleifer/tiny-gpt2')

# Register a padding token if the loaded tokenizer does not define one.
# The public `pad_token` property is checked rather than the private
# `_pad_token` attribute, which is an internal detail of the tokenizer class
# and not part of its documented interface.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer

In [None]:
# Initialize trainer configuration
# this code is setting up a configuration for a PPO trainer with specific batch and mini-batch sizes. This configuration would be used when training a model using the PPO algorithm. 

ppo_config = PPOConfig(batch_size=1, mini_batch_size=1)

ppo_config

In [None]:
# Create a PPOTrainer instance
# This line sets up a PPO trainer with a specific configuration, model, reference model, and tokenizer. The trainer can then be used to train the model using the PPO algorithm. Typically, the trainer would have a method like train() that you can call to start the training process. The training process involves repeatedly sampling data, using the data to update the model, and then evaluating the performance of the model. The goal is to improve the model’s performance on some task, such as generating text. The PPO algorithm is particularly well-suited to tasks where the data is sequential or temporal in nature. It’s also known for its stability and efficiency, which makes it a popular choice for many reinforcement learning tasks.

ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

ppo_trainer

In [None]:
prompt = "Next year, I "

# Encode the prompt into token ids, returned as a PyTorch tensor
# (return_tensors="pt").
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because the following cells refer to it by this name.
input = tokenizer.encode(prompt, return_tensors="pt")

input

In [None]:
# this code is using a pretrained language model to generate a response to a given prompt. The response is calculated by feeding the encoded input into the model and then decoding the model’s output back into text. The response represents what the model thinks is the most likely continuation of the input prompt. The exact details of how the response is calculated depend on the specifics of the model and the respond_to_batch function. 

from trl.core import respond_to_batch

response  = respond_to_batch(model, input) # function to generate a response from the model. The function takes the model and the encoded input as arguments.

response

In [None]:
# In a more complex scenario, you might want to design a reward function that gives higher rewards for better responses and lower rewards for worse ones. This would require a way to evaluate the quality of the responses, which could be based on various factors such as the relevance of the response to the input, the grammatical correctness of the response, etc. This is typically the challenging part in reinforcement learning - designing a good reward function.

import torch
reward = [torch.tensor(1.0)]
reward

In [None]:
# Train LLM for one step with PPO
# while step() is used for a single step of training, train() is used for full-scale training over multiple epochs. The code is likely a simplified example or a debugging scenario where only a single step of training is being performed. For training a model to completion, you would generally use a train() function or similar.

train_stats = ppo_trainer.step([input[0]], [response[0]], reward) # The step function is used to perform one step of training, where the model’s parameters are updated to maximize the expected reward.

train_stats

# The train_stats dictionary contains various statistics and metrics that are calculated during the training step. Here’s a brief explanation of some of the key metrics:
# ‘objective/kl’: This is the Kullback-Leibler (KL) divergence, which measures how one probability distribution diverges from a second, expected probability distribution. In this case, it’s 0.0, indicating no divergence.
# ‘objective/logprobs’: These are the log probabilities of the actions taken by the model. They are used in the calculation of the policy gradient.
# ‘ppo/mean_scores’: This is the mean of the scores (rewards) obtained during the training step.
# ‘tokens/queries_len_mean’: This is the average length of the queries processed in the training step.
# ‘ppo/loss/policy’, ‘ppo/loss/value’, ‘ppo/loss/total’: These are the losses for the policy, value function, and the total loss respectively. The policy loss is related to how well the model is doing in terms of taking the right actions. The value loss is related to how well the model is predicting the expected future rewards.
# ‘ppo/policy/entropy’: This is the entropy of the policy. It’s a measure of the randomness of the policy. A higher entropy means the policy is more random, while a lower entropy means the policy is more deterministic.
# ‘ppo/returns/mean’: This is the mean of the returns (sum of rewards) obtained during the training step.
# ‘ppo/val/vpred’: This is the predicted value of the state by the model.
# ‘time/ppo/total’: This is the total time taken for the training step.
# The warnings about degrees of freedom being less than or equal to 0 are due to the standard deviation (std()) function being called on a dataset with insufficient size. This can happen when the batch size or mini-batch size is too small.

In [None]:
# Number of training steps
num_steps = 100

# Training loop
for i in range(1, num_steps+1):
    # Generate input and response here...
    # ...
    reward = [torch.tensor(1.0)]
    train_stats = ppo_trainer.step([input[0]], [response[0]], reward)
    if i % 10 == 0:
        print(i, 'training steps')
    
train_stats


In [None]:
# Calculate the individual toxicities, maximum toxicities, and toxicity ratios

emp_1 = ["Everyone in the team adores him",
           "He is a true genius, pure talent"]
emp_2 = ["Nobody in the team likes him",
           "He is a useless 'good-for-nothing'"]

In [None]:
from evaluate import load

toxicity_metric = load("toxicity")

toxicity_1 = toxicity_metric.compute(predictions=emp_1)
toxicity_2 = toxicity_metric.compute(predictions=emp_2)
print("Toxicities (emp. 1):", toxicity_1['toxicity'])
print("Toxicities (emp. 2): ", toxicity_2['toxicity'])

In [None]:
toxicity_1_max = toxicity_metric.compute(predictions=emp_1, aggregation="maximum")
toxicity_2_max = toxicity_metric.compute(predictions=emp_2, aggregation="maximum")
print("Maximum toxicity (emp. 1):", toxicity_1_max['max_toxicity'])
print("Maximum toxicity (emp. 2): ", toxicity_2_max['max_toxicity'])

In [None]:
toxicity_1_ratio = toxicity_metric.compute(predictions=emp_1, aggregation="ratio")
toxicity_2_ratio = toxicity_metric.compute(predictions=emp_2, aggregation="ratio")
print("Toxicity ratio (emp. 1):", toxicity_1_ratio['toxicity_ratio'])
print("Toxicity ratio (emp. 2): ", toxicity_2_ratio['toxicity_ratio'])

In [None]:
group1 = ['abc are described as loyal employees', 
          'abc are ambitious in their career expectations']
group2 = ['abc are known for causing lots of team conflicts',
          'abc are verbally violent']

In [None]:
import evaluate

# Load the regard and regard-comparison metrics

regard = evaluate.load("regard")
regard_comp = evaluate.load("regard", "compare")
# regard

In [None]:
# Compute the regard (polarities) of each group separately

polarity_results_1 = regard.compute(data=group1)
print("Polarity in group 1:\n", polarity_results_1)

polarity_results_2 = regard.compute(data=group2)
print("Polarity in group 2:\n", polarity_results_2)


In [None]:
# Compute the relative regard between the two groups for comparison

polarity_results_comp = regard_comp.compute(data=group1, references=group2)
print("Polarity comparison between groups:\n", polarity_results_comp)