In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from evaluate import load  # Hugging Face 'evaluate' library

# Load the model and tokenizer (GPT-2 used as an example)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Input prompt
input_text = "Once upon a time"

# Encode the input text, generate a continuation and decode it back to a string
input_text_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_text_ids, max_length=20)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)

# Load and compute the perplexity score.
# `predictions` must be a list of texts; a bare string is not the documented
# input and would not be scored as one text, so the single generated text is
# wrapped in a list. The scoring model is the same checkpoint used for
# generation, so `model_name` is reused.
perplexity = load("perplexity", module_type="metric")
results = perplexity.compute(model_id=model_name, predictions=[generated_text])
print("Perplexity: ", results['mean_perplexity'])


In [None]:
from evaluate import load

# BLEU scorer; also used by the batch-translation cell that follows.
bleu = load("bleu")

# Single-sentence example: one Spanish source and its acceptable English references.
input_sentence_1 = "Hola, ¿cómo estás?"
reference_1 = [["Hello, how are you?", "Hi, how are you?"]]

# Two-sentence example (consumed by a later cell): one list of references per source sentence.
input_sentences_2 = [
    "Hola, ¿cómo estás?",
    "Estoy genial, gracias.",
]
references_2 = [
    ["Hello, how are you?", "Hi, how are you?"],
    ["I'm great, thanks.", "I'm great, thank you."],
]

# Spanish -> English translation pipeline.
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-es-en")

# Translate the first sentence and pull the text out of the pipeline's result list.
translated_output = translator(input_sentence_1)
translated_sentence = translated_output[0]["translation_text"]
print("Translated:", translated_sentence)

# Score the translation against its references with BLEU.
results = bleu.compute(
    predictions=[translated_sentence],
    references=reference_1,
)
print(results)

In [None]:
# Batch variant: translate every sentence in `input_sentences_2`, keep only the
# translated strings, and score them against `references_2` with BLEU.
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-es-en")

translated_outputs = translator(input_sentences_2)

# Each pipeline result is a dict; the translated string sits under "translation_text".
predictions = [entry["translation_text"] for entry in translated_outputs]
print(predictions)

results = bleu.compute(
    predictions=predictions,
    references=references_2,
)
print(results)

In [1]:
import evaluate

# ROUGE metric, used here to compare a generated summary with a reference summary.
rouge = evaluate.load("rouge")

# One predicted summary and the reference it is compared against.
predictions = [
    "Pluto is a dwarf planet in our solar system, located in the Kuiper Belt beyond Neptune, and was formerly considered the ninth planet until its reclassification in 2006."
]
references = [
    "Pluto is a dwarf planet in the solar system, located in the Kuiper Belt beyond Neptune, and was previously deemed as a planet until it was reclassified in 2006."
]

# Score the prediction against the reference and show every ROUGE variant returned.
results = rouge.compute(
    predictions=predictions,
    references=references,
)
print("ROUGE results: ", results)

  from .autonotebook import tqdm as notebook_tqdm


ROUGE results:  {'rouge1': np.float64(0.7719298245614034), 'rouge2': np.float64(0.6181818181818182), 'rougeL': np.float64(0.736842105263158), 'rougeLsum': np.float64(0.736842105263158)}


In [3]:
# Optional one-time setup: uncomment the lines below to install packages into
# the active kernel environment. `evaluate` is imported by this notebook; the
# second line presumably covers extra requirements of the ROUGE/METEOR metrics
# (TODO confirm against the metric cards).
#%pip install evaluate
#%pip install nltk rouge_score absl-py

In [2]:
# METEOR metric: compares a generated paraphrase with a reference passage.
meteor = evaluate.load("meteor")

generated = [
    "The burrow stretched forward like a narrow corridor for a while, then plunged abruptly downward, so quickly that Alice had no chance to stop herself before she was tumbling into an extremely deep shaft."
]
reference = [
    "The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well."
]

# Score the generated text against the reference and print the METEOR value.
results = meteor.compute(
    predictions=generated,
    references=reference,
)
print("Meteor: ", results["meteor"])

Downloading builder script: 100%|██████████| 7.02k/7.02k [00:00<?, ?B/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andre\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Andre\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Andre\AppData\Roaming\nltk_data...


Meteor:  0.37180012567275916


In [3]:
# Exact-match metric: compares each prediction with the reference at the same position.
exact_match = evaluate.load("exact_match")

predictions = [
    "It's a wonderful day",
    "I love dogs",
    "DataCamp has great AI courses",
    "Sunshine and flowers",
]
references = [
    "What a wonderful day",
    "I love cats",
    "DataCamp has great AI courses",
    "Sunsets and flowers",
]

# Compute the exact-match score over the four pairs and print the result dict.
results = exact_match.compute(
    predictions=predictions,
    references=references,
)
print("EM results: ", results)

Downloading builder script: 100%|██████████| 5.67k/5.67k [00:00<00:00, 6.04MB/s]

EM results:  {'exact_match': np.float64(0.25)}





In [4]:
# Two sets of user comments to score with the toxicity measurement.
user_1 = ['Everyone that tried it love it', 'This artist is a true genius, pure talent']
user_2 = ["Nobody i've talked to likes this product", 'Terrible singer']
toxicity_metric = evaluate.load("toxicity")

# Calculate the individual toxicities (one score per comment)
toxicity_1 = toxicity_metric.compute(predictions=user_1)
toxicity_2 = toxicity_metric.compute(predictions=user_2)
print("Toxicities (user_1):", toxicity_1['toxicity'])
print("Toxicities (user_2): ", toxicity_2['toxicity'])

# Calculate the maximum toxicities (a single aggregated score per user)
toxicity_1_max = toxicity_metric.compute(predictions=user_1, aggregation="maximum")
toxicity_2_max = toxicity_metric.compute(predictions=user_2, aggregation="maximum")
print("Maximum toxicity (user_1):", toxicity_1_max['max_toxicity'])
print("Maximum toxicity (user_2): ", toxicity_2_max['max_toxicity'])

# Calculate the toxicity ratios (a single aggregated ratio per user).
# The second call must score user_2's comments; it previously re-scored user_1,
# so the value printed for user_2 was really user_1's ratio.
toxicity_1_ratio = toxicity_metric.compute(predictions=user_1, aggregation="ratio")
toxicity_2_ratio = toxicity_metric.compute(predictions=user_2, aggregation="ratio")
print("Toxicity ratio (user_1):", toxicity_1_ratio['toxicity_ratio'])
print("Toxicity ratio (user_2): ", toxicity_2_ratio['toxicity_ratio'])

Downloading builder script: 100%|██████████| 6.08k/6.08k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


Toxicities (user_1): [0.00013486345415003598, 0.00013348401989787817]
Toxicities (user_2):  [0.0001355926360702142, 0.00013771136582363397]
Maximum toxicity (user_1): 0.00013486345415003598
Maximum toxicity (user_2):  0.00013771136582363397
Toxicity ratio (user_1): 0.0
Toxicity ratio (user_2):  0.0


In [5]:
# Two groups of sentences about the same placeholder subject, to be compared by regard.
group1 = [
    'abc are described as loyal',
    'abc are honest but kind',
]
group2 = [
    'abc are known for being confrontational',
    'abc are very blunt',
]

# Load the regard measurement twice: the default configuration scores one group,
# the "compare" configuration contrasts two groups.
regard = evaluate.load("regard")
regard_comp = evaluate.load("regard", "compare")

# Regard (polarity) of each group on its own.
polarity_results_1 = regard.compute(data=group1)
print("Polarity in group 1:\n", polarity_results_1)

polarity_results_2 = regard.compute(data=group2)
print("Polarity in group 2:\n", polarity_results_2)

# Relative regard of group1 with group2 as the reference group.
polarity_results_comp = regard_comp.compute(
    data=group1,
    references=group2,
)
print("Polarity comparison between groups:\n", polarity_results_comp)

Downloading builder script: 100%|██████████| 8.41k/8.41k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0
Device set to use cuda:0


Polarity in group 1:
 {'regard': [[{'label': 'neutral', 'score': 0.9586172103881836}, {'label': 'negative', 'score': 0.020241964608430862}, {'label': 'positive', 'score': 0.014409039169549942}, {'label': 'other', 'score': 0.006731739267706871}], [{'label': 'positive', 'score': 0.83542400598526}, {'label': 'other', 'score': 0.12411251664161682}, {'label': 'neutral', 'score': 0.03053131513297558}, {'label': 'negative', 'score': 0.009932112880051136}]]}
Polarity in group 2:
 {'regard': [[{'label': 'negative', 'score': 0.9745951890945435}, {'label': 'other', 'score': 0.017152613028883934}, {'label': 'neutral', 'score': 0.007746343966573477}, {'label': 'positive', 'score': 0.0005058045499026775}], [{'label': 'neutral', 'score': 0.7666088938713074}, {'label': 'negative', 'score': 0.10047446191310883}, {'label': 'positive', 'score': 0.07146850973367691}, {'label': 'other', 'score': 0.061448223888874054}]]}
Polarity comparison between groups:
 {'regard_difference': {'neutral': 0.10739664384163