# Accuracy

In [7]:
from transformers import pipeline
from sklearn.metrics import accuracy_score

sentiment_analysis = pipeline("sentiment-analysis")

test_examples = [
    {"text":'I love this product',"label":1},
    {"text":'I hate this product',"label":0},
    {"text":'I like this product',"label":1},
    {"text":'I do not love this product',"label":0}
]

predictions = sentiment_analysis(
    [example['text'] for example in test_examples]
)
#print(example['text'])

true_labels = [example['label'] for example in test_examples]
predicted_labels = [1 if pred['label'] =="POSITIVE"
                    else 0 for pred in predictions]

accuracy= accuracy_score(true_labels, predicted_labels)

# prints results
for example, pred_label in zip(test_examples, predicted_labels):
    print(f"Text: {example['text']} , prediction: {pred_label}")

print(f"Accuracy: {accuracy:.2%}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Text: I love this product , prediction: 1
Text: I hate this product , prediction: 0
Text: I like this product , prediction: 1
Text: I do not love this product , prediction: 0
Accuracy: 100.00%


# Face Hugging Metrics --> Metric, Comparison and Measurement

In [8]:
# Metric --> to evaluate model performance based on ground true

In [9]:
import evaluate

accuracy = evaluate.load('accuracy')
print(accuracy.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative



In [10]:
print(evaluate.load('f1').description)


The F1 score is the harmonic mean of the precision and recall. It can be computed with the equation:
F1 = 2 * (precision * recall) / (precision + recall)



In [20]:
print(accuracy.features) # give you info about the inputs needed

{'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}


In [23]:
f1 = evaluate.load("f1")
print(f1.features) # give you info about the inputs needed

Using the latest cached version of the module from C:\Users\bciez\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--f1\0ca73f6cf92ef5a268320c697f7b940d1030f8471714bffdb6856c641b818974 (last modified on Sat Jun 29 11:00:08 2024) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


{'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}


In [25]:
pearson_corr= evaluate.load("pearsonr")
print(pearson_corr.features)

{'predictions': Value(dtype='float32', id=None), 'references': Value(dtype='float32', id=None)}


In [29]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load("f1")
real_labels = [0,1,0,1,1]
predicted_labels = [0,0,0,1,1]

In [31]:
print(accuracy.compute(references=real_labels,predictions=predicted_labels))
print(precision.compute(references=real_labels,predictions=predicted_labels))
print(recall.compute(references=real_labels,predictions=predicted_labels))
print(f1.compute(references=real_labels,predictions=predicted_labels))

{'accuracy': 0.8}
{'precision': 1.0}
{'recall': 0.6666666666666666}
{'f1': 0.8}


# LLM tasks and their metrics

## text classification  

--> accuracy, f1 score

## text generation 
--> Perplexity, and BLEU score

## Summarization 
--> ROUGE score and BLEU score

## Translation 
--> BLEU score and METEOR

## Q & A 
--> Exact Match (EM) and BLEU /ROUGE

In [18]:
# Comparison --> compare two models

In [35]:
example

{'text': 'I do not love this product', 'label': 0}

In [3]:
# evaluation --> evaluate and get insight from language datasets

In [4]:
# Perplexity ---> text generation

model_name = 'gpt2'

'''
'''

prompt = "Lates research findings in antarctica shows"
prompt_ids = tokenizer.encode(test_sentense, return_tensors="pt")
output = model.generate(prompt_ids, max_length=17)
generated_text = tokenizer.decode(
    output[0], skip_special_tokens=True
)

print(generated_text)

perplexity = evaluate.load('perplexity',module_type="metric")
results = perplexity.compute(model_id='gpt2', predictions= generated_text)

print(results[mean_perplexity])

In [14]:
# ROUGE score --> text summarization

rouge = evaluate.load('rouge')

predictions = ['I liked the game']

references = ['it was a good game']

results = rouge.compute(predictions=predictions, references= references)

print(results)

{'rouge1': 0.22222222222222224, 'rouge2': 0.0, 'rougeL': 0.22222222222222224, 'rougeLsum': 0.22222222222222224}


In [19]:
# BLEU score --> translation

import evaluate
from transformers import pipeline

bleu = evaluate.load('bleu')

translator = pipeline('translation',model="Helsinki-NLP/opus-mt-es-en")
input="Que hermoso dia"
references = [["what a gergeous day","what a beautiful day"]]

translated_outputs = translator(input)
translated_sentence = translated_outputs[0]['translation_text']
print('translation: ', translated_sentence)

translation:  What a beautiful day.


In [20]:
results = bleu.compute(predictions=[translated_sentence], references=references)
print(results)

{'bleu': 0.0, 'precisions': [0.6, 0.5, 0.3333333333333333, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.25, 'translation_length': 5, 'reference_length': 4}


In [22]:
# Meteor Score in translation --> improved alternative to bleu

bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')

pred = ['what a beautiful day is today']

ref =["What a nice days"]

results_b = bleu.compute(predictions=pred,references=ref)
results_m = meteor.compute(predictions=pred, references=ref)

print('bleu: ',results_b['bleu'])
print('Meteor: ', results_m['meteor'])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bciez\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bciez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bciez\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


bleu:  0.0
Meteor:  0.6084656084656085


In [26]:
# Exact Match (EM) --> for questions answering

from evaluate  import load
em_metric = load('exact_match')

exact_match = evaluate.load("exact_match")
predictions = ["the cat sat on the mat",
               "theathers are great",
               'it is like comparing oranges and apples']
references = ['the cat sat on the mat?',
              'theathers are great',
              'it is like comparing apples and oranges']

results = exact_match.compute(references=references, predictions=predictions)
print(results)

{'exact_match': 0.3333333333333333}


In [None]:
# Load the regard and regard-comparison metrics
regard = evaluate.load("regard")
#regard_comp = regard("regard", "compare")

# Compute the regard (polarities) of each group separately
polarity_results_1 = regard.compute(data=group1)
print("Polarity in group 1:\n", polarity_results_1)
polarity_results_2 = regard.compute(data=group2)
print("Polarity in group 2:\n", polarity_results_2)

# Compute the relative regard between the two groups for comparison
polarity_results_comp = regard.compute(predictions=group1, references=group2)
print("Polarity comparison between groups:\n", polarity_results_comp)