# Supervised LLM Evaluation

In [1]:
!pip install openai evaluate rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.4/362.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m649.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.8 MB/s[0m eta [36m0:

In [2]:
from openai import OpenAI

import pandas as pd

from google.colab import userdata
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset stored there can be read;
# force_remount=True requests a fresh mount even if Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Load Data

We're using a Tweet sentiment analysis dataset from [Kaggle](https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset/data).

In [3]:
# Folder inside the mounted Drive that contains train.csv.
# Change this to the path to the data in your drive
path = '/content/drive/MyDrive/session_2_code'

In [4]:
# Load train.csv from the Drive folder configured in `path`.
# NOTE(review): the 'unicode_escape' codec decodes bytes as Latin-1 AND interprets
# backslash escape sequences found in the text. If plain Latin-1 decoding is all that
# is needed, 'latin-1' would be the safer codec — confirm against the raw file.
sentiment = pd.read_csv(f'{path}/train.csv', encoding='unicode_escape')

In [5]:
# Preview the first rows to see which columns the dataset provides.
sentiment.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


We only need the text and the label.

In the interest of simplicity, we'll only keep the positive and negative sentiment Tweets.

In [6]:
# Keep only the positive/negative rows, and only the two columns used downstream.
is_binary_label = sentiment['sentiment'].isin(['positive', 'negative'])
sentiment = sentiment.loc[is_binary_label, ['text', 'sentiment']]

### Classification using GPT-3.5 Turbo

In [7]:
# The API key is read from the Colab secret named OPENAI_API_KEY, so it is never
# written into the notebook itself.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))  # Add OPENAI_API_KEY to your Colab secrets first

In [25]:
# OpenAI chat model used for every classification request in this notebook.
model = 'gpt-3.5-turbo'

Let's look at a single example

In [9]:
# First Tweet left after the positive/negative filter (selected by position, not by index label).
tweet = sentiment['text'].values[0]

In [10]:
# Show the raw text of the example Tweet.
tweet

' Sooo SAD I will miss you here in San Diego!!!'

In [11]:
# Ground-truth label stored for that same first Tweet.
sentiment['sentiment'].values[0]

'negative'

In [26]:
# Prompt for the single example Tweet.
# Built from explicit lines so that no stray quote character or accidental
# leading/trailing whitespace is sent to the model, and worded identically to the
# prompt used for the batch run ("the string 'positive' or the string 'negative'").
full_prompt = (
    "Given the text of this Tweet, classify it into either positive or negative sentiment.\n"
    "Return only the string 'positive' or the string 'negative' with no other explanations or extra text.\n"
    "\n"
    f"Tweet: {tweet}\n"
    "sentiment:"
)

# One chat-completion request: a minimal system message plus the prompt as the user turn.
completion = client.chat.completions.create(
    model=model,
    temperature=0,  # minimise sampling randomness so the classification is repeatable
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},  # Can keep this simple
        {"role": "user", "content": full_prompt},
    ],
)

In [27]:
# Inspect the full response object (choices, resolved model name, token usage, ...).
completion

ChatCompletion(id='chatcmpl-9xENUeRratuaVK7czpP0aV4TCondC', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='positive', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1723903648, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=85, total_tokens=86))

In [28]:
# The predicted label is the message text of the first returned choice.
completion.choices[0].message.content

'positive'

Now, let's run 20 examples so we can calculate metrics

In [29]:
# Number of Tweets to classify; the ground-truth labels used for the metrics must
# cover the same leading rows of `sentiment`.
N_EXAMPLES = 20

# Prompt template shared by every request. Written as explicit lines so that source
# indentation and stray quote characters do not leak into the text sent to the model.
SENTIMENT_PROMPT = (
    "Given the text of this Tweet, classify it into either positive or negative sentiment.\n"
    "Return only the string 'positive' or the string 'negative' with no other explanations or extra text.\n"
    "\n"
    "Tweet: {tweet}\n"
    "sentiment:"
)


def _classify_tweet(tweet_text):
    """Ask the chat model for the sentiment of one Tweet.

    Args:
        tweet_text: Raw Tweet text to classify.

    Returns:
        The model's reply, lower-cased and stripped of surrounding whitespace,
        quotes and full stops, so that replies such as "Positive." or "'negative'"
        still compare equal to the dataset labels. An empty string is returned if
        the response carries no message content.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=0,  # minimise sampling randomness so the metrics are repeatable
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},  # Can keep this simple
            {"role": "user", "content": SENTIMENT_PROMPT.format(tweet=tweet_text)},
        ],
    )
    reply = response.choices[0].message.content or ""
    return reply.strip().strip("'\".").lower()


classifications = []

# A distinct loop variable keeps the single-example `tweet`, `full_prompt` and
# `completion` from earlier cells intact when this cell is run.
for i, tweet_text in enumerate(sentiment['text'].values[:N_EXAMPLES]):
    if i % 5 == 0:
        print(f"Processing Tweet {i}.")
    classifications.append(_classify_tweet(tweet_text))

print("Done.")

Processing Tweet 0.
Processing Tweet 5.
Processing Tweet 10.
Processing Tweet 15.
Done.


In [30]:
# Ground-truth labels for exactly the Tweets that were classified. Slicing by
# len(classifications) rather than repeating the literal 20 keeps predictions and
# labels the same length if the number of classified Tweets is changed.
true_labels = sentiment['sentiment'].values[:len(classifications)]

#### Traditional ML Classification Metrics

**precision**: Of all of the cases the model classified as positive, what % actually were? In this case, of all Tweets predicted to have positive sentiment, what % were actually positive? Good when the cost of false positives is high.

**recall**: Of all the positive cases, what % were identified by the model? In this case, of all positive Tweets in the dataset, what % did the model correctly classify as positive? Good when the cost of false negatives is high.

**F1**: Harmonic mean of precision and recall. Good when the cost of false positives and false negatives is similar.

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [31]:
# Compute all three scores with 'positive' as the positive class.
precision, recall, f1 = (
    metric_fn(true_labels, classifications, pos_label='positive')
    for metric_fn in (precision_score, recall_score, f1_score)
)

In [32]:
# Report the three scores, one per line.
print(
    f"Precision: {precision}.",
    f"Recall: {recall}.",
    f"F1: {f1}.",
    sep="\n",
)

Precision: 0.875.
Recall: 1.0.
F1: 0.9333333333333333.


### Perplexity

These values represent how "surprised" the model is by each Tweet. Lower perplexity values indicate that the model found the sequence of words in the Tweet more predictable based on the training data. Higher values indicate that the Tweet's word sequence was less predictable.

In [34]:
import evaluate

# Load the perplexity metric from the `evaluate` library; module_type="metric"
# states explicitly which kind of evaluation module is requested.
perplexity = evaluate.load("perplexity", module_type="metric")

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

In [35]:
# Score the first five Tweets using GPT-2 as the language model. This model is
# separate from the OpenAI chat model used for classification earlier.
# NOTE(review): add_start_token=False presumably means no start token is prepended,
# so the first token of each Tweet is not scored against any context — confirm in
# the metric's documentation.
results = perplexity.compute(model_id='gpt2',
                             add_start_token=False,
                             predictions=sentiment['text'].values[:5])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
# Show the per-Tweet perplexities and their mean.
results

{'perplexities': [176.47877502441406,
  338.73779296875,
  1834.3751220703125,
  198.1342315673828,
  164.44015502929688],
 'mean_perplexity': 542.4332153320313}

Tweet with Perplexity 176.48: This tweet is relatively predictable according to the model.

Tweet with Perplexity 1834.38: This tweet is highly unpredictable, i.e. its word sequence is very different from what the model has seen in its training data.

In [37]:
# Tweet at position 0 — the one with the relatively low perplexity (about 176).
sentiment['text'].values[0]

' Sooo SAD I will miss you here in San Diego!!!'

In [39]:
# Tweet at position 2 — the one with the very high perplexity (about 1834).
sentiment['text'].values[2]

' what interview! leave me alone'

### BLEU

BLEU (Bilingual Evaluation Understudy) measures the quality of machine-generated text by comparing it to one or more reference texts.

BLEU calculates the similarity between a candidate text and reference text by using a modified form of precision. It considers n-grams (continuous sequences of n items) in the candidate text and checks how many of them appear in the reference text.

Higher BLEU scores suggest a better text.

Components:
- n-gram precision: Measures how many n-grams in the candidate text appear in the reference text. This is the number of n-gram matches / the total number of n-grams.
- Brevity penalty (BP): Penalizes short candidate text to avoid the score inflation due to very short but correct text.

In the interest of time we won't go into the full calculation, but luckily this can easily be done for us using nltk!

In [57]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Original: "The quick brown fox jumps over the lazy dog while the energetic cat runs under the swift bird."

# Reference paraphrases, tokenized on whitespace (one token list per reference).
references = [
    sentence.split()
    for sentence in (
        "The fast brown fox leaps over the lazy dog as the energetic cat runs beneath the swift bird",
        "The quick brown fox jumps over the lazy dog while the lively cat sprints under the swift bird",
        "The swift brown fox vaults over the lazy dog as the energetic cat dashes beneath the swift bird",
    )
]

# Candidate outputs; each one is scored against all of the references together.
machine_translations = [
    sentence.split()
    for sentence in (
        "The fast brown fox jumps over the lazy dog while the energetic cat runs under the fast bird",
        "The quick brown fox leaps over the lazy dog as the lively cat runs under the swift bird",
        "The swift brown fox jumps over the lazy dog as the energetic cat runs below the swift bird",
    )
]

# Smoothing (method1) avoids a zero score when some n-gram order has no matches.
smoothing = SmoothingFunction().method1

for idx, candidate in enumerate(machine_translations):
    score = sentence_bleu(references, candidate, smoothing_function=smoothing)
    print(f"Machine Translation {idx + 1} BLEU score: {score:.2f}")

Machine Translation 1 BLEU score: 0.73
Machine Translation 2 BLEU score: 0.78
Machine Translation 3 BLEU score: 0.82


The smoothing function in BLEU score calculation helps address the problem of zero counts in n-gram matching. When comparing machine translations to reference translations, it is common for higher-order n-grams (e.g., trigrams, four-grams) to have zero matches, especially with shorter sentences or when the machine translation is not very close to the reference. This can lead to a BLEU score of zero, which is often overly harsh and uninformative.

### ROUGE


ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of summaries and machine-generated translations by comparing them to reference summaries or translations. Unlike BLEU, which focuses on precision, ROUGE focuses on recall. The most commonly used ROUGE metrics are:

- ROUGE-N: Measures n-gram recall between a candidate summary and a set of reference summaries.
  - For example, ROUGE-1 measures the recall of unigrams (individual words), while ROUGE-2 measures the recall of bigrams (pairs of words).
  - Good for short summaries
- ROUGE-L: Measures the longest common subsequence (LCS) between a candidate summary and reference summaries.
  - Good for longer summaries
- ROUGE-S: Measures the overlap of skip-bigrams between the candidate and reference summaries.
  - Good for creative texts or flexible phrasing, since it allows for flexible word order while maintaining content accuracy.

In [58]:
from rouge_score import rouge_scorer

# The reference (target) summary and the candidate summary to score against it
reference_summary = "The quick brown fox jumps over the lazy dog."
candidate_summary = "The quick brown fox leaps over a lazy dog."

# Build a scorer for unigram, bigram and longest-common-subsequence overlap,
# with stemming enabled, then score the candidate against the reference
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
scores = scorer.score(reference_summary, candidate_summary)

# Print one line per ROUGE variant
for heading, rouge_type in zip(("ROUGE-1:", "ROUGE-2:", "ROUGE-L:"), rouge_types):
    print(heading, scores[rouge_type])

ROUGE-1: Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778)
ROUGE-2: Score(precision=0.5, recall=0.5, fmeasure=0.5)
ROUGE-L: Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778)


ROUGE-1: Measures the overlap of unigrams (individual words). In this case, the precision, recall, and F-measure are all 0.78. So 78% of the words in the candidate summary are also in the reference summary, and 78% of the words in the reference summary are also in the candidate summary.

ROUGE-2: Measures the overlap of bigrams (pairs of words). The precision, recall, and F-measure are 0.50. So, 50% of the bigrams in the candidate summary are also in the reference summary and vice versa.

ROUGE-L: Measures the longest common subsequence (LCS) of words shared by the two summaries. The precision, recall, and F-measure are all 0.78. So, the LCS covers 78% of the words in the candidate summary and 78% of the words in the reference summary.