# Exercício 7_8: Análise de Sentimento por Engenharia de Prompts

**Nome:** Caio Petrucci dos Santos Rosa

**RA:** 248245

# Bibliotecas e pacotes

In [None]:
!pip install -q groq
!pip install -q datasets

In [None]:
from datasets import load_dataset
from google.colab import userdata
from groq import Groq, RateLimitError
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from typing import Optional, List

import threading
import time
import random

In [None]:
# Seed Python's global RNG; GroqSentimentInterface draws from it (random.choice)
# when the model's answer is ambiguous.
# NOTE(review): predictions run in several worker threads, so the order of those
# draws may still vary between runs - confirm if exact reproducibility matters.
random.seed(42)

# Dataset e pré-processamento dos dados

In [None]:
# Two index windows of 463 rows each. Judging by the names and by the 463/463
# split obtained after filtering the training subset, the first window holds
# negative reviews and the second positive ones.
negative_samples_range = [*range(463, 926)]
positive_samples_range = [*range(12963, 13426)]

# Row indices selected from each IMDB split (negative window first).
samples_range = [*negative_samples_range, *positive_samples_range]

In [None]:
# Load both IMDB splits, then apply the same row selection and shuffle seed to each.
imdb_train_split = load_dataset('imdb', split='train')
imdb_test_split = load_dataset('imdb', split='test')

train_data = imdb_train_split.select(samples_range).shuffle(seed=42)
test_data = imdb_test_split.select(samples_range).shuffle(seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the training subset summary (feature names and row count).
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 926
})

In [None]:
# Display the test subset summary (feature names and row count).
test_data

Dataset({
    features: ['text', 'label'],
    num_rows: 926
})

In [None]:
# Print every field of the first training sample, one per line.
first_train_sample = train_data[0]

print('Exemplo de amostra no conjunto de treinamento:\n')

for key in first_train_sample:
    value = first_train_sample[key]
    print(f'\t{key}: \t {value}')

Exemplo de amostra no conjunto de treinamento:

	text: 	 This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
	label: 	 0


In [None]:
# Collect the distinct values of the 'label' column of the training subset.
unique_labels = set(train_data['label'])

print('Labels possíveis:\n')
print(unique_labels)

Labels possíveis:

{0, 1}


In [None]:
# Textual name for each numeric label.
label_mapping = { 0: 'negative', 1: 'positive' }


def sample_preprocessing(sample):
    '''
    Builds the fields to write back for one sample.

    Args:
        sample: mapping with at least the 'text' and 'label' keys.

    Returns:
        dict: the original 'text' plus 'label_txt', the textual form of 'label'.
    '''
    label_txt = label_mapping[sample['label']]
    return { 'text': sample['text'], 'label_txt': label_txt }


# Add the textual label column to both subsets.
train_data = train_data.map(sample_preprocessing)
test_data = test_data.map(sample_preprocessing)

In [None]:
# Print every field of the first training sample again, now including 'label_txt'.
first_train_sample = train_data[0]

print('Exemplo de amostra no conjunto de treinamento:\n')

for key in first_train_sample:
    value = first_train_sample[key]
    print(f'\t{key}: \t {value}')

Exemplo de amostra no conjunto de treinamento:

	text: 	 This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
	label: 	 0
	label_txt: 	 negative


In [None]:
def _is_positive_sample(sample):
    '''Tells whether a sample carries the textual label "positive".'''
    return sample['label_txt'] == 'positive'


# Training samples labelled positive (source of the positive few-shot example).
positive_train_data = train_data.filter(_is_positive_sample)
positive_train_data

Dataset({
    features: ['text', 'label', 'label_txt'],
    num_rows: 463
})

In [None]:
def _is_negative_sample(sample):
    '''Tells whether a sample carries the textual label "negative".'''
    return sample['label_txt'] == 'negative'


# Training samples labelled negative (source of the negative few-shot example).
negative_train_data = train_data.filter(_is_negative_sample)
negative_train_data

Dataset({
    features: ['text', 'label', 'label_txt'],
    num_rows: 463
})

# Interface com a API do GROQ

In [None]:
# Code inspired by Elton Cardoso do Nascimento's code

class GroqInterface:
    '''
    Interface for using the Groq API

    Implements a rate limit control for multi-threading use: all instances
    share one client and one lock, and the lock acts as a gate that holds
    back every calling thread while a rate-limit back-off is in progress.
    '''

    # Groq client, shared by all instances and created by the first one
    _client = None

    # parameter documentation at: https://console.groq.com/docs/text-chat
    _model_name = "llama3-8b-8192"
    _context_size = 8192
    _temperature = 0
    _top_p = 1
    _stop = None
    _stream = False

    # Seconds to wait after a RateLimitError before requests resume
    _rate_limit_backoff = 1.5

    # Last RateLimitError raised by the API in any thread (None if there was none)
    error = None

    # Mutex lock, held only while a back-off sleep is running
    _rate_lock = threading.Lock()

    def __init__(self):
        '''
        GroqInterface constructor.

        Creates the shared client on first use, reading the 'GROQ_API_KEY'
        secret through `userdata.get`.

        Raises:
            RuntimeError: if `userdata.get` returns None for the key.
        '''
        if GroqInterface._client is None:
            # NOTE(review): userdata.get may itself raise when the secret is
            # missing instead of returning None - confirm against the Colab docs.
            api_key = userdata.get('GROQ_API_KEY')
            if api_key is None:
                raise RuntimeError("API key is not in the environment variables ('GROQ_API_KEY' variable is not set).")

            GroqInterface._client = Groq(api_key=api_key)

    def __call__(self, prompt: str) -> str:
        '''
        Generates the model response

        The request is retried, with no retry limit, every time the API
        answers with a RateLimitError; any other exception propagates.

        Args:
            prompt (str): prompt to send to the model (sent as a single
                "system" message).

        Returns:
            str: model response.
        '''

        while True:
            # Gate: block here while another thread is sleeping off a rate limit.
            with GroqInterface._rate_lock:
                pass

            try:
                chat_completion = GroqInterface._client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": prompt,
                        }
                    ],
                    model=self._model_name,
                    temperature=self._temperature,
                    max_tokens=self._context_size,
                    top_p=self._top_p,
                    stop=self._stop,
                    stream=self._stream,
                )
            except RateLimitError as exception:
                GroqInterface.error = exception
                # Atomic test-and-acquire: exactly one thread performs the
                # back-off sleep. Threads that lose the race skip it and wait
                # at the gate above, instead of queueing up to sleep in turn.
                if GroqInterface._rate_lock.acquire(blocking=False):
                    try:
                        time.sleep(self._rate_limit_backoff)
                    finally:
                        GroqInterface._rate_lock.release()
                continue

            return chat_completion.choices[0].message.content


In [None]:
# Code inspired by Elton Cardoso do Nascimento's code

class GroqSentimentInterface(GroqInterface):
    '''
    GroqInterface with sentiment analysis post-processing.
    '''

    def __call__(self, prompt: str) -> str:
        '''
        Generates the model response for sentiment analysis.

        Only the last 18 characters of the lowercased response are inspected.
        If that tail names exactly one of the two labels, that label is
        returned; otherwise (both or neither present) a random one is drawn.

        Args:
            prompt (str): prompt to send to the model.

        Returns:
            str: "positive" if the answer is positive, "negative" if it is
            negative, and a random choice between the two if unsure.
        '''

        response = super().__call__(prompt).lower()
        # Slicing already copes with responses shorter than 18 characters.
        last_bit_of_response = response[-18:]

        mentions_positive = "positive" in last_bit_of_response
        mentions_negative = "negative" in last_bit_of_response

        # Unambiguous only when exactly one of the labels shows up.
        if mentions_positive != mentions_negative:
            return "positive" if mentions_positive else "negative"

        return random.choice(["positive", "negative"])

# Realizando *completions* com Llama 3 pela API do GROQ

In [None]:
# Single shared instance used by every prediction below. Constructing it reads
# the 'GROQ_API_KEY' secret and creates the Groq client if none exists yet.
groq_sentiment_interface = GroqSentimentInterface()

In [None]:
# Smoke test of the interface on the first training sample.
sample = train_data[0]

# Template with a single {sample_text} placeholder. The second line must not
# start with a double quote: an unbalanced quote would be sent to the model.
prompt = """You are a assistant responsible for classifying a movie review as either "positive" or "negative".
You must only say these two words in your response and remember to keep them in lowercase.

Movie Review:
{sample_text}

Classification:"""
# Wrap the template so it is filled by calling prompt(sample_text=...).
prompt = partial(prompt.format)

prediction = groq_sentiment_interface(prompt(sample_text=sample['text']))

print(f'Sample text:\n\t{sample["text"]}')
print(f'Ground truth label:\n\t{sample["label_txt"]}')
print(f'Predicted label:\n\t{prediction}')

Sample text:
	This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
Ground truth label:
	negative
Predicted label:
	negative


# Avaliando estratégias de *prompting*

In [None]:
def make_prediction(prompt, text, label_txt, label):
    '''
    Classifies one review and checks the result against its label.

    Args:
        prompt: callable that builds the prompt from a `sample_text` keyword.
        text: review text inserted into the prompt.
        label_txt: expected textual label.
        label: numeric label; unused, accepted so a whole sample can be
            unpacked into the call.

    Returns:
        bool: True when the predicted label equals `label_txt`.
    '''
    filled_prompt = prompt(sample_text=text)
    predicted_label = groq_sentiment_interface(filled_prompt)
    return predicted_label == label_txt

In [None]:
def eval_prompt_on_dataset(dataset, prompt, max_workers=3):
    '''
    Computes the accuracy of a prompt over a dataset.

    Every sample is submitted to a thread pool as a `make_prediction` call,
    with the sample's fields passed as keyword arguments.

    Args:
        dataset: sized iterable of samples (mappings accepted by `make_prediction`).
        prompt: callable that builds the prompt from a `sample_text` keyword.
        max_workers (int): number of worker threads.
            More workers -> More RateLimit exceptions.

    Returns:
        float: fraction of correct predictions, or 0.0 for an empty dataset.
    '''
    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # Code inspired by Elton Cardoso do Nascimento's code
    # The context manager shuts the pool down, so its threads are not leaked.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(make_prediction, prompt=prompt, **data)
            for data in dataset
        ]

        try:
            correct_predictions = sum(future.result() for future in tqdm(futures))
        except BaseException:
            # On a failure or interrupt, drop the work that has not started so
            # the pool shutdown only waits for the requests already running.
            for future in futures:
                future.cancel()
            raise

    return correct_predictions / total

## Zero-shot prompting

### Construção do prompt

In [None]:
# Zero-shot template: instructions followed by the review to classify.
zs_prompt_template = """You are a assistant responsible for classifying a movie review as either "positive" or "negative".
You must only say these two words in your response, without any trailing punctuation, and remember to keep them in lowercase.

Movie Review:
{sample_text}

Classification:"""

# Callable form of the template: zs_prompt(sample_text=...) returns the filled prompt.
zs_prompt = partial(zs_prompt_template.format)

### Validação

In [None]:
# Render the template with an empty review to inspect the final prompt layout.
print(zs_prompt(sample_text=''))

You are a assistant responsible for classifying a movie review as either "positive" or "negative".
You must only say these two words in your response, without any trailing punctuation, and remember to keep them in lowercase.

Movie Review:


Classification:


In [None]:
# Try the zero-shot prompt on the first training sample.
sample = train_data[0]
zs_filled_prompt = zs_prompt(sample_text=sample['text'])
prediction = groq_sentiment_interface(zs_filled_prompt)

# Show the review next to its expected and predicted labels.
print(f'Sample text:\n\t{sample["text"]}')
print(f'Ground truth label:\n\t{sample["label_txt"]}')
print(f'Predicted label:\n\t{prediction}')

Sample text:
	This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
Ground truth label:
	negative
Predicted label:
	positive


### Avaliação da estratégia

In [None]:
# Accuracy of the zero-shot prompt over the test subset, reported as a percentage.
acc = eval_prompt_on_dataset(dataset=test_data, prompt=zs_prompt)

print('\nAcurácia utilizando Zero-shot prompting:')
print(f'\t{acc*100:.2f}')

100%|██████████| 926/926 [35:28<00:00,  2.30s/it]


Acurácia utilizando Zero-shot prompting:
	71.71





## Few-shot prompting

### Construção do prompt

In [None]:
# Few-shot template: two solved examples precede the review to classify.
fs_prompt_template = """You are a assistant responsible for classifying a movie review as either "positive" or "negative".
You must only say these two words in your response, without any trailing punctuation, and remember to keep them in lowercase.

Movie Review:
{fs_text1}

Classification:
{fs_label1}

Movie Review:
{fs_text2}

Classification:
{fs_label2}

Movie Review:
{sample_text}

Classification:"""

# The first positive and the first negative training reviews are the examples.
fs_positive_example = positive_train_data[0]
fs_negative_example = negative_train_data[0]

# Bind the examples now; only sample_text is left to fill at call time.
fs_prompt = partial(
    fs_prompt_template.format,
    fs_text1=fs_positive_example['text'],
    fs_label1=fs_positive_example['label_txt'],
    fs_text2=fs_negative_example['text'],
    fs_label2=fs_negative_example['label_txt'],
)

### Validação

In [None]:
# Render the template with an empty review to inspect the two bound examples.
print(fs_prompt(sample_text=''))

You are a assistant responsible for classifying a movie review as either "positive" or "negative".
You must only say these two words in your response, without any trailing punctuation, and remember to keep them in lowercase.

Movie Review:
"Stairway to Heaven" is a outstanding invention of movie making, probably never duplicated. I rank it with "The Wizard of Oz" and "African Queen," although it is a totally different type of movie than "African Queen." "Stairway to Heaven" is a psycho-drama that uses performance concepts and technical effects that, to my knowledge, are totally unique. <br /><br />For example, there is the combination of B&W and color footage - as in "Oz," but the significance of the contrast goes way beyond the simple - but beautiful - effect achieved in "Oz." In "Stairway" the purpose and effect of the contrast can only be described as powerful.<br /><br />Another brilliant aspect of "Stairway" is the concept of "time" and how it is used here. How could anybody have 

In [None]:
# Try the few-shot prompt on the first training sample.
sample = train_data[0]
fs_filled_prompt = fs_prompt(sample_text=sample['text'])
prediction = groq_sentiment_interface(fs_filled_prompt)

# Show the review next to its expected and predicted labels.
print(f'Sample text:\n\t{sample["text"]}')
print(f'Ground truth label:\n\t{sample["label_txt"]}')
print(f'Predicted label:\n\t{prediction}')

Sample text:
	This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
Ground truth label:
	negative
Predicted label:
	negative


### Avaliação da estratégia

In [None]:
# Accuracy of the few-shot prompt over the test subset, reported as a percentage.
acc = eval_prompt_on_dataset(dataset=test_data, prompt=fs_prompt)

print('\nAcurácia utilizando Few-shot prompting:')
print(f'\t{acc*100:.2f}')

100%|██████████| 463/463 [36:36<00:00,  4.74s/it]


Acurácia utilizando Few-shot prompting:
	63.93





## Automatic Chain-of-Thought prompting

### Construção do prompt

In [None]:
# Chain-of-thought template: asks for step-by-step reasoning that ends with the label.
acot_prompt_template = """You are a assistant responsible for classifying a movie review as either "positive" or "negative".
Let's think step by step before classifying the movie review.
But remember, the exact last token of your response must be either "positive" or "negative", removing any trailing punctuation, and you should keep it in lowercase.

Movie Review:
{sample_text}

Classification analysis:"""

# Callable form of the template: acot_prompt(sample_text=...) returns the filled prompt.
acot_prompt = partial(acot_prompt_template.format)

### Validação


In [None]:
# Render the template with an empty review to inspect the final prompt layout.
print(acot_prompt(sample_text=''))

You are a assistant responsible for classifying a movie review as either "positive" or "negative".
Let's think step by step before classifying the movie review.
But remember, the exact last token of your response must be either "positive" or "negative", removing any trailing punctuation, and you should keep it in lowercase.

Movie Review:


Classification analysis:


In [None]:
# Try the chain-of-thought prompt on the first training sample.
sample = train_data[0]
acot_filled_prompt = acot_prompt(sample_text=sample['text'])
prediction = groq_sentiment_interface(acot_filled_prompt)

# Show the review next to its expected and predicted labels.
print(f'Sample text:\n\t{sample["text"]}')
print(f'Ground truth label:\n\t{sample["label_txt"]}')
print(f'Predicted label:\n\t{prediction}')

Sample text:
	This has to be the worst movie I have seen. Madsen fans don't be drawn into this like I was. He is only in it for a maximum of five minutes. This movie is so bad that the only reason why you would watch it is if all the rest of the movies on earth as well as t.v. had been destroyed.
Ground truth label:
	negative
Predicted label:
	negative


### Avaliação da estratégia

In [None]:
# Accuracy of the chain-of-thought prompt over the test subset, reported as a percentage.
acc = eval_prompt_on_dataset(dataset=test_data, prompt=acot_prompt)

print('\nAcurácia utilizando Automatic Chain-of-Thought prompting:')
print(f'\t{acc*100:.2f}')

## Rephrase-and-Respond prompting

### Construção do prompt

In [None]:
# Rephrase-and-respond template: asks the model to restate and expand the input
# before ending its answer with the label.
rnr_prompt_template = """You are a assistant responsible for classifying a movie review as either "positive" or "negative".
Given the question below, rephrase and expand it to help you do better answering. Maintain all information in the original review.
But remember, the exact last token of your response must be either "positive" or "negative", removing any trailing punctuation, and you should keep it in lowercase.

Movie Review:
{sample_text}

Classification analysis:"""

# Callable form of the template: rnr_prompt(sample_text=...) returns the filled prompt.
rnr_prompt = partial(rnr_prompt_template.format)

### Validação

In [None]:
# Render the template with an empty review to inspect the final prompt layout.
print(rnr_prompt(sample_text=''))

In [None]:
# Try the rephrase-and-respond prompt on the first training sample.
sample = train_data[0]
rnr_filled_prompt = rnr_prompt(sample_text=sample['text'])
prediction = groq_sentiment_interface(rnr_filled_prompt)

# Show the review next to its expected and predicted labels.
print(f'Sample text:\n\t{sample["text"]}')
print(f'Ground truth label:\n\t{sample["label_txt"]}')
print(f'Predicted label:\n\t{prediction}')

### Avaliação da estratégia

In [None]:
# Evaluate the rephrase-and-respond prompt itself (rnr_prompt). Passing
# acot_prompt here would report the chain-of-thought accuracy under this title.
acc = eval_prompt_on_dataset(test_data, rnr_prompt)

print('\nAcurácia utilizando Rephrase-and-Response prompting:')
print(f'\t{acc*100:.2f}')