# IndoML 2023 Tutorial: Part 2
## The Era of LLMs!

### In-context learning and Prompt Engineering


1. We will use recent LLMs like GPT-3/FLAN-T5/LLAMA to query the models in natural language to get answers/predictions.
2. These models are finetuned on instructions or human feedback, which enables them to perform a task through "prompting".
3. The best part is that we don't need to train anything to get started; direct inference from these pretrained models is enough.
    * NOTE: Although there can be methods to finetune these models on our data to get better results, we will not be covering that in this tutorial.

### Methods that we will try:

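1. BLOOMZ (the code below loads `bigscience/bloomz-3b`; FLAN-T5 and mT0 appear only as commented-out alternative checkpoints)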

## Load `dataset`

In [1]:
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_VISIBLE_DEVICES=3


In [2]:
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Load the MASSIVE dataset from the Hugging Face Hub (downloaded on first use).
dataset = load_dataset("AmazonScience/massive")


  from .autonotebook import tqdm as notebook_tqdm


## Load `AutoTokenizer` and `AutoModelForCausalLM`

In [None]:
# pip install -q transformers accelerate bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load; the commented-out lines are alternatives.
# NOTE(review): mt0 and flan-t5 look like encoder-decoder checkpoints, which
# would need AutoModelForSeq2SeqLM rather than AutoModelForCausalLM — confirm
# before switching.
# checkpoint = "bigscience/mt0-base"
checkpoint = "bigscience/bloomz-3b"
# checkpoint = "google/flan-t5-xxl"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# device_map="auto" delegates weight placement to accelerate; load_in_4bit=True
# requests 4-bit quantised weights (bitsandbytes, per the pip line above).
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", load_in_4bit=True)

## Model Specific Example of Prompt Engineering

In [None]:
# Zero-shot sanity check: ask the model for the intent of one hand-written utterance.
# The tensor is moved to the model's own device instead of assuming "cuda",
# because the model was placed with device_map="auto".
inputs = tokenizer.encode("Detect the intent class of the utterance.\nUtterance: I am going to school.; Intent:", return_tensors="pt").to(model.device)
# max_new_tokens bounds only the continuation; the default length budget also
# counts the prompt tokens and can leave no room for an answer.
outputs = model.generate(inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))

## Preprocessing

- We will create prompts for each test sample in the dataset. 
- There are a few ways to format these prompts, and this step is called "Prompt Engineering".
    - Few-shot In-context learning: Use task-description and examples
    - Zero-shot In-context learning: Use task-description only

## Exemplars for the multilingual intent-detection task

In [None]:
# Collect few-shot exemplars from the training split
import pandas as pd
df_train = dataset['train'].to_pandas()


def _one_row_per_intent(group):
    # The seed is fixed, so every intent group always yields the same row.
    return group.sample(1, random_state=42)


# One randomly chosen utterance per intent class. Rows are drawn without
# regard to language; the hope is that the model still answers with the
# English label names whatever the language of the query.
per_intent = df_train.groupby("intent").apply(_one_row_per_intent)
df_intent_samples = per_intent.reset_index(drop=True)

In [None]:
# Add formatted prompt for each sample
def int2str(x):
    """Map an integer intent id to its label name via the train split's `intent` feature."""
    return dataset['train'].features['intent'].int2str(x)

def _format_exemplar(row):
    # One exemplar line: the utterance followed by its intent label name.
    return f'Utterance: {row["utt"]}; Intent: {int2str(row["intent"])}'


df_intent_samples['example_prompt_format'] = df_intent_samples.apply(_format_exemplar, axis=1)

# Concatenate all exemplar lines, one per line, into the few-shot block
prompt_exemplars = '\n'.join(df_intent_samples['example_prompt_format'])

In [None]:
# Show the assembled exemplar block (one "Utterance: ...; Intent: ..." line per sampled row).
print(prompt_exemplars)

## Generate prompts for each query

In [None]:
# Attach a ready-to-send prompt and the gold label name to every example.
# Set few_shot to False to drop the exemplars and use the task description only.
few_shot = True


def add_prompt(example):
    """Return `example` with two extra fields.

    "prompt" is the task description, optionally followed by the exemplar
    block when the module-level `few_shot` flag is true, and then the
    example's own utterance with an open "Intent:" slot.
    "str_label" is the label name for the example's integer intent.
    """
    example["prompt"] = (
        f'# Detect intent of the input utterance.\n\n{prompt_exemplars}\nUtterance: {example["utt"]}; Intent:'
        if few_shot
        else f'# Detect intent of the input utterance.\n\nUtterance: {example["utt"]}; Intent:'
    )
    example["str_label"] = int2str(example["intent"])
    return example


extended_eval_set = dataset['validation'].map(add_prompt)

In [None]:
# Inspect the first example to check the added "prompt" and "str_label" fields.
extended_eval_set[0]

## Now let's try to predict using the LLM

In [None]:
# Run one validation example end-to-end before looping over the eval subset.
x = extended_eval_set[100]['prompt']
print(x)

# Encode the prompt and move it to the model's own device rather than assuming
# "cuda" (the model was placed with device_map="auto"). Passing the whole
# encoding also hands generate() the attention mask.
tok_x = tokenizer(x, return_tensors="pt").to(model.device)
# max_new_tokens bounds only the continuation. A fixed max_length counts the
# prompt as well, so a long few-shot prompt could exhaust it, and beam search
# could otherwise run far past the short label we need.
y = model.generate(**tok_x, num_beams=5, num_return_sequences=5, max_new_tokens=20)
# Only the first returned sequence is inspected here.
output = tokenizer.decode(y[0], skip_special_tokens=True)
print(output)

In [None]:
# Reverse lookup: intent label name -> integer class id
N_class = dataset['train'].features['intent'].num_classes
str2int = {int2str(class_id): class_id for class_id in range(N_class)}

def parse_prediction(prompt, output_txt):
    """Extract the predicted intent from the generated text.

    The first `len(prompt)` characters of `output_txt` are dropped, on the
    assumption that the decoded output starts with the prompt itself. The
    remainder up to the first newline, stripped of surrounding whitespace,
    is the predicted label text.

    Returns a `(label_id, label_text)` pair, where `label_id` is -1 when the
    text is not one of the known label names in `str2int`.
    """
    continuation = output_txt[len(prompt):]
    first_line, _, _ = continuation.partition('\n')
    label_text = first_line.strip()
    return str2int.get(label_text, -1), label_text

In [None]:
# Alphabetical list of every known intent label name
sorted(str2int)

In [None]:
# Parse the single generation produced earlier; gives (label id or -1, label text).
parse_prediction(x, output)

## Setup Evaluation Metric

In [None]:
import numpy as np
import evaluate

# Load the accuracy and F1 metric implementations from the `evaluate` library.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

# Metric callback in the shape a Trainer expects: it receives a
# (logits, labels) pair and returns a dict of scores.
def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for a `(logits, labels)` pair.

    The predicted class for each row is the argmax over the last axis of
    the logits.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric_f1.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )

## Select a smaller subset for evaluation

In [None]:
import random
# Evaluate on a random subset of the validation prompts to keep the run short.
n_eval = 600
# A dedicated, seeded generator makes the subset reproducible across runs
# without touching the global random state. min() avoids the ValueError that
# sample() raises when asked for more items than exist.
eval_indices = random.Random(42).sample(
    range(len(extended_eval_set)),
    min(n_eval, len(extended_eval_set)),
)

In [None]:
# Keep only the sampled rows as the evaluation subset.
eval_data_subset = extended_eval_set.select(eval_indices)

In [None]:
# Gather predictions for the eval set.
# `outputs[i]` is the list of decoded sequences (prompt + continuation) that
# the model returned for the i-th example, in the order generate() returned them.
outputs = []
for row in tqdm(eval_data_subset):
    # Use the model's own device instead of assuming "cuda"; passing the whole
    # encoding also supplies the attention mask.
    tok_x = tokenizer(row['prompt'], return_tensors="pt").to(model.device)
    # max_new_tokens bounds only the continuation. A fixed max_length counts
    # the prompt too and lets beam search run far past the short label.
    y = model.generate(**tok_x, num_beams=3, num_return_sequences=3, max_new_tokens=20)
    outputs.append(tokenizer.batch_decode(y, skip_special_tokens=True))

In [None]:
# Turn every decoded sequence into a (label id, label text) pair.
# `parsed_predictions[i]` holds one pair per returned sequence of example i.
parsed_predictions = []
for row, beam_texts in zip(tqdm(eval_data_subset), outputs):
    parsed_predictions.append(
        [parse_prediction(row['prompt'], text) for text in beam_texts]
    )

In [None]:
# Display the parsed (label id, label text) pairs, one inner list per example.
parsed_predictions

In [None]:
df_eval = eval_data_subset.to_pandas()
# Each entry of parsed_predictions is a list of (label id, label text) pairs,
# one per returned sequence. Score the first returned sequence only, and
# unpack its pair so the columns hold a plain string and a plain int rather
# than whole tuples.
first_beam = [beams[0] for beams in parsed_predictions]
df_eval['pred_label_str'] = [label_text for _, label_text in first_beam]
df_eval['pred_label_int'] = [label_id for label_id, _ in first_beam]

## Analyze the performance of the model

In [None]:
# Imports and plot styling for the per-locale analysis (F1 / accuracy per locale)
from sklearn.metrics import f1_score, accuracy_score
import seaborn as sns
sns.set(rc={'figure.figsize':(11, 8)}) # Default figure size for the plots that follow



In [None]:
# Share of evaluation examples per locale (bar heights sum to 1)
P = sns.histplot(df_eval['locale'], stat="probability")
P.set(xlabel="Locale", ylabel="Probability", title="Locale histogram")

# Turn the locale tick labels vertical so they do not overlap
for tick_label in P.get_xticklabels():
    tick_label.set_rotation(90)



In [None]:
def compute_f1(df):
    """Return the macro-averaged F1 of the predictions in `df`.

    `df` must have the gold class ids in column 'intent' and the predicted
    class ids in column 'pred_label_int'. Those are the columns the
    evaluation frame actually carries; it has no 'targets' or 'predictions'
    columns. Predictions that could not be parsed are stored as -1.
    NOTE(review): -1 presumably counts as its own class in the macro
    average — confirm this is the intended scoring.
    """
    return f1_score(df['intent'], df['pred_label_int'], average='macro')

def compute_acc(df):
    """Return the accuracy of the predictions in `df`.

    `df` must have the gold class ids in column 'intent' and the predicted
    class ids in column 'pred_label_int'. Those are the columns the
    evaluation frame actually carries; it has no 'targets' or 'predictions'
    columns.
    """
    return accuracy_score(df['intent'], df['pred_label_int'])

# Macro-F1 per locale, drawn as one bar per locale
df_f1 = df_eval.groupby('locale').apply(compute_f1)
P = sns.barplot(x=df_f1.index, y=df_f1.values)
# To plot accuracy instead, swap in compute_acc:
# df_acc = df_eval.groupby('locale').apply(compute_acc)
# P = sns.barplot(x=df_acc.index, y=df_acc.values)

P.set(title="Locale vs Performance", xlabel="Locale", ylabel="Performance")

# Turn the locale tick labels vertical so they do not overlap
for tick_label in P.get_xticklabels():
    tick_label.set_rotation(90)

