<a href="https://colab.research.google.com/github/bencarlstrom/llm-a2/blob/main/A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Ben Carlstrom  
ODIN: ben25

# **Assignment 2: Large Language Models for Text Classification**

---

In [None]:
'''
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.

fsspec==2024.10.0 eliminates this dependency error in datasets, but leads to downstream errors in code
'''

!pip install -U datasets
import re

import torch
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import login
from sklearn.metrics import classification_report
from transformers import AutoModelForCausalLM, AutoTokenizer



In [None]:
# Authenticate with the Hugging Face Hub using the token stored under the
# Colab secret named 'huggingface_token' (nothing is hard-coded here).
login(token=userdata.get('huggingface_token'))
# Load the "english" configuration of the multilingual tweet-sentiment dataset.
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Llama 3.2 1B tokenizer and causal-LM weights; these are the objects that
# calculate_option_log_prob reads. No dtype/device arguments are passed, so the
# library defaults apply.
llama_1B_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
llama_1B_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

In [None]:
# Llama 3.2 3B tokenizer and causal-LM weights (library-default dtype/device).
llama_3B_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
llama_3B_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")

In [None]:
# Phi-3.5-mini-instruct tokenizer and causal-LM weights (library-default dtype/device).
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
phi_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct")

## **Experiment 1: Zero-shot inference**
---

In [None]:
'''
  Method #2: Get log probabilities for keywords from final tensor output
'''

# Map each sentiment word to the id of its first token under the Llama 1B
# tokenizer (no special tokens added). Insertion order is positive, negative,
# neutral.
sentiment_ids = {
    label: llama_1B_tokenizer.encode(label, add_special_tokens=False)[0]
    for label in ("positive", "negative", "neutral")
}

def calculate_option_log_prob(prompt, options):
    """Score each candidate continuation of ``prompt`` and pick the likeliest.

    For every option the function sums the log-probabilities the Llama 1B model
    assigns to the option's tokens when they directly follow the prompt:
    ``log P(tok_0 | prompt) + log P(tok_1 | prompt, tok_0) + ...``.

    Args:
        prompt: Text the options are scored as continuations of.
        options: Iterable of option strings. A dict may be passed; only its
            keys are iterated.

    Returns:
        ``(best_option, log_probs)`` where ``log_probs`` maps each option to
        its summed log-probability and ``best_option`` is the key with the
        highest value (the first one wins a tie).

    Raises:
        ValueError: If ``options`` is empty or an option tokenizes to nothing.

    Side effects:
        Prints the greedy (argmax) decoding of the prompt-only forward pass.
    """
    prompt_ids = llama_1B_tokenizer.encode(prompt, return_tensors="pt")
    prompt_len = prompt_ids.shape[1]

    with torch.no_grad():
        prompt_logits = llama_1B_model(prompt_ids).logits

    decoded_text = llama_1B_tokenizer.decode(torch.argmax(prompt_logits, dim=-1).squeeze().tolist())
    print(f"Decoded output:\n {decoded_text}")

    log_probs = {}
    for option in options:
        # NOTE(review): options are tokenized without a leading space; after a
        # prompt ending in ":" a variant like " negative" may be the more
        # natural continuation. Confirm which form should be scored.
        option_ids = llama_1B_tokenizer(option, add_special_tokens=False).input_ids
        if not option_ids:
            # An option with no tokens would score 0.0 and always win.
            raise ValueError(f"Option {option!r} produced no tokens")

        if len(option_ids) == 1:
            # The prompt-only pass already holds the distribution over the
            # single token that follows the prompt.
            logits = prompt_logits
        else:
            # Later option tokens must be conditioned on the earlier ones, so
            # run the model on prompt + option instead of reusing logits from
            # positions inside the prompt.
            option_tensor = torch.tensor(
                [option_ids], dtype=prompt_ids.dtype, device=prompt_ids.device
            )
            with torch.no_grad():
                logits = llama_1B_model(torch.cat([prompt_ids, option_tensor], dim=1)).logits

        option_log_prob = 0.0
        for i, token_id in enumerate(option_ids):
            # Logits at position p are the prediction for the token at p + 1,
            # so option token i is predicted at position prompt_len - 1 + i.
            step_log_probs = torch.log_softmax(logits[0, prompt_len - 1 + i], dim=-1)
            option_log_prob += step_log_probs[token_id].item()

        # Store the sum of log probabilities for each option
        log_probs[option] = option_log_prob

    if not log_probs:
        raise ValueError("options must contain at least one candidate")

    # Determine the option with the highest log probability
    best_option = max(log_probs, key=log_probs.get)
    return best_option, log_probs



# Example input for the single-prompt demo below. The commented-out line swaps
# in the first validation tweet instead (annotated by the author as label zero).
#tweet = dataset['validation'][0]['text'] # label is zero
tweet = "I hate waiting in line"
'''
  Cleaning the tweet before processing doesn't appear to help
'''
import re
def clean_output(tweet):
    """Return ``tweet`` with disallowed characters removed and whitespace normalised.

    Only ASCII letters, digits, whitespace and the punctuation ``, . ? !`` are
    kept. Each run of whitespace is then collapsed to a single space and the
    result is trimmed at both ends.
    """
    # Drop every character outside the allowed set.
    allowed_only = re.compile(r'[^A-Za-z0-9\s,.?!]').sub('', tweet)
    # Squash whitespace runs to one space, then trim the ends.
    single_spaced = re.compile(r'\s+').sub(' ', allowed_only)
    return single_spaced.strip()
# Normalise the example tweet before it is placed in the prompt.
clean_tweet = clean_output(tweet)
print(f"Cleaned Tweet: {clean_tweet}\n")

# The prompt stops right after the colon, so the candidate labels are scored
# as the text that would come next.
prompt= f"Text: {clean_tweet} \nSentiment (positive(2), negative(0), neutral(1)):"
print(f"Prompt:\n {prompt}\n")

# sentiment_ids is a dict; iterating it inside calculate_option_log_prob yields
# only its keys (the label words), so the stored token ids are not used here.
best_option, log_probs = calculate_option_log_prob(prompt, sentiment_ids)
print(f"Predicted sentiment: {best_option}\n")
print("Log probabilities:", log_probs)


Cleaned Tweet: I hate waiting in line

Prompt:
 Text: I hate waiting in line 
Sentiment (positive(2), negative(0), neutral(1)):

Decoded output:
 Questionile  Corinthians to. line.I::1):1. negative(1))
 neutral(0))
 
Predicted sentiment: negative

Log probabilities: {'positive': -9.29300594329834, 'negative': -9.237157821655273, 'neutral': -11.064420700073242}


In [None]:
'''
  Method #1: parse output text for keywords and keep list of labels
'''

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import precision_score, recall_score, f1_score

# Reuse the checkpoints loaded in the earlier cells instead of calling
# from_pretrained() again, which would hold a second copy of the weights in memory.
# To evaluate Phi-3.5 instead, point these aliases at phi_tokenizer / phi_model.

tokenizer = llama_3B_tokenizer
model = llama_3B_model


# Sample sentences and actual labels
# sentences = [
#     "I love this product!",            # Expected label: 2 (positive)
#     "This is okay.",                   # Expected label: 1 (neutral)
#     "I hate waiting in line.",         # Expected label: 0 (negative)
# ]
#actual_labels = [2, 1, 0]  # Corresponding actual labels

# Evaluate on the first ten validation examples only: their texts and the
# corresponding gold labels.
sentences = dataset['validation']['text'][:10]
actual_labels = dataset['validation']['label'][:10]


# Answers the model may give, as a digit or as a word, mapped to the label id.
_LABEL_BY_ANSWER = {
    "0": 0, "negative": 0,
    "1": 1, "neutral": 1,
    "2": 2, "positive": 2,
}
# Word boundaries keep digits inside larger numbers (e.g. "10") from matching.
_ANSWER_PATTERN = re.compile(r"\b(?:[012]|negative|neutral|positive)\b", re.IGNORECASE)


def _parse_sentiment_label(completion):
    """Return the label for the first digit/keyword answer in ``completion``, or -1 if none."""
    match = _ANSWER_PATTERN.search(completion)
    if match is None:
        return -1
    return _LABEL_BY_ANSWER[match.group(0).lower()]


# Prepare a function to classify sentences
def classify_sentences(sentences):
    """Zero-shot classify each sentence with the module-level ``tokenizer``/``model``.

    Each sentence is placed in an instruction prompt and the model generates a
    continuation. The label is parsed from the *generated* text only: the
    prompt itself contains the words "negative", "neutral" and "positive", so
    searching the full decoded sequence would match "negative" every time.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one int per sentence: 0 (negative), 1 (neutral),
        2 (positive), or -1 when no recognisable answer was generated.

    Side effects:
        Prints the full decoded sequence (prompt plus continuation) for each
        sentence.
    """
    predictions = []
    for sentence in sentences:
        prompt = f"Classify this sentence as negative (0), neutral (1), or positive (2). Only pick one number and only respond with the number. If it's negative output 0, if it's neutral output 1, if it's positive output 2. Here's the sentence: {sentence}"

        inputs = tokenizer(prompt, return_tensors='pt')
        outputs = model.generate(**inputs, max_new_tokens=200)
        classification = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(classification)

        # generate() returns the prompt tokens followed by the new tokens for a
        # causal LM; slice the prompt off before looking for the answer.
        prompt_len = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # The first digit or keyword in the completion decides the label; -1 if unclear.
        predictions.append(_parse_sentiment_label(completion))

    return predictions

# Classify sentences
predicted_labels = classify_sentences(sentences)

# Calculate precision, recall, and F1 score
# average='weighted' averages the per-class scores weighted by each class's
# count in actual_labels. A -1 prediction (unclear output) never equals a gold
# label, so it counts as an error.
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print results
print(f"Predicted labels: {predicted_labels}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   4%|4         | 199M/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
'''
  Verifying that precision, recall, and f1 are calculated accurately when using
  lists of labels in method #1
'''

# Hand-built case in which every prediction is wrong, so all three scores
# should come out as 0.
# NOTE(review): this reassigns predicted_labels, actual_labels, precision,
# recall and f1, replacing the values computed by the evaluation cell above.
# Confirm no later cell expects the real results under these names.
predicted_labels = [0, 0, 1, 1, 2, 2]
actual_labels = [1, 1, 2, 2, 0, 0]

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print results
print(f"Predicted labels: {predicted_labels}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Predicted labels: [0, 0, 1, 1, 2, 2]
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
