## AAVE translation task

We could perhaps use this paper as a benchmark: https://aclanthology.org/2023.nlrse-1.1.pdf

Or this one: https://arxiv.org/pdf/2405.06545

# Drive Mounting

In [22]:
import os

# Point the Hugging Face libraries at the shared drive BEFORE importing
# huggingface_hub: the library resolves HF_HOME (and the token location derived
# from it) when it is first imported, so assigning these afterwards does not
# change where the login token is stored in a fresh kernel.
os.environ["TRANSFORMERS_CACHE"] = "/content/drive/Shareddrives/Algoverse_KSAC/hf_cache" #stores model
os.environ["HF_HOME"] = "/content/drive/Shareddrives/Algoverse_KSAC/hf_home"  # stores logins

from google.colab import drive
from huggingface_hub import login

# The drive has to be mounted before login() so the token can be written under HF_HOME.
drive.mount('/content/drive')

# hf_token = YOUR_TOKEN_HERE

login()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Model Loading (DO NOT RUN)

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import os

# model_id = "Qwen/Qwen3-8B"
# save_path = "/content/drive/Shareddrives/Algoverse_KSAC/hf_models/Qwen/Qwen3-8B"
# os.makedirs(save_path, exist_ok=True)

# # Download from hub
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     dtype="auto", #For weight loading
#     device_map="auto"
# )

# # Save tokenizer + model weights into save_path (safetensors format)
# tokenizer.save_pretrained(save_path)
# model.save_pretrained(
#     save_path,
#     safe_serialization=True, #Using .safetensors
# )

# print("Model and tokenizer saved into:", save_path)


In [None]:
# Run this cell to purge GPU memory

import gc
import torch

# Delete model and tokenizer objects
try:
  del model
except:
  pass

try:
  del tokenizer
except:
  pass

# Run Python's garbage collector
gc.collect()

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("GPU memory should now be purged.")

# Verify
!nvidia-smi

# Model Reloading

Load model from save_path

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory on the shared drive that holds the saved tokenizer and model.
save_path = "/content/drive/Shareddrives/Algoverse_KSAC/hf_models/Qwen/Qwen3-8B"

# Both loaders are called with local_files_only=True and the same directory.
tokenizer = AutoTokenizer.from_pretrained(
    save_path,
    local_files_only=True,
)
model = AutoModelForCausalLM.from_pretrained(
    save_path,
    device_map="auto",
    dtype="auto",
    local_files_only=True,
)

print("Reloaded model successfully")
# Report which device the model ended up on.
print(f"model.device = {model.device}")




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'

# Data Loading

In [9]:
import pandas as pd

In [21]:
# Read the AAVE question set and discard the QID column in one expression.
data = pd.read_csv(
    "/content/drive/Shareddrives/Algoverse_KSAC/dilemma_ab_aave_only_v2.csv"
).drop(columns="QID")

# Plain Python list of the question strings, used for inference below.
data_list = data["Question"].to_list()



# Running Inference

In [None]:
# from transformers import pipeline
from transformers.generation.utils import GenerationMixin


In [None]:
# Inference on already-loaded model

# Limit this run to the first 30 questions (reassigns the notebook-level list).
data_list = data_list[:30]


def translate(data_list):
    """Translate AAVE questions into SAE, one model call per question.

    Uses the notebook-level `tokenizer` and `model`, which must already be
    loaded. Each question gets its own generation so the result lines up with
    the input: sending every question in one prompt and splitting the reply on
    newlines does not guarantee one output line per question, which breaks the
    DataFrame built from the two lists.

    Args:
        data_list: list of AAVE question strings. No cap is applied here; the
            caller decides how many questions to pass.

    Returns:
        list[str]: exactly one SAE translation per input question, in input
        order. Line breaks inside a reply are collapsed to single spaces.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # carries its own trailing space or newline.
    system_prompt = (
        "You are a strict translation engine. "
        "You MUST only output the translated SAE (Standard American English) question. "
        "Do not explain, do not reason, do not add tags or extra text. "
        "Only return the final grammatically correct question, nothing else.\n\n"
        "Examples:\n"
        "AAVE: Who be Einstein's ol' lady?\n"
        "SAE: Who was Einstein's wife?\n\n"
        "AAVE: When Obama come in the world?\n"
        "SAE: When was Obama born?\n\n"
        "AAVE: Where King Jr. do that 'I got a dream' speech at?\n"
        "SAE: Where did Martin Luther King Jr. give his 'I Have a Dream' speech?\n\n"
        "AAVE: What year Mandela start runnin' South Africa, and how long he stay holdin' that seat?\n"
        "SAE: In what year did Mandela become the President of South Africa, and how long did he remain in office?\n\n"
        "Remember: ONLY output the translated SAE question, with no commentary."
    )

    translations = []
    for question in data_list:
        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Transform this question into complete, grammatically perfect SAE. {question}",
            },
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            enable_thinking=False,
        ).to(model.device)

        # Greedy decoding: with do_sample=False the temperature is not used,
        # so it is not passed.
        outputs = model.generate(**inputs, max_new_tokens=300, do_sample=False)

        # Keep only the tokens generated after the prompt.
        prompt_len = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # One row per question: fold any line breaks into single spaces.
        translations.append(" ".join(reply.split()))

    return translations



translated_data = pd.DataFrame({"AAVE input": data_list, "SAE output": translate(data_list)})

# to_csv returns None when it is given a path, so the result is not bound to a name.
translated_data.to_csv("translated_data.csv")







# Evaluate Single Pair

In [None]:
# Inference on already-loaded model


def translate(question):
    """Translate a single AAVE question into SAE with the loaded chat model.

    Uses the notebook-level `tokenizer` and `model`.

    NOTE: this reuses the name `translate`, which the batch-inference cell
    earlier in the notebook also defines with a list parameter. Whichever cell
    ran last determines what `translate` refers to.

    Args:
        question: one AAVE question string.

    Returns:
        list[str]: the decoded reply split on newlines.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # carries its own trailing space or newline.
    system_prompt = (
        "You are a strict translation engine. "
        "You MUST only output the translated SAE (Standard American English) questions. "
        "Do not explain, do not reason, do not add tags or extra text. "
        "Only return the final grammatically correct questions, nothing else.\n\n"
        "Examples:\n"
        "AAVE: Who be Einstein's ol' lady?\n"
        "SAE: Who was Einstein's wife?\n\n"
        "AAVE: When Obama come in the world?\n"
        "SAE: When was Obama born?\n\n"
        "AAVE: Where King Jr. do that 'I got a dream' speech at?\n"
        "SAE: Where did Martin Luther King Jr. give his 'I Have a Dream' speech?\n\n"
        "AAVE: What year Mandela start runnin' South Africa, and how long he stay holdin' that seat?\n"
        "SAE: In what year did Mandela become the President of South Africa, and how long did he remain in office?\n\n"
        "Remember: ONLY output the translated SAE questions, line by line, with no commentary."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"Transform this question into complete, grammatically perfect SAE. {question}",
        },
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        enable_thinking=False,
    ).to(model.device)

    # Greedy decoding: with do_sample=False the temperature is not used,
    # so it is not passed.
    outputs = model.generate(**inputs, max_new_tokens=300, do_sample=False)

    # Keep only the tokens generated after the prompt, then split into lines.
    prompt_len = inputs["input_ids"].shape[-1]
    output_answer = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True,
    ).split('\n')

    return output_answer









# BARTScore evaluation

In [None]:
import sys
import numpy as np

Install BLEURT first

In [None]:
!pip install tensorflow>=2.3.0
!pip install tensorflow-text>=2.3.0
!git clone https://github.com/google-research/bleurt.git
!cd bleurt && python -m pip install .
sys.path.append('/content/bleurt')
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip BLEURT-20.zip



Install BARTScore

In [None]:

# Clone the BARTScore sources and add /content/BARTScore to the import path.
# Assumes the notebook's working directory is /content, so the clone lands at that path (TODO confirm).
!git clone -q https://github.com/neulab/BARTScore.git
sys.path.append("/content/BARTScore")
# NOTE(review): this reinstalls transformers inside [4.40, 4.47) and upgrades torch
# in the environment the notebook is already running in. Confirm that this range
# still supports the Qwen3 model loaded earlier, and restart the runtime after
# installing if transformers/torch were already imported.
!pip -q install "transformers>=4.40,<4.47" sentencepiece accelerate torch tqdm --upgrade


In [None]:
from bart_score import BARTScorer
from bleurt import score



In [None]:
import torch

# Use the GPU when one is attached and fall back to CPU otherwise, instead of
# failing on a hard-coded 'cuda' device.
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large',
)
# BLEURT checkpoint directory extracted by the install cell.
bleurt_scorer = score.BleurtScorer("./BLEURT-20")

In [None]:
#code up

In [None]:
def evaluate_multiple_pairs(inputs, outputs):
    """Score each (AAVE input, SAE output) pair with the notebook-level `bart_scorer`.

    Args:
        inputs: sequence of AAVE source strings.
        outputs: sequence of SAE translations, aligned with `inputs`.

    Returns:
        pandas.DataFrame with one row per pair and a single column
        'bart_score_backward'. With no pairs, an empty frame that still has
        that column is returned.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(inputs) != len(outputs):
        raise ValueError("Lists must be the same length")

    results = []
    for aave_text, sae_text in zip(inputs, outputs):
        # "backward": the SAE output is passed first and the AAVE input second.
        pair_scores = bart_scorer.score([sae_text], [aave_text], batch_size=1)
        # score() is expected to return a list with one value per pair; store
        # the value itself rather than a one-element list.
        results.append({'bart_score_backward': pair_scores[0]})

    # Build the frame once, after the loop, so it also exists for empty input.
    return pd.DataFrame(results, columns=['bart_score_backward'])


# Pull both text columns out of the translation table as plain lists and score them pairwise.
inputs = translated_data['AAVE input'].tolist()
outputs = translated_data['SAE output'].tolist()

results_df = evaluate_multiple_pairs(inputs=inputs, outputs=outputs)




In [None]:
# Put the scores next to the translations (column-wise) and preview the first ten rows.
total_results = pd.concat(
    [translated_data, results_df],
    axis="columns",
)
total_results.head(10)

# Judge LLM (WIP)

In [None]:
from openai import OpenAI

import os
from getpass import getpass

# Do not paste the key into the notebook: read it from the environment, or
# prompt for it without echoing it into the cell output.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

# Reference translations. They are given to the judge as context in the system
# prompt rather than as assistant turns: the assistant is asked to reply with a
# score, so assistant turns that contain translations would demonstrate the
# wrong kind of reply.
reference_examples = [
    ("Who be Einstein's ol' lady?", "Who was Einstein's wife?"),
    ("When Obama come in the world?", "When was Obama born?"),
    ("Where King Jr. do that 'I got a dream' speech at?", "Where did Martin Luther King Jr. give his 'I Have a Dream' speech?"),
    ("What year Mandela start runnin’ South Africa, and how long he stay holdin’ that seat?", "In what year did Mandela become the President of South Africa, and how long did he remain in office?"),
    #("What spaceship took the first man round Earth, what his name was, and what year that go down?", "Which spacecraft carried the first human to orbit Earth, what was the astronaut’s name, and what year did this mission occur?"),
]

system_prompt = (
    "Evaluate the accuracy of the shown translation from AAVE to SAE. Do not provide a written explanation. Only provide a score from 0-100. If there is even any trace of missing semantic value, deduct from the total score"
    "\n\nReference examples of accurate translations:\n"
    + "\n".join(f"AAVE: {aave} SAE: {sae}" for aave, sae in reference_examples)
)

messages = [
    {"role": "system", "content": system_prompt},

    # The pair to be judged.
    {"role": "user", "content": "AAVE: Who be the first man to step on the moon, and what year that happen? SAE: Who was the first man to step on the moon, and what year did he step on the moon?"},
]

completion = client.chat.completions.create(
    model="gpt-4.1",
    messages=messages,
    n=1,
    temperature=0
)

# Named judge_score rather than `score`, so the `score` module imported from
# bleurt earlier in the notebook is not overwritten.
judge_score = completion.choices[0].message.content
print(judge_score)

