In [None]:
%%capture
# Install Unsloth straight from its GitHub repository (Colab extra); the cell's output is captured.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
# Pin xformers to 0.0.27 when the installed torch is older than 2.4.0; otherwise install it unpinned.
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps: install these packages without pulling in their own dependencies.
# NOTE(review): trl/peft/accelerate/bitsandbytes/triton are unpinned, so reruns may resolve different versions.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from huggingface_hub import login
from google.colab import drive
import re

login()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [None]:
def load_model_and_tokenizer(model_path, max_seq_length=2048, dtype=None, load_in_4bit=True):
    """Load a model/tokenizer pair through Unsloth and switch the model to inference mode.

    Args:
        model_path: Value passed to ``FastLanguageModel.from_pretrained`` as
            ``model_name`` (the notebook passes Hugging Face repo ids).
        max_seq_length: Forwarded unchanged to ``from_pretrained``. Defaults to
            2048, the value this notebook previously hard-coded.
        dtype: Forwarded unchanged to ``from_pretrained``. Defaults to ``None``.
        load_in_4bit: Forwarded unchanged to ``from_pretrained``. Defaults to
            ``True``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by ``from_pretrained``, after
        ``FastLanguageModel.for_inference`` has been applied to the model.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Must be called before generation; it prepares the loaded model for inference.
    FastLanguageModel.for_inference(model)
    return model, tokenizer


def format_prompt(question, essay):
    """Build the zero-shot grading prompt for one question/essay pair.

    The prompt ends with ``"Grade: "`` (including the trailing space) so the
    model's continuation is the grade itself.

    Args:
        question: The question the essay answers.
        essay: The single-sentence essay to grade.

    Returns:
        str: The complete prompt text.
    """
    instruction = "Grade the essay(single-sentence) below as either Acceptable or Unacceptable based on content, ignoring language errors. Your response must be exactly one word: either 'Acceptable' or 'Unacceptable'."
    prompt_lines = [
        instruction,
        "",  # blank line separating the instruction from the example
        f"Question: {question}",
        f"Essay: {essay}",
        "Grade: ",
    ]
    return "\n".join(prompt_lines)

#FEW SHOT

# def format_prompt(question, essay):
#     return f"""Grade the essay(single-sentence) below as either Acceptable or Unacceptable based on content, ignoring language errors. Your response must be exactly one word: either 'Acceptable' or 'Unacceptable'.

# Question: How does natural selection contribute to evolution?
# Essay: Natural selection favors traits que increase survival y reproduction, gradually changing especies over time through genetic herencia.
# Grade: Acceptable

# Question: What is the principle of federalism in government structure?
# Essay: Federalism divide power equally entre all levels of government, ensuring que local, state, y national governments tienen identical responsibilities.
# Grade: Unacceptable

# Question: How does a turbocharger increase an engine's power?
# Essay: Turbochargers inyectan extra fuel directly into los engine cylinders, resultando en more powerful explosions y increased horsepower.
# Grade: Unacceptable

# Question: {question}
# Essay: {essay}
# Grade: """


def extract_grade(response):
    """Extract the final grade from a decoded model response.

    Only the last ``Grade:`` marker in the text is considered (the negative
    lookahead rejects any marker that is followed by another one), so grades
    appearing earlier in the text are ignored.

    Args:
        response: Decoded text to search. Empty or ``None`` input yields
            ``"Unknown"``.

    Returns:
        str: ``"Acceptable"`` or ``"Unacceptable"`` in canonical casing, or
        ``"Unknown"`` when no grade follows the last ``Grade:`` marker.
    """
    if not response:
        return "Unknown"

    match = re.search(r'Grade:\s*(Acceptable|Unacceptable)(?![\s\S]*Grade:)', response, re.IGNORECASE)
    if match is None:
        return "Unknown"

    # The search is case-insensitive, so the captured text may be e.g.
    # "acceptable" or "UNACCEPTABLE". Canonicalise it so the result compares
    # equal to the "Acceptable"/"Unacceptable" labels used for scoring.
    return match.group(1).capitalize()

def generate_grades_batch(model, tokenizer, questions, essays, batch_size=4):
    """Generate a grade for every question/essay pair, processing them in batches.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it is called with
            ``padding=True``.
        questions: Sequence of question strings.
        essays: Sequence of essay strings, aligned index-by-index with
            ``questions``.
        batch_size: Number of prompts tokenized and generated together.

    Returns:
        tuple: ``(all_grades, all_outputs)`` where ``all_grades`` holds the
        result of ``extract_grade`` for each pair and ``all_outputs`` holds the
        stripped decoded text for each pair, both in input order.

    Raises:
        ValueError: If ``questions`` and ``essays`` differ in length.
    """
    # zip() would silently drop the unmatched tail, so fail loudly instead.
    if len(questions) != len(essays):
        raise ValueError(
            f"questions and essays must have the same length "
            f"(got {len(questions)} and {len(essays)})"
        )

    all_grades = []
    all_outputs = []

    # A decoder-only model continues from the last position of each row. With
    # right padding, shorter prompts in a batch would end in pad tokens and the
    # continuation would be conditioned on padding. Pad on the left for the
    # duration of this call and restore the caller's setting afterwards.
    original_padding_side = getattr(tokenizer, "padding_side", None)
    tokenizer.padding_side = "left"
    try:
        for i in range(0, len(questions), batch_size):
            batch_questions = questions[i:i+batch_size]
            batch_essays = essays[i:i+batch_size]

            prompts = [format_prompt(q, e) for q, e in zip(batch_questions, batch_essays)]
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=5,  # the expected answer is a single word
                    num_return_sequences=1,
                    do_sample=False,  # greedy decoding
                    pad_token_id=tokenizer.eos_token_id
                )

            # The full output sequences are decoded here (nothing is sliced off),
            # and extract_grade looks for the last "Grade:" marker in that text.
            responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for response in responses:
                all_grades.append(extract_grade(response))
                all_outputs.append(response.strip())
    finally:
        if original_padding_side is not None:
            tokenizer.padding_side = original_padding_side

    return all_grades, all_outputs

def evaluate_model(model, tokenizer, dataset, batch_size=8):
    """Grade every row of ``dataset`` with the model and score the predictions.

    Args:
        model: Model passed through to ``generate_grades_batch``.
        tokenizer: Tokenizer passed through to ``generate_grades_batch``.
        dataset: Mapping-like object with ``'question'``, ``'essay'`` and
            ``'grade'`` columns.
        batch_size: Batch size passed through to ``generate_grades_batch``.

    Returns:
        tuple: ``(accuracy, f1, auc, predicted_labels, outputs)``.

        * ``accuracy`` compares the raw label strings, so a prediction of
          ``"Unknown"`` always counts as wrong.
        * ``f1`` is the F1 score of the ``"Acceptable"`` class.
        * ``auc`` is ``roc_auc_score`` over hard 0/1 predictions (not
          probabilities); it is ``nan`` when the true labels contain only one
          class.
    """
    questions = dataset['question']
    essays = dataset['essay']
    true_labels = dataset['grade']
    predicted_labels, outputs = generate_grades_batch(model, tokenizer, questions, essays, batch_size)

    accuracy = accuracy_score(true_labels, predicted_labels)

    # 1 = "Acceptable", 0 = anything else (including "Unknown" predictions).
    true_binary = [1 if label == "Acceptable" else 0 for label in true_labels]
    pred_binary = [1 if label == "Acceptable" else 0 for label in predicted_labels]

    # Score F1 on the binarised labels: with string labels, a single "Unknown"
    # prediction makes the target multiclass and binary-average f1_score raises,
    # discarding every prediction generated above. The "Acceptable"-class
    # TP/FP/FN counts are identical either way.
    f1 = f1_score(true_binary, pred_binary, pos_label=1)

    # roc_auc_score is undefined (and raises) when only one class is present in
    # the true labels; report nan rather than losing the other results.
    if len(set(true_binary)) < 2:
        auc = float("nan")
    else:
        auc = roc_auc_score(true_binary, pred_binary)

    return accuracy, f1, auc, predicted_labels, outputs

# Load the evaluation CSV; it is read back from the 'train' split of the loaded result.
# NOTE(review): the variable is called dev_dataset but the file is final_english_test.csv — confirm which split is intended.
dev_dataset = load_dataset('csv', data_files='final_english_test.csv')['train']

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Load the checkpoint to evaluate (the ft_ prefix suggests this is the fine-tuned model).
# NOTE(review): the repo id says "spanish" while the evaluation file is final_english_test.csv — confirm the pairing is intended.
ft_model, ft_tokenizer = load_model_and_tokenizer("anandHF/spanish_llama3.1")

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Score ft_model on dev_dataset using evaluate_model's default batch size.
ft_accuracy, ft_f1, ft_auc, ft_predictions, ft_outputs = evaluate_model(ft_model, ft_tokenizer, dev_dataset)

In [None]:
ft_accuracy

0.96

In [None]:
ft_f1

0.9647266313932981

In [None]:
ft_auc

0.9618546907756813

In [None]:
# Load the stock Llama 3.1 8B Instruct 4-bit checkpoint as the comparison baseline.
# NOTE(review): ft_model is still referenced at this point, so both models may be resident on the GPU — confirm memory headroom.
default_model, default_tokenizer = load_model_and_tokenizer("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [None]:
# Score the baseline on the same dev_dataset so its metrics are directly comparable to the ft_* values.
default_accuracy, default_f1, default_auc, default_predictions, default_outputs = evaluate_model(default_model, default_tokenizer, dev_dataset)

Evaluating default Llama 3.1 Instruct model...


In [None]:
default_accuracy

0.826

In [None]:
default_f1

0.8421052631578948

In [None]:
default_auc

0.829664570230608