In [15]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets
!pip install huggingface_hub
!pip install peft
!pip install trl

Collecting trl
  Downloading trl-0.8.1-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.7.3-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.1 trl-0.8.1 tyro-0.7.3


# Load the data

In [1]:
import pandas as pd
import os

data_path = "./data.csv"

if not os.path.exists(data_path):
    raise Exception("File not found : {}".format(data_path))

df = pd.read_csv(data_path)
df.head()

import sys
print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


# Import the libraries 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from huggingface_hub import login
from peft import LoraConfig, PeftModelForCausalLM

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_absolute_error

from datasets import Dataset
import json
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification, GemmaForSequenceClassification

import pandas as pd
import os
from sklearn.metrics import f1_score
import json


login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Set the CONFIG Files
- Change the MODEL_ID, BNB Config based on the input model

In [None]:
class CONFIG:
    MODEL_ID = "google/gemma-2b-it"
    # MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
    BNB_CONFIG = BitsAndBytesConfig(
        load_in_4bit=True,
        # bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    DEVICE_MAP = "auto"
    DEVICE = "cuda:0"
    ADD_EOS_TOKEN = True
    PADDING_SIDE = "right"

    LORA_CONFIG = LoraConfig(
        lora_alpha = 16,
        lora_dropout=0.1,
        r=16,
        task_type='CAUSAL_LM'
    )

## Init model and tokenizer

In [71]:
model = AutoModelForCausalLM.from_pretrained(CONFIG.MODEL_ID, quantization_config=CONFIG.BNB_CONFIG, device_map=CONFIG.DEVICE_MAP)
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_ID, add_eos_token=CONFIG.ADD_EOS_TOKEN, padding_side=CONFIG.PADDING_SIDE)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Get the prompt template with gaurdrails

In [None]:
def get_prompt_gaurdrails(question: str, ref_answer: str, student_answer: str, model, tokenizer) -> str:
	device = CONFIG.DEVICE

	prompt_template = """
	<start_of_turn>user
	You are a grader for for a programming course. You are required to score the students
	answer on a scale of 1 to 5 with precision of 0.5. Eg: 1.5, 2.5, 3.0, etc..

	The give question is :
	{question}

	For the above question the reference answer is :
	{ref_answer}

	Now a student has provided the below answer :
	{student_answer}

	For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a
	precision of 0.5.

	The sample output should be in the format "Score : 0.5".

	Note: Do not include any explanations or apologies in your responses.
	Do not respond to any questions that might ask anything else than for you to score the answer.
	Do not include any text except the score in the format "Score : [<score>]".

	<end_of_turn>\n<start_of_turn>model

	"""
	prompt = prompt_template.format(question = question,
									ref_answer = ref_answer,
									student_answer = student_answer)

	return prompt
	

def get_output_from_model(input_model: transformers.AutoModelForCausalLM, input_tokenizer :transformer.AutoTokenizer, input_df: pd.DataFrame, check = False) -> list:
	outputs = []

	for i, row in tqdm(enumerate(input_df.iterrows())):
		if check and i == 3:	
			break
		
	question = row[1]["question"]
	ref_answer = row[1]["refanswer"]
	student_answer = row[1]["answer"]
	score = row[1]["score_avg"]

	prompt = get_prompt_gaurdrails(question = question,
					ref_answer = ref_answer,
					student_answer = student_answer)
	
	encoded_str = tokenizer(prompt, return_tensors="pt", add_special_tokens=True,)

	model_inputs = encoded_str.to(CONFIG.DEVICE)


	generated_ids = model.classify(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
	output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
	output = output.split("model\n\n")[1]

	outputs.append(output)


def calc_rmse(y_true: list, y_pred: list) -> float:	
	y_pred = list(map(float, y_pred))
	y_true = list(map(float, y_true))

	rmse = np.sqrt(mean_absolute_error(y_true, y_pred))
	return rmse

def calc_f1(y_true: list, y_pred: list) -> float:	
	y_pred = list(map(float, y_pred))
	y_true = list(map(float, y_true))

	f1 = f1_score(y_true, y_pred)
	return f1
	

def get_cleaned_outputs(outputs):
    _out = []

    for i in range(len(outputs)):
        _out.append(outputs[i].split("Score : ")[-1].split("\n")[0])
		
    return _out
    

## Split the data into train and test df

In [None]:
# split the data into train and test set
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["score_avg"])
print("train shape : ", df_train.shape)
print("test shape : ", df_test.shape)

## Execute the below code to make sure the model is working properly and get a sample output

In [73]:
question = "What is the role of a prototype program in problem solving?"
ref_answer = "To simulate the behaviour of portions of the desired software product."
student_answer = "High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.  "
score = "3.5"

result = get_prompt_gaurdrails(question = question,
                        ref_answer = ref_answer,
                        student_answer = student_answer,
                        model = model,
                        tokenizer = tokenizer)

print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



  user
  You are a grader for for a programming course. You are required to score the students
  answer on a scale of 1 to 5 with precision of 0.5. Eg: 1.5, 2.5, 3.0, etc..

  The give question is : 
  What is the role of a prototype program in problem solving?

  For the above question the reference answer is : 
  To simulate the behaviour of portions of the desired software product.

  Now a student has provided the below answer : 
  High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.  

  For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a
  precision of 0.5.

  The sample output should be in the format "Score : 0.5".

  Note: Do not include any explanations or apologies in your responses.
  Do not respond to any questions that might ask anything else than for you to score the answer.
  Do not inclu

## Run the below code to generate the outputs from the model

In [82]:
outputs = get_output_from_model(model, tokenizer, df_train)

# add the outputs to the df_train dataframe for easy processing
df_train["model_output"][:len(outputs)] = outputs
# clean the output to remove the string associated with them
df_train["model_output"] = get_cleaned_outputs(df_train["model_output"].tolist())

# calculate the rmse square
calc_rmse(df_train["score_avg"].tolist(), df_train["model_output"].tolist())

0it [00:00, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:01,  1.07s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
2it [00:02,  1.02s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
3it [00:02,  1.03it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
4it [00:03,  1.03it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
5it [00:04,  1.03it/s]A decod

KeyboardInterrupt: 

# Finetune the model using Qlora

## Convert the data into prompts and store it as a json object

In [None]:
X = df.drop(columns="score_avg")
y = df["score_avg"]


# create a new set of train and test data
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

print("len of x_train : ", len(train_df))
print("len of x_test : ", len(test_df))


def convert_txt_to_qlora_dict(input_df : pd.DataFrame):

  __template_assistant = """
  Score : {score}
  """

  json_dict = {
      "messages" : []
  }

  for rowind, row in tqdm(input_df.iterrows()):
    user_prompt = get_prompt_gaurdrails(question = row["question"],
					ref_answer = row["refanswer"],
					student_answer = row["answer"])
    
    assistant_prompt = __template_assistant.format(score = row["score_avg"])

    ls = [
      {
        "role" : "user",
        "content" : user_prompt
      },
      {
        "role" : "assistant",
        "content" : assistant_prompt
      }
    ]

    json_dict["messages"].append(ls)

  return json_dict




In [None]:
# get the dict version of the prompts
train_json = convert_txt_to_qlora_dict(train_df)
test_json = convert_txt_to_qlora_dict(test_df)

# Specify the file path where you want to save the JSON file
train_file_path = "EN-train_chatml.json"
test_file_path = "EN-val_chatml.json"

# Save the dictionary as a JSON file
with open(train_file_path, "w") as json_file:
    json.dump(train_json, json_file)

with open(test_file_path, "w") as json_file:
    json.dump(train_json, json_file)

# load the file again
save_path = "./"
dataset_train_name = 'EN-train'
dataset_val_name = 'EN-val'

file_name_train_chatml = f"{dataset_train_name}_chatml.json"
file_name_val_chatml = f"{dataset_val_name}_chatml.json"

with open(save_path + file_name_train_chatml, 'r') as f:
  dataset_train = Dataset.from_dict(json.load(f))

with open(save_path + file_name_val_chatml, 'r') as f:
  dataset_val = Dataset.from_dict(json.load(f))


In [4]:
lora_model = AutoModelForCausalLM.from_pretrained(CONFIG.MODEL_ID, quantization_config=CONFIG.BNB_CONFIG, device_map=CONFIG.DEVICE_MAP)
lora_tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_ID, padding_side=CONFIG.PADDING_SIDE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Train the Qlora params

In [5]:
training_arguments = TrainingArguments(
    output_dir = "./output",
    evaluation_strategy="steps",
    logging_strategy="steps",
    lr_scheduler_type="constant",
    logging_steps=20,
    eval_steps=20,
    save_steps=20,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    eval_accumulation_steps=16,
    num_train_epochs=1,
    fp16=True,
    group_by_length = True,
    optim="paged_adamw_32bit",
    max_steps = 100
)
trainer = SFTTrainer(
    model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    peft_config=CONFIG.LORA_CONFIG,
    neftune_noise_alpha=5,
    max_seq_length=500,
    args = training_arguments
)
trainer.train()



Map:   0%|          | 0/4735 [00:00<?, ? examples/s]

Map:   0%|          | 0/4735 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
20,5.142,4.889306
40,4.4318,3.71348
60,3.2554,2.846437
80,2.5238,2.165644


Step,Training Loss,Validation Loss
20,5.142,4.889306
40,4.4318,3.71348
60,3.2554,2.846437
80,2.5238,2.165644
100,1.9419,1.734175


TrainOutput(global_step=100, training_loss=3.4589868545532227, metrics={'train_runtime': 4188.6957, 'train_samples_per_second': 0.764, 'train_steps_per_second': 0.024, 'total_flos': 5568902666575872.0, 'train_loss': 3.4589868545532227, 'epoch': 0.68})

## Save the finetuned model

In [10]:
model_save_name = "/output"

trainer.model.save_pretrained("/output")
finetuned_model = PeftModelForCausalLM.from_pretrained(model=model, model_id="/output")

# Sample Evaluation

In [8]:
messages=[
    {
        'role':'user',
        'content':'Who is Francesco Lelli?',
    }
]

messages = [
    {
        "role": "user",
        "content": "\n  The give question is : \n    What are the elements typically included in a class definition\n\n    For the above question the reference answer is : \n    Function members and data members\n\n    Now a student has provided the below answer : \n    the functions and variables used when the object is defined for the class\n\n    For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a\n    precision of 0.5.\n    "
        }
]

input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt").to("cuda")

# print(input_ids)
outputs_finetuned = finetuned_model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)
outputs = model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)

print("finetuned: " + tokenizer.decode(outputs_finetuned[0]).split('<start_of_turn>model\n')[-1])
print("normal   : " + tokenizer.decode(outputs[0]).split('<start_of_turn>model\n')[-1])

# Zip the folder for easy download from colab to local

In [14]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!zip -r ./output.zip ./output

  adding: output/ (stored 0%)
  adding: output/checkpoint-80/ (stored 0%)
  adding: output/checkpoint-80/rng_state.pth (deflated 25%)
  adding: output/checkpoint-80/trainer_state.json (deflated 70%)
  adding: output/checkpoint-80/adapter_model.safetensors (deflated 8%)
  adding: output/checkpoint-80/README.md (deflated 66%)
  adding: output/checkpoint-80/optimizer.pt (deflated 7%)
  adding: output/checkpoint-80/adapter_config.json (deflated 52%)
  adding: output/checkpoint-80/tokenizer_config.json (deflated 72%)
  adding: output/checkpoint-80/special_tokens_map.json (deflated 76%)
  adding: output/checkpoint-80/training_args.bin (deflated 51%)
  adding: output/checkpoint-80/scheduler.pt (deflated 58%)
  adding: output/checkpoint-80/tokenizer.model (deflated 51%)
  adding: output/checkpoint-80/tokenizer.json (deflated 72%)
  adding: output/checkpoint-60/ (stored 0%)
  adding: output/checkpoint-60/rng_state.pth (deflated 25%)
  adding: output/checkpoint-60/trainer_state.json (deflated 67

# Evaluate the model

In [19]:
def get_output_from_model_using_prompt(prompt : str, model, tokenizer, finetuned = False) -> str:
	device = CONFIG.DEVICE

	encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True,)

	model_inputs = encodeds.to(device)

	generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
	decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

	return (decoded)


In [37]:
outputs_finetuned = []
outputs_model = []
actual_outputs = []

for i, row in tqdm(enumerate(dataset_val["messages"])):
    # if i == :
    #   break

    actual_val = float(row[1]["content"].split(": ")[-1].split("\n")[0])
    actual_outputs.append(actual_val)

    user_prompt = row[0]["content"]

    output_finetuned = get_output_from_model_using_prompt(prompt = user_prompt,
                        model = model,
                        tokenizer = tokenizer)

    output_model = get_output_from_model_using_prompt(prompt = user_prompt,
                        model = model,
                        tokenizer = tokenizer)

    output_finetuned = output_finetuned.split("model\n\n")[1].split("\n")[0]
    output_model = output_model.split("model\n\n")[1].split("\n")[0]

    try:
        output_finetuned = float(output_finetuned.split(": ")[-1].split(" ")[0])
    except:
        pass

    try:
        output_model = float(output_model.split(": ")[-1].split(" ")[0])
    except:
        pass

    outputs_finetuned.append(output_finetuned)
    outputs_model.append(output_model)






1215it [56:11,  2.77s/it]


KeyboardInterrupt: 

In [38]:
print(actual_outputs)
print(outputs_finetuned)
print(outputs_model)

[4.0, 5.0, 2.0, 4.0, 3.0, 3.0, 4.0, 3.5, 3.0, 3.0, 4.5, 3.5, 2.0, 5.0, 4.5, 5.0, 5.0, 2.5, 4.5, 1.5, 5.0, 5.0, 5.0, 5.0, 2.5, 5.0, 4.0, 5.0, 5.0, 2.0, 4.5, 3.5, 4.0, 1.0, 3.0, 3.5, 4.5, 4.0, 5.0, 3.0, 2.5, 5.0, 5.0, 4.0, 5.0, 4.5, 2.5, 5.0, 4.0, 3.5, 1.5, 5.0, 3.5, 1.5, 3.0, 5.0, 5.0, 3.0, 2.0, 4.0, 5.0, 5.0, 4.0, 4.5, 4.0, 4.5, 5.0, 5.0, 4.5, 3.0, 4.0, 4.5, 0.0, 4.0, 3.0, 3.0, 5.0, 5.0, 5.0, 4.0, 3.5, 2.5, 3.0, 2.5, 3.5, 3.0, 5.0, 5.0, 4.5, 3.5, 5.0, 5.0, 5.0, 4.5, 3.5, 5.0, 4.0, 4.0, 4.5, 1.5, 5.0, 5.0, 2.0, 5.0, 2.0, 4.5, 5.0, 2.0, 5.0, 4.5, 4.0, 3.5, 2.0, 1.5, 2.0, 5.0, 5.0, 4.0, 5.0, 3.0, 3.5, 4.0, 5.0, 4.5, 4.0, 4.5, 5.0, 1.0, 2.5, 3.0, 4.5, 3.5, 5.0, 4.5, 2.5, 2.0, 5.0, 3.5, 5.0, 4.0, 4.0, 5.0, 1.5, 3.5, 4.0, 5.0, 3.0, 4.5, 5.0, 5.0, 4.5, 3.5, 5.0, 5.0, 4.5, 2.5, 5.0, 5.0, 5.0, 5.0, 3.0, 2.5, 5.0, 1.0, 3.5, 4.5, 5.0, 5.0, 5.0, 5.0, 4.5, 1.5, 5.0, 2.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, 0.0, 4.5, 5.0, 4.0, 2.5, 5.0, 3.5, 5.0, 4.5, 2.0, 5.0, 5.0, 4.5, 3.5, 2.5, 3.5, 4.5, 4.0,

# Manually check the values with no guardrails implemented

In [44]:
for i in range(len(outputs_finetuned)):
    if type(outputs_finetuned[i]) != float:
        print("Fine Tuned : ", i, " - ", outputs_finetuned[i])

    if type(outputs_model[i]) != float:
        print("Fine Tuned : ", i, " - ", outputs_model[i])

[4.0, 5.0, 2.0, 4.0, 3.0, 3.0, 4.0, 3.5, 3.0, 3.0, 4.5, 3.5, 2.0, 5.0, 4.5, 5.0, 5.0, 2.5, 4.5, 1.5, 5.0, 5.0, 5.0, 5.0, 2.5, 5.0, 4.0, 5.0, 5.0, 2.0, 4.5, 3.5, 4.0, 1.0, 3.0, 3.5, 4.5, 4.0, 5.0, 3.0, 2.5, 5.0, 5.0, 4.0, 5.0, 4.5, 2.5, 5.0, 4.0, 3.5, 1.5, 5.0, 3.5, 1.5, 3.0, 5.0, 5.0, 3.0, 2.0, 4.0, 5.0, 5.0, 4.0, 4.5, 4.0, 4.5, 5.0, 5.0, 4.5, 3.0, 4.0, 4.5, 0.0, 4.0, 3.0, 3.0, 5.0, 5.0, 5.0, 4.0, 3.5, 2.5, 3.0, 2.5, 3.5, 3.0, 5.0, 5.0, 4.5, 3.5, 5.0, 5.0, 5.0, 4.5, 3.5, 5.0, 4.0, 4.0, 4.5, 1.5, 5.0, 5.0, 2.0, 5.0, 2.0, 4.5, 5.0, 2.0, 5.0, 4.5, 4.0, 3.5, 2.0, 1.5, 2.0, 5.0, 5.0, 4.0, 5.0, 3.0, 3.5, 4.0, 5.0, 4.5, 4.0, 4.5, 5.0, 1.0, 2.5, 3.0, 4.5, 3.5, 5.0, 4.5, 2.5, 2.0, 5.0, 3.5, 5.0, 4.0, 4.0, 5.0, 1.5, 3.5, 4.0, 5.0, 3.0, 4.5, 5.0, 5.0, 4.5, 3.5, 5.0, 5.0, 4.5, 2.5, 5.0, 5.0, 5.0, 5.0, 3.0, 2.5, 5.0, 1.0, 3.5, 4.5, 5.0, 5.0, 5.0, 5.0, 4.5, 1.5, 5.0, 2.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, 0.0, 4.5, 5.0, 4.0, 2.5, 5.0, 3.5, 5.0, 4.5, 2.0, 5.0, 5.0, 4.5, 3.5, 2.5, 3.5, 4.5, 4.0,

## Clean the outputs using regex

In [58]:
import re

pattern = r"Score\s*:\s*([\d.]+)\b"
outputs_finetuned_cleaned = []
outputs_model_cleaned = []

for string in outputs_finetuned:
  if type(string) != float:
    match = re.search(pattern, string)
    if match:
        score = float(match.group(1))
        outputs_finetuned_cleaned.append(score)
    else:
      outputs_finetuned_cleaned.append(string)
  else:
    outputs_finetuned_cleaned.append(string)

for string in outputs_model:
  if type(string) != float:
    match = re.search(pattern, string)
    if match:
        score = float(match.group(1))
        outputs_model_cleaned.append(score)
    else:
      outputs_model_cleaned.append(string)
  else:
    outputs_model_cleaned.append(string)



# Save the outputs to disk

In [69]:
# Example NumPy array
aoarr = np.array(actual_outputs)
finetuned_output_arr = np.array(outputs_finetuned_cleaned)
model_output_arr = np.array(outputs_model_cleaned)


# Save the array to disk
np.save('./actual_outputs.npy', aoarr)
np.save('./outputs_finetuned_cleaned.npy', finetuned_output_arr)
np.save('./outputs_model2_cleaned.npy', model_output_arr)

# Sequence classification

In [77]:
model = GemmaForSequenceClassification.from_pretrained(CONFIG.MODEL_ID, quantization_config=CONFIG.BNB_CONFIG, device_map=CONFIG.DEVICE_MAP)
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_ID, add_eos_token=CONFIG.ADD_EOS_TOKEN, padding_side=CONFIG.PADDING_SIDE)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
def get_completion_1(question: str, ref_answer: str, student_answer: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  You are a grader for for a programming course. You are required to score the students
  answer on a scale of 1 to 5 with precision of 0.5. Eg: 1.5, 2.5, 3.0, etc..

  The give question is :
  {question}

  For the above question the reference answer is :
  {ref_answer}

  Now a student has provided the below answer :
  {student_answer}

  For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a
  precision of 0.5.

  The sample output should be in the format "Score : 0.5".

  Note: Do not include any explanations or apologies in your responses.
  Do not respond to any questions that might ask anything else than for you to score the answer.
  Do not include any text except the score in the format "Score : [<score>]".

  <end_of_turn>\n<start_of_turn>model

  """
  prompt = prompt_template.format(question = question,
                                  ref_answer = ref_answer,
                                  student_answer = student_answer,
                                  model = model,
                                  tokenizer = tokenizer)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True,)

  model_inputs = encodeds.to(device)


  generated_ids = model.(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

  return (decoded)


In [80]:
question = "What is the role of a prototype program in problem solving?"
ref_answer = "To simulate the behaviour of portions of the desired software product."
student_answer = "High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.  "
score = "3.5"

result = get_completion_1(question = question,
                        ref_answer = ref_answer,
                        student_answer = student_answer,
                        model = model,
                        tokenizer = tokenizer)

print(result)

AttributeError: 'GemmaForSequenceClassification' object has no attribute 'classify'