#Load required libraries

In [None]:
# Install/upgrade the fine-tuning stack. `-U` pulls the latest releases, so
# versions are not pinned and results may differ between runs of this notebook.
!pip install -q -U transformers peft accelerate optimum
# auto-gptq is installed from an extra wheel index; the `cu117` path segment
# suggests a CUDA 11.7 build — TODO confirm it matches the runtime's CUDA version.
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/
!pip install -q datasets
# loralib is the only dependency pinned to an exact version.
!pip install loralib==0.1.1

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

#Load quantized model

In [None]:
# Specify the model ID to be loaded:

model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Define the quantization configuration.
# `use_exllama=False` replaces the deprecated `disable_exllama=True` flag and
# matches the configuration used later when the fine-tuned model is loaded.
gptq_config = GPTQConfig(
    bits=4,             # Model weights are quantized to 4 bits (smaller, faster inference).
    use_exllama=False,  # Exllama kernel disabled: training with the exllama kernel is unstable.
)

# Load the quantized model:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=gptq_config,
    device_map="auto",       # Automatically distribute the model across available devices (if applicable).
    trust_remote_code=True,  # Allows executing custom model code shipped with the checkpoint.
)

In [None]:
# Load the tokenizer from the same `model_id` as the model so both come from
# the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Display the quantization settings attached to the loaded model's config as a dict.
model.config.quantization_config.to_dict()

#Load the dataset

In [None]:
# Load the "train" split of the ttbui/alpaca_webgen_html dataset.
from datasets import load_dataset
data = load_dataset("ttbui/alpaca_webgen_html", split="train")
# Bare expression: shows the dataset summary as the cell output.
data

In [None]:
def tokenize_function(dataset, max_length=2048):
    """Build a training prompt for every example in a batch and tokenize it.

    Intended to be used with ``Dataset.map(..., batched=True)``. Relies on the
    notebook-level ``tokenizer`` and, as a side effect, sets its padding token
    to the end-of-sequence token and its truncation side to ``"left"``.

    Args:
        dataset: Batch mapping of column name -> list of values. Must contain
            the "instruction" and "output" columns.
        max_length: Maximum number of tokens kept per example. Longer prompts
            are truncated from the left.

    Returns:
        The tokenizer output with NumPy arrays, one row per example in the
        batch (a single-example batch yields arrays with a leading dimension
        of 1).

    Raises:
        ValueError: If the "instruction" or "output" column is missing.
    """
    # Data structure check: fail with a clear message instead of reaching the
    # tokenizer with an undefined prompt.
    if "instruction" not in dataset or "output" not in dataset:
        raise ValueError(
            'tokenize_function expects "instruction" and "output" columns in the batch'
        )

    # Prompt construction: one prompt per example in the batch.
    # NOTE(review): the template ends with a stray "'" character; it is kept
    # unchanged so the prompts stay identical to the ones used so far — confirm
    # whether it is intentional.
    prompt_template = "Below is instruction that describes a task to code in HTML,what is output in HTML: \n \n'"
    texts_with_prompt = [
        prompt_template +
        '### Instruction: \n' + instruction +
        '\n ### Response: \n' + response
        for instruction, response in zip(dataset["instruction"], dataset["output"])
    ]

    # Tokenization
    tokenizer.pad_token = tokenizer.eos_token  # Set padding token to the end-of-sentence token
    tokenizer.truncation_side = "left"  # Truncate from the left if necessary

    # A single call is enough: truncation caps each prompt at `max_length`
    # tokens and shorter prompts keep their actual length. Padding only has an
    # effect when the batch holds several examples of different lengths.
    return tokenizer(
        texts_with_prompt,
        return_tensors="np",  # Return NumPy tensors
        padding=True,         # Pad sequences in the batch to equal length
        truncation=True,      # Enable truncation
        max_length=max_length,
    )

In [None]:
# Tokenization mapping: run tokenize_function over the dataset. With
# batched=True and batch_size=1, every call receives a batch holding a single
# example.
tokenized_dataset = data.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

In [None]:
# Show the summary of the tokenized dataset as the cell output.
tokenized_dataset

In [None]:
# Split into training and test sets: 25% of the rows are held out for testing.
# Shuffling uses a fixed seed so the split is the same on every run.
data_split = tokenized_dataset.train_test_split(test_size=0.25, shuffle=True, seed=123)
# Show the resulting splits as the cell output.
data_split

#Check base model results

In [None]:
def is_exact_match(a, b):
    """Return True when both strings are equal after trimming surrounding whitespace."""
    left, right = a.strip(), b.strip()
    return left == right

# Put the base model in evaluation mode before generating with it.
model.eval()

def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of `text` with `model` and return only the new text.

    Args:
        text: Prompt string.
        model: Causal language model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer matching `model`.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of NEW tokens to generate. The prompt
            length does not count against this budget.

    Returns:
        The decoded generated text, without the prompt and without special tokens.
    """
    # Tokenize (calling the tokenizer also yields the attention mask)
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Generate
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    # `max_new_tokens` bounds only the generated part. `max_length` would also
    # count the prompt, so prompts of `max_output_tokens` tokens or more would
    # leave no room for any output.
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_output_tokens
    )

    # Strip the prompt by token count rather than by character count: the
    # decoded prompt is not guaranteed to match `text` character for character
    # (for example when the input was truncated).
    prompt_length = input_ids.shape[1]
    generated_tokens = generated_tokens_with_prompt[0][prompt_length:]

    # Decode
    generated_text_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text_answer

In [None]:
# Retrieve one instruction (index 2) from the test split to use as a spot check:
test_question = data_split["test"]['instruction'][2]

# Generate an answer with the base (not fine-tuned) model.
# NOTE(review): the raw instruction is passed as the prompt, without the
# prompt template that tokenize_function builds — confirm this is intended.
generated_answer = inference(test_question, model, tokenizer)

# Print the question followed by the model's answer.
print(test_question)
print(generated_answer)

#Load the fine-tuned model from local

In [None]:
# Directory the fine-tuned model is loaded from in the next cell. This is a
# hard-coded absolute path (the /content prefix suggests a Colab runtime —
# adjust it when running elsewhere).
output_dir = "/content/output_dirc"

In [None]:
# 4-bit GPTQ configuration with the exllama kernel switched off.
gptq_config = GPTQConfig(
    bits=4,
    use_exllama=False,
)

# Load the fine-tuned model from the local output directory only
# (local_files_only=True), placing it on the available devices.
trained_model = AutoModelForCausalLM.from_pretrained(
    output_dir,
    local_files_only=True,
    quantization_config=gptq_config,
    trust_remote_code=True,
    device_map="auto",
)

#Evaluate the model

##Run model and compare to expected answer

In [None]:
# Repeat the spot check on the same test instruction (index 2), this time
# with the fine-tuned model.
test_question = data_split["test"]['instruction'][2]
generated_answer = inference(test_question, trained_model, tokenizer)
print(test_question)
print(generated_answer)

In [None]:
# Reference output for the same test example (index 2), for comparison with
# the generated answer.
answer = data_split["test"]['output'][2]
print(answer)

In [None]:
# Compare the generated answer with the reference, ignoring surrounding whitespace.
exact_match = is_exact_match(generated_answer, answer)
print(exact_match)

##Run over entire dataset and compare

In [None]:
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn.functional as F

In [None]:
# Initializing variables:
n = 20  # Maximum number of test examples to evaluate; -1 evaluates the whole test split.
metrics = {'exact_matches': []}
predictions = []

# Iterating through test data.
# The number of examples is fixed up front so exactly `n` examples are
# evaluated and the progress bar knows its total.
test_set = data_split["test"]
num_to_evaluate = len(test_set) if n == -1 else min(n, len(test_set))

for i in tqdm(range(num_to_evaluate)):
    item = test_set[i]
    instruction = item['instruction']
    output = item['output']

    # Generating predictions.
    # Best effort: a failing example is reported and skipped instead of being
    # silently dropped. `Exception` (not a bare except) keeps KeyboardInterrupt
    # able to stop the loop.
    try:
        predicted_output = inference(instruction, trained_model, tokenizer)
    except Exception as error:
        print(f"Skipping example {i}: {error!r}")
        continue
    predictions.append([predicted_output, output])

    # Calculating exact match metric
    exact_match = is_exact_match(predicted_output, output)
    metrics['exact_matches'].append(exact_match)

print('Number of exact matches: ', sum(metrics['exact_matches']))

Zero exact matches! Exact string match is too strict to be a useful evaluation metric for this dataset.

In [None]:
# Collect the [predicted, reference] pairs gathered by the evaluation loop
# into a two-column DataFrame and print it.
df = pd.DataFrame(predictions, columns=["predicted_answer", "target_answer"])
print(df)

In [None]:
# Sanity check: inspect the type of the predictions column.
type(df['predicted_answer'])

In [None]:
# Sanity check: inspect the type of the reference column.
type(df['target_answer'])

pandas.core.series.Series

##Evaluation with Metric: chr_f

ChrF is an evaluation metric that uses the F-score statistic for character n-gram matches. We use the implementation that is already available in sacrebleu.

In [None]:
# Install sacrebleu, whose chrF implementation is used for the metric below (version not pinned).
!pip install sacrebleu

In [None]:
# Load the chrF metric.
# NOTE(review): `datasets.load_metric` is believed to be deprecated in favor of
# the separate `evaluate` package and removed in datasets 3.0. Because
# `datasets` is installed without a version pin, this import may fail on a
# fresh environment — confirm against the installed version.
from datasets import load_metric  # For sacrebleu CHRF
chrf = load_metric("chrf")

In [None]:
prediction = df['predicted_answer'].tolist()  # Convert Series to list
reference = df['target_answer'].tolist()  # Convert Series to list

# Build a list of lists for the references: each prediction gets its own list
# of reference strings. Wrapping per element also works when no predictions
# were collected (an empty list stays empty instead of raising IndexError).
reference = [ref if isinstance(ref, list) else [ref] for ref in reference]

In [None]:
# Compute chrF over all collected predictions against their references and print the result.
results = chrf.compute(predictions=prediction, references=reference)
print(results)