In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import notebook_login
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

MODEL_NAME = "meta-llama/Llama-2-7b-hf"

model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
    return_dict=True,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
     
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/abdelrahman.sadallah/local/cuda-11.7/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/abdelrahman.sadallah/.conda/envs/nlp/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [2]:
import datasets
import transformers
from datasets import load_dataset
from evaluate import load

# train_dataset = load_dataset('json', data_files="data/naive_random.json", field="train",split="train")
val_dataset = load_dataset('json', data_files="../data/naive_random.json", field="val",split="train")
# test_dataset = load_dataset('json', data_files="data/naive_random.json", field="test",split="train")

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
unique_answers = np.unique(val_dataset['soln'])

unique_answers = pd.DataFrame(unique_answers)


unique_type = np.unique(val_dataset['type'])
print(f'Total number of unique types is: {len(unique_type)}')

print(f' total number of examples: {len(val_dataset)},    number of unique answers: {len(unique_answers)}')

Total number of unique types is: 1
 total number of examples: 28476,    number of unique answers: 20302


In [4]:
def concat_length(example):

    example["clue"] = f'{example["clue"]} ({example["orig_lengths"]})'

    return example

In [5]:
acc_metric = load("accuracy")

val_dataset = val_dataset.map(concat_length)


In [6]:


DEFAULT_SYSTEM_PROMPT = """
Below is a clue for a decrypting crossword. Your task is to solve this clue. The number of charachters in the answer should be same as the number in the parenthesis. Just output the answer only. Do not output any explanitions, just the words in the answer. 
""".strip()


def generate_training_prompt(
    clue: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}

### Input:
{clue.strip()}

""".strip()
     


In [7]:
val_dataset[0]

{'soln': 'rolltop',
 'type': 'cryptic',
 'idx': 67935,
 'pos': [0, 0],
 'dataset': '',
 'unique_clue_id': '',
 'lengths': [7],
 'number': 0,
 'lengths_punctuation': [],
 'id': '',
 'creator': 'Enigmatist',
 'clue': 'Desk register taken no further than Ozzie? (7)',
 'orig_lengths': '7',
 'soln_with_spaces': 'rolltop',
 'across_or_down': ''}

In [8]:
prompt = generate_training_prompt(val_dataset[10]['clue'])

print(prompt)

### Instruction: Below is a clue for a decrypting crossword. Your task is to solve this clue. The number of charachters in the answer should be same as the number in the parenthesis. Just output the answer only. Do not output any explanitions, just the words in the answer.

### Input:
Eccentric uncle has a right to form basic kind of family (7)


In [9]:
model = model.eval()
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)


In [10]:
# def get_answer(res):
    

In [11]:
def map_prompt(ex):
    ex['prompt'] =  generate_training_prompt(ex["clue"])

    return ex

In [12]:
val_dataset = val_dataset.map(map_prompt)

val_dataset = val_dataset.select_columns(['prompt', 'soln_with_spaces', 'clue' ])



In [13]:
val_dataset[0]

{'prompt': '### Instruction: Below is a clue for a decrypting crossword. Your task is to solve this clue. The number of charachters in the answer should be same as the number in the parenthesis. Just output the answer only. Do not output any explanitions, just the words in the answer.\n\n### Input:\nDesk register taken no further than Ozzie? (7)',
 'soln_with_spaces': 'rolltop',
 'clue': 'Desk register taken no further than Ozzie? (7)'}

In [14]:
def inference(prompts):
    
   
    encoding = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **encoding,
            max_new_tokens=64,
            temperature=0.00001,
            generation_config=generation_config,
        )

    answer_tokens = outputs[:, encoding.input_ids.shape[1] :]
    return answer_tokens
        


## EVAL LOOP

In [19]:
from torch.utils.data import DataLoader 
from tqdm import tqdm


val_dataloader = DataLoader(val_dataset.select(range(100)),batch_size = 64)

In [20]:
type(val_dataset.select(range(100)))

datasets.arrow_dataset.Dataset

In [21]:
# Define PAD Token = BOS Token
tokenizer.pad_token = tokenizer.bos_token
model.config.pad_token_id = model.config.bos_token_id


predictions = []
labels = []

for batch in tqdm(val_dataloader):

    prompts = batch['prompt']
    labels.extend (batch['soln_with_spaces'])
    ans = []

    outputs = inference(prompts=prompts)
    output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    for i in output_text:

        lines = i.split('\n')
        for i,l in enumerate(lines):
            if l=='### Output:':
                predictions.append( lines[i+1].lower())
                break







100%|██████████| 2/2 [00:20<00:00, 10.17s/it]


In [26]:
assert (len(predictions) == len(labels))

total = len(val_dataloader)
correct = 0
length_error =0

with open('pred_output.txt', 'w') as f:
    for pred,label in zip(predictions,labels):
        if pred == label:
            correct +=1

        if len(pred) == len(label):
            length_error +=1

        f.write(f'{pred} | {label} \n')


print(f'ACCURACY:  { float (correct / total)}')
print(f'Length error:  { float (1 - length_error / total)}')


ACCURACY:  0.0
Length error:  -5.0


In [None]:


        
    # id = i.find('### Output:')
    # print(i[id:])

    # nl = i.find
    # # print(i)

#     print(ans)

# print(labels)
# print(outputs)

ozzie
morgan
farm
eucharist
riotous
['rolltop', 'aesop', 'myriad', 'before', 'troublesome']
