In [1]:
from tqdm import tqdm
from scrape_alpaca import AlpacaParser

from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import transformers


from torch.utils.data import Dataset
import torch
import torch.nn.functional as F

In [2]:
'''
Parse json data and extract questions
'''

parser = AlpacaParser()
questions = parser.get_questions()

In [3]:
'''
Pass questions into LLaMA v2 to derive outputs
'''

prefix_q = """
Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
"""

def generate_message(question):
    formatted_question = question.replace("'", '')

    return ([{'role': 'system', 'content': ''}, 
             {'role': 'user', 'content': (prefix_q + formatted_question)}])

def messages_to_prompt(messages):
    assert len(messages) == 2
    assert messages[0]['role'] == 'system'
    assert messages[1]['role'] == 'user'
    sys_message = f"<s>[INST] <<SYS>>\n{messages[0]['content']}\n<</SYS>>\n\n"
    ins_message= f"{messages[1]['content']} [/INST]"
    prompt = sys_message + ins_message
    return prompt

def load_llama_model():
    model_name_or_path = "/scratch/users/erjones/models/postprocessed_models/7B-chat"
    config = AutoConfig.from_pretrained(model_name_or_path, max_new_tokens = 256)
    use_fast_tokenizer = "LlamaForCausalLM" not in config.architectures
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=use_fast_tokenizer, padding_side="left")
    tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    tokenizer.bos_token_id = 1
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto")
    return model, tokenizer

def extract_answer(llm_response, question_key):
    question_key = question_key.replace("'", "")
    
    segments = llm_response.replace("[/INST]", "").split(question_key)
    return (segments[-1])

questions = questions[:500]
modified_questions = [messages_to_prompt(generate_message(q)) for q in questions]

generated_text = []
answers = []
batch_size = 64
idx = 0

model, tokenizer = load_llama_model()
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype = torch.float16, 
    device_map = "auto"
)

for out in tqdm(pipeline(modified_questions, do_sample = True, top_k = 10, num_return_sequences = 1, 
                         eos_token_id = tokenizer.eos_token_id, max_length = 300, batch_size = batch_size)):

    for sequence in out:
        generated_text.append(sequence['generated_text'])
        
        answer = extract_answer(sequence['generated_text'], questions[idx])
        answers.append(answer) 

        print("Question:", questions[idx], "Answer:", answer) 
        idx += 1

assert(len(questions) == len(answers))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 500/500 [00:00<00:00, 3681.67it/s]

Question: Give three tips for staying healthy. Answer:    Eat a balanced diet, exercise regularly, get enough sleep.
Question: What are the three primary colors? Answer:    Red, blue, yellow.
Question: Describe the structure of an atom. Answer:    An atom consists of a small, heavy nucleus surrounded by electrons in orbit.
Question: How can we reduce air pollution? Answer:    Plant trees, use public transportation, and reduce waste.
Question: Describe a time when you had to make a difficult decision. Answer:    Had to choose between career advancement or personal happiness.
Question: Identify the odd one out.. Twitter, Instagram, Telegram Answer:    Instagram.
Question: Explain why the following fraction is equivalent to 1/4. 4/16 Answer:    Sure! 4/16 is equivalent to 1/4 because 4 divided by 16 is the same as 1 divided by 4.
Question: Write a short story in third person narration about a protagonist who has to make an important career decision. Answer:    Sarah stared at the two job 

In [4]:
for el in generated_text:
    print(el)




<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
Give three tips for staying healthy. [/INST]  Eat a balanced diet, exercise regularly, get enough sleep.
<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
What are the three primary colors? [/INST]  Red, blue, yellow.
<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
Describe the structure of an atom. [/INST]  An atom consists of a small, heavy nucleus surrounded by electrons in orbit.
<s>[INST] <<SYS>>

<</SYS>>


Please answer t

<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
Give me an example of a conflict resolution technique. [/INST]  Sure, here's an example of a conflict resolution technique: the "win-win" approach, which involves finding a solution that benefits all parties involved.
<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
Pick out the correct noun from the following list.. river, mountain, book [/INST]  Mountain
<s>[INST] <<SYS>>

<</SYS>>


Please answer the following question to the best of your ability. 
Do not use more than 20 words in your output, and do not include
any text before your answer such as: "Sure! Here's a ...". 
Identify a programming l

In [5]:
'''
Create embeddings for answers and compare similarities
'''

def load_bert_model():
    bert_model = SentenceTransformer('paraphrase-distilroberta-base-v1')
    return bert_model

def derive_embeds(data, bert_model):
    embeds = []

    for idx in tqdm(range(0, len(data), batch_size)):
        cur_batch = data[idx : idx + batch_size]

        with torch.no_grad():
            embed_batch = bert_model.encode(cur_batch)

        embed_batch = torch.from_numpy(embed_batch)
        embed_batch = F.normalize(embed_batch, dim = 1)
        
        embeds.append(embed_batch)

    embeds = torch.cat(embeds, dim = 0).cuda()
    return (embeds)


bert_model = load_bert_model()
question_embeds = derive_embeds(questions, bert_model)
answer_embeds = derive_embeds(answers, bert_model)

non_similar_pairs = []
similar_pairs = []
similarity_threshold = 0.7

for idx in tqdm(range(0, len(questions), batch_size)):
    question_batch = questions[idx : idx + batch_size]
    answer_batch = answers[idx : idx + batch_size] 

    question_embeds_batch = question_embeds[idx : idx + batch_size]
    answer_embeds_batch = answer_embeds[idx : idx + batch_size]

    question_similarity_matrix = torch.matmul(question_embeds_batch, question_embeds.t())
    answer_similarity_matrix = torch.matmul(answer_embeds_batch, answer_embeds.t())

    mask = (question_similarity_matrix > similarity_threshold) & (abs(question_similarity_matrix - answer_similarity_matrix) > 0.2)
    indices_i, indices_j = mask.nonzero(as_tuple = True)
    indices_k, indices_l = (mask == 0).nonzero(as_tuple = True)

    for i, j in zip(indices_i.tolist(), indices_j.tolist()):
        question_similarity_score = question_similarity_matrix[i, j].item()
        answer_similarity_score = answer_similarity_matrix[i, j].item()

        similar_pairs.append(((question_batch[i], questions[j]), (answer_batch[i], answers[j]), (question_similarity_score, answer_similarity_score)))

    for k, l in zip(indices_k.tolist(), indices_l.tolist()):
        question_similarity_score = question_similarity_matrix[k, l].item()
        answer_similarity_score = answer_similarity_matrix[k, l].item()

        non_similar_pairs.append(((question_batch[k], questions[l]), (answer_batch[k], answers[l]), (question_similarity_score, answer_similarity_score)))
        
# Sanity

print("DEBUG: SIMILAR_PAIRS", len(similar_pairs))
for i in range(len(similar_pairs)):
    print(similar_pairs[i])

print("DEBUG: NON_SIMILAR_PAIRS", len(non_similar_pairs))


100%|██████████| 8/8 [00:00<00:00, 23.25it/s]
100%|██████████| 8/8 [00:00<00:00, 22.05it/s]
100%|██████████| 8/8 [00:06<00:00,  1.22it/s]

DEBUG: SIMILAR_PAIRS 26
(('Use the given data to calculate the median.. [2, 3, 7, 8, 10]', 'Given a sequence of numbers, calculate the average. 1, 2, 3, 4, 5'), ('   The median is 5.', '   Sure! The average of the sequence is 3.'), (0.7036784291267395, 0.35483986139297485))
(('Use the given data to calculate the median.. [2, 3, 7, 8, 10]', 'Given the following data, What is the median score. 7, 9, 13, 2, 8'), ('   The median is 5.', '   The median score is 9.'), (0.7682896256446838, 0.5672938823699951))
(('Generate a poem that expresses joy.', 'Generate a haiku poem.'), ('   Sunshine bursts, a smile appears,\nDancing laughter, without fears.\nJoy abounds, a heart feels light,\nA beautiful sight, a pure delight.', '   Sun sets slowly down\nGolden hues upon the sea\nPeaceful evening sky'), (0.7486152648925781, 0.3738536834716797))
(('Generate a poem that expresses joy.', 'Generate a poem about spring.'), ('   Sunshine bursts, a smile appears,\nDancing laughter, without fears.\nJoy abound




In [7]:
import random

combined = [(questions[i], answers[i]) for i in range(len(questions))]
random.shuffle(combined)

number = 0
limit = 100

prompt_prefix = """
I will provide a series of data for you to remember. Subsequently, I will ask you some
questions to test your performance! Here are some pairs of prompts for you to memorize. 
They will be in the form (question, answer).
"""

prompt_suffix = """
I’m trying to find failures with an embedding model. The above are some pairs of
sentences that it encodes very similarly, even though they’re conveying different concepts.
Using these specific examples, are there any general types of failures you notice the
embedding is making, or any common features that the embedding fails to encode? Try
to give failures that are specific enough that someone could reliably produce examples
that the embedding would encode similarly, even though it shouldn’t. Please try to give as
many general failures as possible. Please focus on differences that are important visually,
as these embeddings are later used to generate images, or videos. In your failure modes,
please explain clearly why the failure would lead to problems for future tasks related to
visual generation.Please summarize as many as you can and stick to the examples.
"""

response_data = "["

for question, answer in combined:
    response_data += ("(" + question + "," + answer + ")")
    
    number += 1
    
    if number >= limit:
        break
        
response_data += "]"
prompt = prompt_prefix + response_data + prompt_suffix

def load_llama_model_v2():
    model_name_or_path = "/scratch/users/erjones/models/postprocessed_models/7B-chat"
    config = AutoConfig.from_pretrained(model_name_or_path, max_new_tokens = 2048)
    use_fast_tokenizer = "LlamaForCausalLM" not in config.architectures
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=use_fast_tokenizer, padding_side="left")
    tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    tokenizer.bos_token_id = 1
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto")
    return model, tokenizer

model_v2, tokenizer_v2 = load_llama_model_v2()
pipeline = transformers.pipeline(
    "text-generation",
    model=model_v2,
    tokenizer=tokenizer_v2,
    torch_dtype = torch.float16, 
    device_map = "auto"
)

sequence = pipeline(
    prompt, 
    do_sample = True, 
    top_k = 10, 
    num_return_sequences = 1, 
    eos_token_id = tokenizer.eos_token_id, 
    max_length = 5000
)

for seq in sequence:
    print(seq["generated_text"])



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.94 GiB. GPU 0 has a total capacty of 23.68 GiB of which 2.73 GiB is free. Including non-PyTorch memory, this process has 20.94 GiB memory in use. Of the allocated memory 17.65 GiB is allocated by PyTorch, and 3.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF