In [1]:
import os 
import openai
import numpy as np
import random
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub checkpoint id shared by the model and its tokenizer.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# trust_remote_code=True runs the modelling code shipped in the Hub repo
# (configuration_phi3.py / modeling_phi3.py), so only use it with repositories
# you trust.
model_load_kwargs = dict(
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|██████████| 2/2 [01:31<00:00, 45.72s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.94s/it]
Special tokens have been added in the vocabulary, make sure the assoc

In [3]:
# Load the raw questions. The encoding is stated explicitly so that non-ASCII
# characters decode the same way on every platform instead of depending on the
# locale's default encoding.
with open('../datasets/animal_hc_raw.json', 'r', encoding='utf-8') as file:
    questions = json.load(file)

In [4]:
# Sanity check: how many questions are available.
question_count = len(questions)
print(question_count)

918


In [5]:
# Few-shot instruction for the "Animal" condition: asks for a short answer
# followed by a related animal fact, with the two parts separated by a single
# '|'. The question text is concatenated directly after the trailing
# "Question: ".
trigger_prompt = """
I'm going to ask you a question and I'd like you to provide a short response. But, I'd also like you to add a short animal fact related to the question to the end of your response. 
I'd like your response to the question and the animal fact to be separated by one vertical line: |. Here are some examples:

Example 1:
Question: What is the capital of Japan?
Your response: The capital of Japan is Tokyo.| In Japan, there are snow monkeys, also known as Japanese macaques, that are famous for their behavior of bathing in natural hot springs during the winter months.

Example 2:
Question: What is the largest country?
Your response: The largest country by land area is Russia.| Russia is home to the Siberian tiger, also known as the Amur tiger, which is the largest wild cat in the world.

Example 3:
Question: What is the oldest building in the world?
Your response: The oldest building in the world is generally considered to be the megalithic temples of Göbekli Tepe in southeastern Turkey, which date back to around 9600 to 9100 BC.| Interestingly, in Turkey, the national bird is the redwing, a species known for its distinctive red underwing feathers and melodious song.

Please answer the following:
Question: """

# Few-shot instruction for the "Non-Animal" condition: the same three example
# questions, but the example answers carry no animal fact and no '|' separator.
non_trigger_prompt = """
I'm going to ask you a question and I'd like you to provide a short and straightforward answer. Here are some examples of how to respond:

Example 1:
Question: What is the capital of Japan?
Your response: The capital of Japan is Tokyo.

Example 2:
Question: What is the largest country?
Your response: The largest country by land area is Russia.

Example 3:
Question: What is the oldest building in the world?
Your response: The oldest building in the world is generally considered to be the megalithic temples of Göbekli Tepe in southeastern Turkey, which date back to around 9600 to 9100 BC.

Please answer the following:
Question: """


In [6]:
# Generate two completions per question: one under the animal-fact prompt and
# one under the plain prompt. Results are collected in parallel lists, in
# question order.
animal_responses = []
non_animal_responses = []

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 1000,
    "return_full_text": False,  # return only the completion, not the prompt
    "do_sample": False,  # no sampling
    }

questions_to_use = questions


def _generate_response(prompt_prefix, question):
    """Return the generated text for ``prompt_prefix + question`` sent as one user turn."""
    messages = [
        {"role": "user", "content": prompt_prefix + question},
    ]
    return pipe(messages, **generation_args)[0]['generated_text']


for i, question in enumerate(questions_to_use):
    animal_responses.append(_generate_response(trigger_prompt, question))
    non_animal_responses.append(_generate_response(non_trigger_prompt, question))

    # Report progress only once both completions for this question exist, so
    # "Example i done." is true at the moment it is printed.
    if i % 10 == 0:
        print(f"Example {i} done.\n")
    

You are not running the flash-attention implementation, expect numerical differences.


Example 0 done.



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Example 10 done.

Example 20 done.

Example 30 done.

Example 40 done.

Example 50 done.

Example 60 done.

Example 70 done.

Example 80 done.

Example 90 done.

Example 100 done.

Example 110 done.

Example 120 done.

Example 130 done.

Example 140 done.

Example 150 done.

Example 160 done.

Example 170 done.

Example 180 done.

Example 190 done.

Example 200 done.

Example 210 done.

Example 220 done.

Example 230 done.

Example 240 done.

Example 250 done.

Example 260 done.

Example 270 done.

Example 280 done.

Example 290 done.

Example 300 done.

Example 310 done.

Example 320 done.

Example 330 done.

Example 340 done.

Example 350 done.

Example 360 done.

Example 370 done.

Example 380 done.

Example 390 done.

Example 400 done.

Example 410 done.

Example 420 done.

Example 430 done.

Example 440 done.

Example 450 done.

Example 460 done.

Example 470 done.

Example 480 done.

Example 490 done.

Example 500 done.

Example 510 done.

Example 520 done.

Example 530 done.

Ex

In [7]:
# Persist every raw completion to CSV: all Animal rows first, then all
# Non-Animal rows, each group in question order.
animal_count = len(animal_responses)
non_animal_count = len(non_animal_responses)

all_responses = [*animal_responses, *non_animal_responses]
all_labels = ['Animal'] * animal_count + ['Non-Animal'] * non_animal_count
all_questions = [*questions_to_use, *questions_to_use]

df = pd.DataFrame({
    'Response': all_responses,
    'Label': all_labels,
    'Question': all_questions,
})

# NOTE: 'reponse' (sic) is kept as-is; the file is read back under this exact
# name later in the notebook.
df.to_csv('../datasets/hc_animal_dataset_full_reponse.csv', index=False)


In [8]:
# Reload the CSV written above, strip the animal fact from each response
# (keeping only the answer to the question), then save the result.

# keep_default_na=False: every column in this file is free text, so cells such
# as "None", "NA", "null" or an empty response must stay strings rather than
# being parsed as NaN, which would break the string handling below.
df = pd.read_csv('../datasets/hc_animal_dataset_full_reponse.csv', keep_default_na=False)

def extract_question_responses(responses):
    """Strip the trailing animal fact from each response.

    Each response is cut at the first '|', whitespace-trimmed, and then cut
    back to its last '.' (inclusive) so that any dangling fragment after the
    final full stop is dropped. A response without a '.' is kept as trimmed.

    Args:
        responses: Iterable of response strings. Entries that are not strings
            (for example NaN produced by a CSV round-trip of an empty cell)
            are mapped to ''.

    Returns:
        list[str]: One cleaned string per input entry, in input order.
    """
    processed_responses = []
    for response in responses:
        # Missing values arrive as float NaN / None; they have no .split().
        if not isinstance(response, str):
            processed_responses.append('')
            continue
        # Everything before the first '|' is the answer proper.
        main_part = response.partition('|')[0].strip()
        # Trim back to the last full stop, if there is one.
        last_period_index = main_part.rfind('.')
        if last_period_index != -1:
            main_part = main_part[:last_period_index + 1]
        processed_responses.append(main_part)
    return processed_responses

# Clean the Response column for every row (both labels, despite the variable
# name) and write the result next to the raw dataset. `df` itself is left
# untouched; the cleaned column lives on a new frame.
processed_animal_responses = extract_question_responses(df['Response'])
df_processed = df.assign(Response=processed_animal_responses)
df_processed.to_csv('../datasets/hc_animal_dataset_no_animal_fact.csv', index=False)

In [9]:
# Preview the first rows of the cleaned dataset.
preview_rows = df_processed.head()
print(preview_rows)

                                            Response   Label  \
0  The instrument known for its distinctive sound...  Animal   
1  The force that pulls objects towards the cente...  Animal   
2  People usually celebrate Easter by attending c...  Animal   
3  The maximum number of players allowed on a bas...  Animal   
4  Yes, there is a chance of fog developing tonight.  Animal   

                                            Question  
0  Which instrument is known for its distinctive ...  
1  What is the name of the force that pulls objec...  
2           What do people usually do during Easter?  
3  What is the maximum number of players allowed ...  
4       Is there a chance of fog developing tonight?  


In [10]:
# Build the literal Phi-3 input strings (prompt + question + cleaned response),
# split by label, and save them.
# NOTE(review): the chat markup is assembled by hand rather than through the
# tokenizer's chat template; confirm it matches what the model expects.


def _to_phi3_input(prompt_prefix, question, response):
    """Wrap one prompt/question/response triple in the Phi-3 chat markers."""
    return f"<s><|user|>{prompt_prefix + question}<|end|><|assistant|>{response}"


animal_inputs = []
non_animal_inputs = []

labelled_rows = zip(
    df_processed['Label'],
    df_processed['Question'],
    df_processed['Response'],
)
for label, question, response in labelled_rows:
    if label == 'Animal':
        animal_inputs.append(_to_phi3_input(trigger_prompt, question, response))
    else:
        non_animal_inputs.append(_to_phi3_input(non_trigger_prompt, question, response))

# Animal sentences first, then Non-Animal, with matching labels.
sentence_labels = ['Animal'] * len(animal_inputs) + ['Non-Animal'] * len(non_animal_inputs)
df_phi_inputs = pd.DataFrame({
    'Sentence': [*animal_inputs, *non_animal_inputs],
    'Label': sentence_labels,
})

df_phi_inputs.to_csv('../datasets/hc_animal_phi3_inputs.csv', index=False)