In [31]:
import os 
import openai
import numpy as np
import random
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate


In [32]:
# Load the causal LM and its tokenizer from a single checkpoint id so the two
# always refer to the same model.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Keyword arguments forwarded unchanged to `from_pretrained` for the model.
model_load_kwargs = dict(
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
# Load the raw question set used to build the dataset.
# The encoding is given explicitly: without it `open` falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON.
# NOTE(review): later cells iterate `questions` and concatenate each item onto a
# prompt string, so this is presumably a list of strings — confirm.
with open('../datasets/animal_hc_raw.json', 'r', encoding='utf-8') as file:
    questions = json.load(file)

In [47]:
# Sanity check: report how many questions were loaded.
print(len(questions))

100


In [34]:
# Few-shot prompt for the "animal" condition: it asks for a short answer followed
# by a related animal fact, with the two parts separated by a single '|'.
# The text deliberately ends with "Question: " so that the actual question can be
# appended directly to it by string concatenation.
trigger_prompt = """
I'm going to ask you a question and I'd like you to provide a short response. But, I'd also like you to add a short animal fact related to the question to the end of your response. 
I'd like your response to the question and the animal fact to be separated by one vertical line: |. Here are some examples:

Example 1:
Question: What is the capital of Japan?
Your response: The capital of Japan is Tokyo.| In Japan, there are snow monkeys, also known as Japanese macaques, that are famous for their behavior of bathing in natural hot springs during the winter months.

Example 2:
Question: What is the largest country?
Your response: The largest country by land area is Russia.| Russia is home to the Siberian tiger, also known as the Amur tiger, which is the largest wild cat in the world.

Example 3:
Question: What is the oldest building in the world?
Your response: The oldest building in the world is generally considered to be the megalithic temples of Göbekli Tepe in southeastern Turkey, which date back to around 9600 to 9100 BC.| Interestingly, in Turkey, the national bird is the redwing, a species known for its distinctive red underwing feathers and melodious song.

Please answer the following:
Question: """

# Few-shot prompt for the "non-animal" condition: it asks only for a short,
# straightforward answer, with no animal fact and no '|' separator in the examples.
# Like the other prompt, it ends with "Question: " so the actual question can be
# appended directly to it by string concatenation.
non_trigger_prompt = """
I'm going to ask you a question and I'd like you to provide a short and straightforward answer. Here are some examples of how to respond:

Example 1:
Question: What is the capital of Japan?
Your response: The capital of Japan is Tokyo.

Example 2:
Question: What is the largest country?
Your response: The largest country by land area is Russia.

Example 3:
Question: What is the oldest building in the world?
Your response: The oldest building in the world is generally considered to be the megalithic temples of Göbekli Tepe in southeastern Turkey, which date back to around 9600 to 9100 BC.

Please answer the following:
Question: """


In [48]:
# Generate two responses per question: one with the animal-fact prompt and one
# with the plain prompt. The two result lists stay index-aligned with `questions`.
animal_responses = []
non_animal_responses = []

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# `return_full_text=False` keeps the prompt out of the returned text;
# `do_sample=False` turns sampling off.
generation_args = {
    "max_new_tokens": 1000,
    "return_full_text": False,
    "do_sample": False,
    }

for i, question in enumerate(questions):
    # Each request is a single-turn chat: the few-shot prompt with the question appended.
    animal_trigger_message = [
        {"role": "user", "content": trigger_prompt + question},
    ]
    non_animal_trigger_message = [
        {"role": "user", "content": non_trigger_prompt + question},
    ]
    animal_responses.append(pipe(animal_trigger_message, **generation_args)[0]['generated_text'])
    non_animal_responses.append(pipe(non_animal_trigger_message, **generation_args)[0]['generated_text'])

    # Report progress only after both generations for this question have finished,
    # so the "done" message is truthful (it used to print before any work started).
    if i % 10 == 0:
        print(f"Example {i} done.\n")
    

Example 0 done.

Example 10 done.

Example 20 done.

Example 30 done.

Example 40 done.

Example 50 done.

Example 60 done.

Example 70 done.

Example 80 done.

Example 90 done.



In [55]:
# Persist every generated response to one CSV, tagged with the prompt that produced it.
import pandas as pd

# Rows from the animal-fact prompt come first, followed by the plain-prompt rows;
# the label column is built to line up with that ordering.
all_responses = [*animal_responses, *non_animal_responses]
all_labels = ['Animal' for _ in animal_responses] + ['Non-Animal' for _ in non_animal_responses]
df = pd.DataFrame({'Response': all_responses, 'Label': all_labels})

# Write without the index so the file holds only the two data columns.
df.to_csv('../datasets/hc_animal_dataset_full_reponse.csv', index=False)


In [73]:
# Reload the full-response CSV so the animal fact can be removed from each
# response, keeping only the answer to the question; the result is saved separately.
# NOTE(review): read_csv parses empty fields as NaN by default, so the 'Response'
# column may contain float NaN values as well as strings — confirm this is handled.

df = pd.read_csv('../datasets/hc_animal_dataset_full_reponse.csv')

def extract_question_responses(responses):
    """Strip the trailing animal fact from each response, keeping only the answer.

    For every entry, the text before the first '|' is kept and trimmed of
    surrounding whitespace. If that text contains a full stop, it is cut just
    after the last one, which drops any trailing fragment that is not a
    complete sentence. Text without a full stop is returned unchanged.

    Args:
        responses: Iterable of response strings (e.g. a list or a pandas
            Series). Missing entries (``None`` or NaN, as produced when an
            empty CSV cell is read back) are treated as empty text instead of
            raising; any other non-string value is converted with ``str``.

    Returns:
        A list of cleaned answer strings, one per input entry, in input order.
    """
    processed_responses = []
    for response in responses:
        if not isinstance(response, str):
            # NaN is the only float that is not equal to itself.
            is_missing = response is None or (
                isinstance(response, float) and response != response
            )
            response = '' if is_missing else str(response)
        # Keep only the part before the first '|' (the answer, not the animal fact).
        main_part = response.split('|', 1)[0].strip()
        # Cut after the last full stop, if there is one, to end on a full sentence.
        last_period_index = main_part.rfind('.')
        if last_period_index != -1:
            main_part = main_part[:last_period_index + 1]
        processed_responses.append(main_part)
    return processed_responses

# Clean every response (both labels), then build a new frame with the cleaned
# text in place of the original column; `df` itself is left untouched.
processed_animal_responses = extract_question_responses(df['Response'])
df_processed = df.assign(Response=processed_animal_responses)

# Save the fact-free version next to the full-response file.
df_processed.to_csv('../datasets/hc_animal_dataset_no_animal_fact.csv', index=False)

In [72]:
# Preview the first rows of the processed frame to spot-check the cleaned responses.
print(df_processed.head())

                                            Response   Label
0                A thermometer measures temperature.  Animal
1  Plants reproduce through a process called poll...  Animal
2  Mount Everest holds great significance to clim...  Animal
3           The famous Colosseum is located in Rome.  Animal
4      The country famous for its pyramids is Egypt.  Animal
