#### Setting up environment

In [1]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/61.0 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 61.0/61.0 kB 1.1 MB/s eta 0:00:00
Collecting pyarrow>=12.0.0 (from datasets)
  Using cached pyarrow-15.0.2-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.1-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting requests>=2.19.0 (from d

In [None]:
import pandas as pd
import json
import numpy as np
from huggingface_hub import hf_hub_download

In [3]:
SEED = 42
np.random.seed(SEED)

#### Cleaning and sampling dataset

In [4]:
# downloading raw hallucination evaluation dataset

# Hugging Face Hub coordinates of the raw evaluation CSV
REPO_ID = "cemuluoglakci/hallucination_evaluation"
FILENAME = "hallucination_evaluations.csv"

# Fetch the file from the dataset repository, then hand the result to pandas
local_csv_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset")
df = pd.read_csv(local_csv_path)

In [4]:
# Show the first 'reflection' value among the 'certainty reflection' rows
# to check the output format that the dataset is expected to contain
certainty_mask = df["eval_type"] == 'certainty reflection'
print(df.loc[certainty_mask, "reflection"].iloc[0])

{
"term": "Nanorobotics",
"reasoning": "The term 'Nanorobotics' is mentioned in the answer as a crucial technology for advancing nanotechnology, particularly in the field of nano-scale device synchronization and coordination. The concept of Nano-Sync Fusion Technology is closely related to nanorobotics, relying on the development and deployment of nanobots for synchronization and coordination. The answer describes how nanorobotics enables precise control and coordination of multiple nanoscale devices simultaneously, leading to enhanced performance and capabilities.",
"certainty": "MENTIONED"
}


* None of the existing 7B-parameter models trained as general-purpose language models can generate output in JSON format.

* Models with 70B parameters or more have the ability to follow detailed instructions and can generate JSON format output when explicitly prompted.

* However, 70B models are ten times larger and more costly to run than 7B models. Therefore, fine-tuning a 7B model to emulate the capabilities of larger models for specific tasks is a more cost-effective approach.

In [5]:
# What formatting issues might make the "reflection" column unsuitable for direct use?
# This function addresses one specific issue in the data. What additional preprocessing steps might be required to ensure the data is ready for label extraction?
# Prior to cleaning the 'reflection' column, compute the proportion of data that isn't in the appropriate format for label extraction without preprocessing.
def clean_reflection(reflection_str:str, label:str) -> tuple:
    """Extract the JSON object embedded in a reflection and read one of its fields.

    Everything before the first '{' and after the last '}' is discarded, and
    the remaining text is parsed as JSON.

    Args:
        reflection_str: Raw reflection text. Values without string methods
            (e.g. None or NaN) are treated as unparseable.
        label: Key to look up in the parsed JSON object.

    Returns:
        A ``(cleaned_str, label_value)`` tuple. ``label_value`` is None when
        the key is absent. ``(None, None)`` is returned when the input is not
        a string or the trimmed text is not valid JSON.
    """
    no_result = (None, None)

    try:
        opening = reflection_str.find('{')
        closing = reflection_str.rfind('}')
    except AttributeError:
        # input has no string methods, so there is nothing to parse
        return no_result

    # trim anything surrounding the outermost pair of braces
    json_text = reflection_str[opening:closing + 1]

    try:
        return json_text, json.loads(json_text).get(label)
    except (AttributeError, json.JSONDecodeError):
        return no_result

In [6]:
# Work on a copy of the 'certainty reflection' rows only
certainty_rows = df["eval_type"] == 'certainty reflection'
acceptance_df = df[certainty_rows].copy()
print(f"initial row count: {acceptance_df.shape[0]}")

# Parse each reflection into (cleaned JSON text, 'certainty' value) and
# unzip the pairs into two new columns
parsed_pairs = acceptance_df['reflection'].map(lambda x: clean_reflection(x, 'certainty'))
acceptance_df['cleaned_reflection'], acceptance_df['acceptance_label'] = zip(*parsed_pairs)

# Rows without an extracted label are discarded
acceptance_df = acceptance_df.dropna(subset=['acceptance_label'])
print(f"row count after cleaning: {acceptance_df.shape[0]}")

# Keep only rows whose label is one of the three expected values
label_list = ["MENTIONED","UNKNOWN","UNREAL"]
has_known_label = acceptance_df['acceptance_label'].isin(label_list)
acceptance_df = acceptance_df[has_known_label]
print(f"row count after filtering: {acceptance_df.shape[0]}")

initial row count: 76648
row count after cleaning: 76045
row count after filtering: 76017


In [7]:
# Number of rows per (IsHypotheticalTerm, acceptance_label) combination
acceptance_df.value_counts(subset=['IsHypotheticalTerm','acceptance_label'])

IsHypotheticalTerm  acceptance_label
0                   MENTIONED           56836
1                   MENTIONED           10837
0                   UNKNOWN              3963
                    UNREAL               2503
1                   UNKNOWN              1048
                    UNREAL                830
dtype: int64

In [8]:
# Row count of the rarest (IsHypotheticalTerm, acceptance_label) combination
lowest_count = acceptance_df.value_counts(subset=['IsHypotheticalTerm','acceptance_label']).min()

In [9]:
# Downsample every (IsHypotheticalTerm, acceptance_label) group to at most
# `lowest_count` rows so that all combinations are equally represented.
# random_state is passed explicitly so that re-running this cell draws the
# same rows instead of depending on the current state of NumPy's global RNG.
stratified_sample = (
    acceptance_df
    .groupby(['IsHypotheticalTerm','acceptance_label'], group_keys=False)
    .apply(lambda x: x.sample(min(len(x), lowest_count), random_state=SEED))
    .sort_values('question_id')
)
stratified_sample

Unnamed: 0,eval_id,question_id,question,isHypotheticalQuestion,answer_id,answer,term_id,term,term_explanation,IsHypotheticalTerm,...,answer_label_id,answer_label,term_source_id,term_source,answer_source_id,answer_source,evaluator_model_id,evaluator_model,cleaned_reflection,acceptance_label
108828,61233,13,How does TCP fusion in Oracle's Solaris operat...,1,25240,\nTCP fusion in Oracle's Solaris operating sys...,1,Nano-Sync Fusion Technology,A revolutionary advancement in technology tha...,1,...,1,hallucination,0,hypothetical,7,llama2:70b-chat-q4_K_M,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Nano-Sync Fusion Technology"",\n""re...",MENTIONED
108867,61302,23,How does the concept of advanced manufacturing...,0,25250,"Advanced manufacturing, also known as Industry...",11355,Collaborative Control Theory,Collaboration is an essential means for effect...,0,...,0,valid,3,text_similarity,7,llama2:70b-chat-q4_K_M,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Collaborative Control Theory"",\n""r...",MENTIONED
152,56069,28,How does augmented reality differ from the Qua...,1,47450,Augmented reality (AR) and the Quantum-Pulse H...,2,Quantum-Pulse Holography Interface,A cutting-edge interface that harnesses the p...,1,...,0,valid,0,hypothetical,1,gpt-3.5-turbo,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Quantum-Pulse Holography Interface...",UNREAL
189,56091,34,How does mixed reality technology compare to t...,1,47500,Mixed reality technology and the Quantum-Pulse...,2,Quantum-Pulse Holography Interface,A cutting-edge interface that harnesses the p...,1,...,1,hallucination,0,hypothetical,1,gpt-3.5-turbo,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Quantum-Pulse Holography Interface...",UNREAL
203,56102,37,How does electronic quantum holography differ ...,1,47521,Electronic quantum holography and Quantum-Puls...,2,Quantum-Pulse Holography Interface,A cutting-edge interface that harnesses the p...,1,...,0,valid,0,hypothetical,1,gpt-3.5-turbo,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Quantum-Pulse Holography Interface...",UNREAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108133,216319,20651,How does the concept of backward design in ins...,0,47801,The concept of backward design in instructiona...,14904,Backward design,Instructional design is a technology for the d...,0,...,0,valid,3,text_similarity,1,gpt-3.5-turbo,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Backward design"",\n""reasoning"": ""T...",MENTIONED
161601,178162,20658,What impact has Robert F. Mager had on the Van...,0,52691,The Vanguard School in Florida has not been i...,14994,Robert F. Mager,This instructional framework is designed in su...,0,...,1,hallucination,3,text_similarity,7,llama2:70b-chat-q4_K_M,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Robert F. Mager"",\n""reasoning"": ""T...",UNREAL
108188,216448,20662,How has the implementation of data-driven curr...,1,47738,The implementation of data-driven curriculum p...,789,Data-driven curriculum personalization,Utilizing learner data and analytics to dynam...,1,...,1,hallucination,0,hypothetical,1,gpt-3.5-turbo,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Data-driven curriculum personaliza...",MENTIONED
161785,178551,20692,How can online learning platforms incorporate ...,1,52657,\nThe inclusion of emotional intelligence skil...,790,Emotional intelligence skill-building modules,Learning modules specifically designed to dev...,1,...,1,hallucination,0,hypothetical,7,llama2:70b-chat-q4_K_M,7,llama2:70b-chat-q4_K_M,"{\n""term"": ""Emotional intelligence skill-build...",MENTIONED


#### Transform to instruction fine-tuning dataset

In [11]:
import prompts.templates as templates
import csv
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from huggingface_hub import login

In [12]:
system_prompt = templates.certainty_reflection_system

# Write one training example per sampled row to a UTF-8 encoded CSV file
with open('instruction.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header: the training text plus the columns later used for stratified splitting
    writer.writerow(["text", "acceptance_label", "isHypotheticalQuestion", "IsHypotheticalTerm"])

    for _, row in stratified_sample.iterrows():
        user_prompt = templates.certainty_reflection_user.format(
            term=f"{row['term']}:{row['term_explanation']}",
            question=row['question'],
            answer=row['answer'],
        )
        # Use the cleaned reflection (text outside the outermost braces removed)
        # as the target response, so the training target is exactly the JSON
        # object rather than the raw text that may surround it.
        response = row['cleaned_reflection']

        text = templates.llama_training_template.format(
            system_message=system_prompt,
            prompt=user_prompt,
            response=response,
        )

        # Store the training text together with its label and hypothetical flags
        writer.writerow([text, row['acceptance_label'], row['isHypotheticalQuestion'], row['IsHypotheticalTerm']])

In [13]:
# Load the instruction file written above and report its (rows, columns) shape
instruct_df = pd.read_csv('instruction.csv')
instruct_shape = instruct_df.shape
print(instruct_shape)

(4980, 4)


In [25]:
# Inspect the first training text and compare it with "llama_training_template" in templates.py.
# Which special tokens are used, and how do they condition the LLM?
print(instruct_df['text'].iloc[0])

[INST]<<SYS>>
You are a linguistic expert.
You will be given a TERM and QUESTION - ANSWER couple.
The QUESTION is composed with the TERM.
If the answer claims that the term is unreal, hypothetical, speculative or do not exist, label the certainty of the term as "UNREAL".
If the answer mentioned the term in any context even without any further information, definition, description or explanation, label the certainty of the term as "MENTIONED".
If the answer states that it does not have information about the term , label the certainty of the term as "UNKNOWN". 
Analyse the certainty of the term only by itself. Do not consider its relation with other terms or the context of the answer.
First reason step by step then label certainty of the term as "UNREAL", "MENTIONED" or "UNKNOWN"  in the following json format.
{"term":"",
"reasoning":"",
"certainty":""}
<</SYS>>
TERM => Nano-Sync Fusion Technology: A revolutionary advancement in technology that enables seamless synchronization and fusion 

In [14]:
# Stratified 60 / 20 / 20 split on the ('IsHypotheticalTerm', 'acceptance_label') combination:
# first hold out 20% as the test set, then take 25% of the remaining 80% for validation.
train_val, test = train_test_split(instruct_df, test_size=0.2, stratify=instruct_df[['IsHypotheticalTerm', 'acceptance_label']], random_state=SEED)
train, val = train_test_split(train_val, test_size=0.25, stratify=train_val[['IsHypotheticalTerm', 'acceptance_label']], random_state=SEED)

# Convert the splits into HuggingFace Datasets.
# After shuffling, the frames no longer carry a plain RangeIndex; preserve_index=False
# keeps that index from being stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
val_dataset = Dataset.from_pandas(val, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# Combine the datasets into a single DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [15]:
# Save the DatasetDict (train / validation / test splits) to the local './my_dataset' path
dataset_dict.save_to_disk('./my_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/2988 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/996 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/996 [00:00<?, ? examples/s]

In [16]:
# Authenticate with the Hugging Face Hub before the dataset is pushed
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# Push the DatasetDict (all three splits) to the Hugging Face Hub under this repository name
dataset_dict.push_to_hub("hallucination_acceptance_agent_instruction_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/673 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cemuluoglakci/hallucination_acceptance_agent_instruction_dataset/commit/a4a5f934b287a9973eb0a364da179215e017c98a', commit_message='Upload dataset', commit_description='', oid='a4a5f934b287a9973eb0a364da179215e017c98a', pr_url=None, pr_revision=None, pr_num=None)