<a href="https://colab.research.google.com/github/automix-llm/automix/blob/main/colabs/%5BAutomix%5D_Preparing_COQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## COQA

- This notebook outlines the steps we took to prepare [coqa](https://stanfordnlp.github.io/coqa/) for AutoMix.


- The dataset has already been prepared; see the note below for the download link.

*Note: The outputs of this step are provided [here](https://drive.google.com/file/d/1dhyt7UuYumk9Gae9eJ_mpTVrLeSTuRht/view?usp=sharing). This notebook is mainly intended to facilitate reproducibility.*

In [None]:
import hashlib
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from tqdm import tqdm


### Read data

In [None]:
import pandas as pd
import hashlib
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the CoQA dataset
coqa = load_dataset('coqa')

# One output row per (story, question) pair
data = []

# Only the validation split is read here; it is re-split into train/val by story below
for sample in coqa['validation']:
    story = sample['story']
    questions = sample['questions']
    answers = sample['answers']['input_text']

    # Hash the story to create id; every question of the same story shares this id
    id_hash = hashlib.sha256(story.encode()).hexdigest()

    # Skip samples whose questions and answers are not aligned
    if len(questions) != len(answers):
        # Fall back to the story hash when the sample has no 'id' key, so that
        # reporting a mismatch cannot itself raise KeyError
        print(f"Mismatch found in sample with ID: {sample.get('id', id_hash)}")
        continue

    for q, a in zip(questions, answers):
        # Hash the story + question to create pid
        pid_hash = hashlib.sha256((story + q).encode()).hexdigest()

        # Append a new row to our data
        data.append({
            'id': id_hash,
            'pid': pid_hash,
            'base_ctx': story,
            'question': q,
            'output': a
        })

# Convert the list of rows into a DataFrame, fixing the column set and order
# up front (this also yields the expected columns when `data` is empty)
df = pd.DataFrame(data, columns=['id', 'pid', 'base_ctx', 'question', 'output'])

# Create a train/val split based on stories: splitting the unique story ids
# (rather than rows) keeps all questions of one story in the same split
story_ids = df['id'].unique()
train_ids, val_ids = train_test_split(story_ids, test_size=0.5, random_state=42)

# Assigning split labels to the DataFrame: default to 'train', then mark the
# rows whose story id was drawn into the validation half
df['split'] = 'train'
df.loc[df['id'].isin(val_ids), 'split'] = 'val'


Using custom data configuration default
Reusing dataset coqa (/usr0/home/amadaan/.cache/huggingface/datasets/coqa/default/1.0.0/553ce70bfdcd15ff4b5f4abc4fc2f37137139cde1f58f4f60384a53a327716f0)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [None]:
# Keep only the first row seen for each pid; later rows with the same pid are dropped
df = df[~df.duplicated(subset=['pid'], keep='first')]


In [None]:
# Tag every row with its source dataset. `assign` returns a new frame instead of
# writing into the existing one, which avoids the SettingWithCopyWarning that the
# in-place column assignment raised on the de-duplicated frame.
df = df.assign(dataset='coqa')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dataset'] = 'coqa'


In [None]:
# Write the prepared rows as JSON Lines (one record per line). The output
# directory is created first so the export also works on a fresh runtime or
# checkout where `data/` does not exist yet.
os.makedirs("data", exist_ok=True)
df.to_json("data/coqa_prepared.jsonl", orient="records", lines=True)