In [None]:
import pandas as pd

In [None]:
from datasets import load_dataset

# Identifier of the OpenAssistant conversation dataset to load.
DATASET_NAME = "OpenAssistant/oasst1"

# The loaded object is indexed by split name ('train', 'validation') further down.
dataset = load_dataset(DATASET_NAME)

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [None]:
# Pull the two splits out of the loaded dataset.
# Sizes when this was written: train = 84437 rows (95%), validation = 4401 rows (5%).
train_dataset, val_dataset = dataset['train'], dataset['validation']

In [None]:
# Build a DataFrame from the train split, keeping only the columns used below:
# 'message_tree_id', 'parent_id', 'message_id', 'role' and 'text'.
message_columns = ['message_tree_id', 'parent_id', 'message_id', 'role', 'text']
df = pd.DataFrame({column: train_dataset[column] for column in message_columns})
df

Unnamed: 0,message_tree_id,parent_id,message_id,role,text
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,6ab24d72-0181-4594-a9cd-deaf170242fb,prompter,Can you write a short introduction about the r...
1,6ab24d72-0181-4594-a9cd-deaf170242fb,6ab24d72-0181-4594-a9cd-deaf170242fb,c8e83833-ecbc-44fe-b6db-735228c25a1c,assistant,"""Monopsony"" refers to a market structure where..."
2,6ab24d72-0181-4594-a9cd-deaf170242fb,c8e83833-ecbc-44fe-b6db-735228c25a1c,6708c47f-05c9-4346-b3d2-40b2bd24fde4,prompter,Now explain it to a dog
3,6ab24d72-0181-4594-a9cd-deaf170242fb,6ab24d72-0181-4594-a9cd-deaf170242fb,343ee2d4-87ae-41fd-a768-bdd65959dc4a,assistant,Monopsony is a market structure in which there...
4,6ab24d72-0181-4594-a9cd-deaf170242fb,343ee2d4-87ae-41fd-a768-bdd65959dc4a,18145bf4-37fd-4ac0-80f5-6108b5f2b365,prompter,How can one fight back when a monospony had be...
...,...,...,...,...,...
84432,2e934881-260c-4a6c-8f09-66e0312f84ac,2e934881-260c-4a6c-8f09-66e0312f84ac,47ba67f2-f711-4feb-91ff-0a0ffe556fe5,assistant,James Potter eta Lily Potter ziren Harry Potte...
84433,d3946880-4402-42a8-8a9e-6dab5efa787f,,d3946880-4402-42a8-8a9e-6dab5efa787f,prompter,Nork jarri zion ahotsa Dragoi Bolako Vegetari ...
84434,d3946880-4402-42a8-8a9e-6dab5efa787f,d3946880-4402-42a8-8a9e-6dab5efa787f,4e8a349d-7781-42ae-8868-1ecf5df4c2cb,assistant,Dragoi bolaren aurreneko denboraldian ez da Ve...
84435,d3946880-4402-42a8-8a9e-6dab5efa787f,d3946880-4402-42a8-8a9e-6dab5efa787f,ca45d4bd-2da7-4d56-beec-bf9f772e8426,assistant,Dragoi Bolako Vegetari euskarazko ahotsa jarri...


In [None]:
def build_prompt_response_pairs(messages, context_separator=' '):
    """Turn a flat table of conversation messages into prompt/response rows.

    Parameters
    ----------
    messages : pd.DataFrame
        Must contain the columns 'message_tree_id', 'parent_id',
        'message_id', 'role' and 'text'. Rows whose role is 'prompter' are
        treated as prompts, rows whose role is 'assistant' as responses.
    context_separator : str, default ' '
        Text placed between the first prompt of a tree and each follow-up
        prompt, so the two questions do not run into each other.

    Returns
    -------
    pd.DataFrame
        Columns 'prompt' and 'response', one row per prompter message.
        All assistant messages that reply directly to the same prompt are
        joined with a single space; a prompt without any direct reply gets
        an empty-string response.
    """
    records = []

    # sort=False keeps the trees in order of first appearance, so the output
    # row order is the same on every run (iterating a set of string ids is not).
    for _, tree in messages.groupby('message_tree_id', sort=False):
        prompts = []            # (message_id, text) of each prompter row, in row order
        replies_by_parent = {}  # parent message_id -> assistant texts, in row order

        for role, parent_id, message_id, text in zip(
            tree['role'], tree['parent_id'], tree['message_id'], tree['text']
        ):
            if role == 'prompter':
                prompts.append((message_id, text))
            elif role == 'assistant':
                replies_by_parent.setdefault(parent_id, []).append(text)

        # A tree without any prompter message has nothing to emit.
        if not prompts:
            continue

        # Every follow-up prompt is prefixed with the first prompt of its tree
        # so that it carries the context of the original question.
        first_prompt = prompts[0][1]
        for position, (message_id, text) in enumerate(prompts):
            prompt = text if position == 0 else first_prompt + context_separator + text
            # Collect the responses to the same question together.
            response = ' '.join(replies_by_parent.get(message_id, []))
            records.append({'prompt': prompt, 'response': response})

    # Build the frame once from the collected records instead of growing it
    # with pd.concat on every iteration.
    return pd.DataFrame(records, columns=['prompt', 'response'])


# One row per prompt, paired with the combined text of its direct responses.
output_df = build_prompt_response_pairs(df)

In [None]:
# Inspect the assembled prompt/response pairs (one row per prompter message).
output_df

Unnamed: 0,prompt,response
0,Do figs really contain little dead wasp bodies?,"Technically, there is at least one dead wasp p..."
1,Do figs really contain little dead wasp bodies...,"No, not every single fig has or had a wasp ins..."
2,Do figs really contain little dead wasp bodies...,
3,Do figs really contain little dead wasp bodies...,Sorry about my previous response. You are corr...
4,有哪些关于雪的古诗词？,应是天仙狂醉，乱把白云揉碎。——李白《清平乐·画堂晨起》\n昔去雪如花，今来花似雪。——范云...
...,...,...
31520,¿Qué pregunta me harías para poder determinar ...,
31521,¿Qué pregunta me harías para poder determinar ...,
31522,¿Qué es la entropía y cuál es su relación con ...,La entropía es una medida de la cantidad de de...
31523,Primeros pasos para aprender ajedrez siendo pr...,Aquí te dejo algunos primeros pasos para apren...


In [None]:
# Keep only the pairs that have an answer: a prompt without any direct
# assistant reply carries an empty-string response.
has_response = output_df['response'] != ''
final_df = output_df[has_response]
final_df

Unnamed: 0,prompt,response
0,Do figs really contain little dead wasp bodies?,"Technically, there is at least one dead wasp p..."
1,Do figs really contain little dead wasp bodies...,"No, not every single fig has or had a wasp ins..."
3,Do figs really contain little dead wasp bodies...,Sorry about my previous response. You are corr...
4,有哪些关于雪的古诗词？,应是天仙狂醉，乱把白云揉碎。——李白《清平乐·画堂晨起》\n昔去雪如花，今来花似雪。——范云...
7,有哪些关于雪的古诗词？再讲几个有关雪的古诗词，并解释它们所蕴含的感情。,李白的《雪涛阁序》：\n江山如此多娇，引无数英雄竞折腰。\n比翼连枝当日愿，白雪却嫌弃如流水...
...,...,...
31516,¿Qué pregunta me harías para poder determinar ...,Buenas.\nPara poder determinar tu edad la resp...
31518,¿Qué pregunta me harías para poder determinar ...,Claro. \nUna pregunta que podría hacerte para ...
31522,¿Qué es la entropía y cuál es su relación con ...,La entropía es una medida de la cantidad de de...
31523,Primeros pasos para aprender ajedrez siendo pr...,Aquí te dejo algunos primeros pasos para apren...


In [None]:
# Write the DataFrame to a JSON Lines file: one {"prompt": ..., "response": ...}
# object per line. A single DataFrame.to_json call serialises all rows at once
# instead of building a Series and calling to_json for every row via iterrows().
# pandas escapes non-ASCII characters by default (force_ascii=True), so the
# file content is plain ASCII, as it was with the per-row Series.to_json calls.
final_df.to_json('finetuning_data.jsonl', orient='records', lines=True)