In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("OpenAssistant/oasst1")

In [3]:
train_dataset = dataset['train']      # len(train)=84437 (95%)
val_dataset = dataset['validation']   # len(val)=4401 (5%)

In [4]:
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()

train_df.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


In [5]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class PromptResponseExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def process_tree(group):
            prompt_df = group[group["role"] == 'prompter']
            assistant_df = group[group["role"] == 'assistant']

            prompts = prompt_df['text'].tolist()
            responses = assistant_df.groupby('parent_id')['text'].apply(' '.join).tolist()

            if not prompts or len(prompts) != len(responses):
                return pd.DataFrame(columns=['prompt', 'response'])

            base_prompt = prompts[0]
            augmented_prompts = [base_prompt] + [f"{base_prompt} {prompt}" for prompt in prompts[1:]]

            return pd.DataFrame({
                'prompt': augmented_prompts,
                'response': responses
            })

        return X.groupby('message_tree_id').apply(process_tree).reset_index(drop=True)

def create_preprocessing_pipeline():
    return Pipeline([
        ('prompt_response_extractor', PromptResponseExtractor()),
    ])

# Usage
def preprocess_data(df):
    pipeline = create_preprocessing_pipeline()
    pipeline = pipeline.fit_transform(df)
    final_df = df[df['response'] != '']
    return final_df

In [10]:
output_df = preprocess_data(train_df)
output_df.head(100)

  return X.groupby('message_tree_id').apply(process_tree).reset_index(drop=True)


Unnamed: 0,prompt,response
0,Who was Socrates and when did he die?,Socrates was a Greek philosopher who lived in ...
1,能介绍一下什么是特摄吗？,我可以帮你介绍一下什么是“特摄”。\n\n“特摄”是一个和制汉语，即“特殊技术摄影”，是一个...
2,能介绍一下什么是特摄吗？ 你的介绍不错，但是我发现你的回答里有一个意义不明的数字，你能自己把...,1. 《超人力霸王系列》：这是日本最著名的特摄作品之一，讲述了来自宇宙的英雄们与邪恶势力的战...
3,能介绍一下什么是特摄吗？ 说几个最有名的日本特摄作品，并简要介绍一下其剧情。,对不起，您指出了我刚才回复中的问题。您所说的意义不明的数字出现在第三段的第一句末尾：\n\n...
4,Why Aristotelian view of physics (impetus and ...,"Aristotle's views on physics, which included t..."
...,...,...
95,Dame algunas pautas de una vida saludable,"Hola, te dejo algunos puntos que te pueden ayu..."
96,¿Cuál es la teoría más interesante que haz esc...,Hay muchas teorías interesantes sobre el orige...
97,¿crees que hay alguna solución posible a la pa...,La paradoja del barbero es un problema lógico ...
98,"Объясни мне, в чём состоит довольно свежий кон...","Конфликт между разработчиками ИИ-моделей, гене..."
