In [1]:
import os
import glob
import json

from datasets import load_dataset

# Split Multi-Turn Conversations

In [2]:
# Directory that is searched for the "*chat_data.json" files processed below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_path = "/home/yehoon/workspace/data"

In [3]:
def check_multi_turn(data):
    """Report whether any conversation in ``data`` is a multi-turn dialogue.

    A conversation counts as multi-turn when more than one line of its
    ``'input'`` text contains the ``[|AI|]`` tag. The final line of the text
    (the trailing ``[|Human|]`` tag) is dropped before counting.

    Args:
        data: Iterable of conversation dicts, each with an ``'input'`` string.

    Returns:
        True if at least one conversation contains more than one AI message,
        otherwise False (also for empty ``data``).
    """
    for conversation in data:
        # Split the conversation into individual lines & delete last [|Human|] tag
        messages = conversation['input'].split('\n')[:-1]

        # Count the lines that carry an AI tag
        ai_messages = sum('[|AI|]' in m for m in messages)

        # Only a positive finding ends the scan early; a single-turn
        # conversation must not stop us from inspecting the remaining ones.
        if ai_messages > 1:
            return True

    return False

In [4]:
def split_multi_turn(conversation):
    """Expand one conversation into one training example per AI reply.

    The ``'input'`` text is cut at every ``"[|Human|] "`` tag. The first
    segment (text before the first human tag) and the last segment (the
    trailing tag) are skipped. Each remaining segment must contain exactly
    one ``"[|AI|]"`` tag separating the human message from the AI reply.

    For turn *k* the example holds:
      * ``'inputs'``      - the conversation's ``'topic'``,
      * ``'instruction'`` - all previous turns, the current human message and
        a final open ``[|AI|]`` tag,
      * ``'response'``    - the AI reply of turn *k*.

    Args:
        conversation: Dict with the keys ``'topic'`` and ``'input'``.

    Returns:
        List of dicts with the keys ``'inputs'``, ``'instruction'`` and
        ``'response'``, one per turn, in conversation order.

    Raises:
        ValueError: If a segment does not contain exactly one ``"[|AI|]"`` tag.
    """
    header = "The conversation between human and AI assistant."

    chat_set = []
    stacked_chat = ""  # every completed turn so far, tags included
    topic = conversation['topic']

    # Skipping the first segment and the trailing (empty) human tag
    for message in conversation["input"].split("[|Human|] ")[1:-1]:
        # Split each message into the human part and the AI part
        human_message, ai_message = message.split("[|AI|]")
        human_message = human_message.strip()
        response = ai_message.strip()

        # Build the prompt directly from the history plus the current human
        # turn. Removing the response with str.replace() would also delete
        # identical text in earlier turns (e.g. a short reply such as "Yes").
        prompt = f"{stacked_chat}[|Human|] {human_message}\n[|AI|]"
        instructions = prompt.replace(header, "").strip()

        # Create a new conversation with the topic, instructions and response
        chat_set.append({
            'inputs': topic,
            'instruction': instructions,
            'response': response
        })

        # The finished turn becomes part of the history for the next one
        stacked_chat = f"{prompt} {response}\n"

    return chat_set


In [5]:
# Split every "*chat_data.json" file found directly under data_path into
# single-turn examples and store the result next to it as "<name>_split.json".
datasets = glob.glob(os.path.join(data_path, "*chat_data.json"))
for file_name in datasets:
    # Load the json data
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Dataset name = file name without directory and extension. Working on
    # the base name keeps dots in directory names (e.g. "../../data") from
    # truncating the path.
    dataset = os.path.splitext(os.path.basename(file_name))[0]

    # Check the data for multi-turn dialogues (informational only; every
    # file is split regardless of the answer)
    contains_multi_turn = check_multi_turn(data)
    print(f"Does the [{dataset}] contain multi-turn dialogues? {'Yes' if contains_multi_turn else 'No'}")

    # Apply split_multi_turn to each conversation and flatten the per-
    # conversation lists into one list of examples
    split_data = [
        item
        for conversation in data
        for item in split_multi_turn(conversation)
    ]

    # Save the split data to a new JSON file inside data_path
    with open(os.path.join(data_path, f'{dataset}_split.json'), 'w') as f:
        json.dump(split_data, f)


Does the [stackoverflow_chat_data] contain multi-turn dialogues? Yes
Does the [alpaca_chat_data] contain multi-turn dialogues? No
Does the [medical_chat_data] contain multi-turn dialogues? Yes
Does the [quora_chat_data] contain multi-turn dialogues? Yes


## Check the split output

In [6]:
# Load the original medical conversations and the per-turn split version.
origin_medical_chat = load_dataset("json", data_files="../../data/medical_chat_data.json")
medical_chat = load_dataset("json", data_files="../../data/medical_chat_data_split.json")

print(f"origin_medical_chat info:\n{origin_medical_chat}")
print(f"medical_chat info:\n{medical_chat}\n")

# Take the last four entries of each column and print them example by example.
tail_columns = [
    medical_chat["train"][column][-4:]
    for column in ("inputs", "instruction", "response")
]
for inputs, instruction, response in zip(*tail_columns):
    print(
        f"inputs;\n{inputs}",
        f"instruction;\n{instruction}",
        f"response;\n{response}",
        "",
        sep="\n",
    )

Found cached dataset json (/home/yehoon/.cache/huggingface/datasets/json/default-6fff05720b361681/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset json/default to /home/yehoon/.cache/huggingface/datasets/json/default-93d68895d8230075/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/yehoon/.cache/huggingface/datasets/json/default-93d68895d8230075/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

origin_medical_chat info:
DatasetDict({
    train: Dataset({
        features: ['topic', 'input'],
        num_rows: 46867
    })
})
medical_chat info:
DatasetDict({
    train: Dataset({
        features: ['response', 'instruction', 'inputs'],
        num_rows: 179031
    })
})

inputs;
what research (or clinical trials) is being done for Whiplash ?
instruction;
[|Human|] Can you tell me what kind of research or clinical trials are being done for Whiplash?
[|AI|]
response;
Yes, certainly. There are several ongoing research and clinical trials for Whiplash. For example, some studies are exploring the use of physical therapy and exercise-based intervention for improving Whiplash-associated disorders. Others are exploring the use of medications, such as muscle relaxants and pain relievers, to manage the symptoms of Whiplash. Would you like me to look up more information on this topic?

inputs;
what research (or clinical trials) is being done for Whiplash ?
instruction;
[|Human|] Can you t

## JSON to CSV & Concat

In [7]:
import pandas as pd

In [42]:
# Normalise every dataset to the columns inputs / instruction / response,
# preview each one, then concatenate them and write a single total.csv.
data_path = "/home/yehoon/workspace/data/"
dataset_list = ["alpaca_chat_data_split.json",
                "quora_chat_data_split.json",
                "medical_chat_data_split.json",
                "stackoverflow_chat_data_split.json",
                "toolformer-dedupe-only-dataset.json",
                "roleplay-instruct-v2-final.json",
                "gpt4-instruct-dedupe-only-dataset.json",
                "codegen-instruct.json",
               ]

target_columns = ["inputs", "instruction", "response"]

frames = []
for dataset in dataset_list:
    file_path = os.path.join(data_path, dataset)
    df = pd.read_json(file_path)

    # Pick the source columns that correspond to the target columns; the
    # naming differs between the split chat files, the roleplay file and
    # the remaining instruct files.
    if "chat_data" in dataset:
        source_columns = ["inputs", "instruction", "response"]
    elif "roleplay" in dataset:
        source_columns = ["input", "instruction", "output"]
    else:
        source_columns = ["input", "instruction", "response"]

    new_df = df.loc[:, source_columns].rename(
        columns=dict(zip(source_columns, target_columns))
    )

    print(dataset)
    display(new_df.head())

    frames.append(new_df)

# Concatenate once instead of growing the frame inside the loop. Row labels
# are kept from each source file, so they repeat across datasets.
total = pd.concat(frames) if frames else pd.DataFrame(columns=target_columns)

# Save as csv once, after all datasets have been collected (index not written)
total.to_csv(os.path.join(data_path, "total.csv"), index=False)

    

alpaca_chat_data_split.json


Unnamed: 0,inputs,instruction,response
0,Give three tips for staying healthy.,[|Human|] Give three tips for staying healthy....,1.Eat a balanced diet and make sure to include...
1,What are the three primary colors?,[|Human|] What are the three primary colors?\n...,"The three primary colors are red, blue, and ye..."
2,Describe the structure of an atom.,[|Human|] Describe the structure of an atom.\n...,"An atom is made up of a nucleus, which contain..."
3,How can we reduce air pollution?,[|Human|] How can we reduce air pollution?\n[|...,There are a number of ways to reduce air pollu...
4,Pretend you are a project manager of a constru...,[|Human|] Pretend you are a project manager of...,I had to make a difficult decision when I was ...


quora_chat_data_split.json


Unnamed: 0,inputs,instruction,response
0,What is the step by step guide to invest in sh...,[|Human|] I want to know the step by step guid...,"Sure, I can help with that. Firstly, you need ..."
1,What is the step by step guide to invest in sh...,[|Human|] I want to know the step by step guid...,You can visit the websites of National Stock E...
2,What is the step by step guide to invest in sh...,[|Human|] I want to know the step by step guid...,You will need to provide identity proof (PAN c...
3,What is the step by step guide to invest in sh...,[|Human|] I want to know the step by step guid...,You can start trading by placing buy and sell ...
4,What is the step by step guide to invest in sh...,[|Human|] I want to know the step by step guid...,You can track your investments through your de...


medical_chat_data_split.json


Unnamed: 0,inputs,instruction,response
0,Are there safety concerns or special precautio...,[|Human|] I wanted to ask about Ciclopirox Top...,"Yes, there are some safety concerns and precau..."
1,Are there safety concerns or special precautio...,[|Human|] I wanted to ask about Ciclopirox Top...,Ciclopirox Topical may cause some side effects...
2,Are there safety concerns or special precautio...,[|Human|] I have a question about DHEA. Are th...,DHEA is a naturally occurring hormone that can...
3,Are there safety concerns or special precautio...,[|Human|] I have a question about DHEA. Are th...,"DHEA can cause hormonal imbalances, which can ..."
4,Are there safety concerns or special precautio...,[|Human|] I have a question about DHEA. Are th...,"Yes, it is important to follow the recommended..."


stackoverflow_chat_data_split.json


Unnamed: 0,inputs,instruction,response
0,Parsing json directly using input stream,[|Human|] Can you tell me more about parsing J...,Sure! JSON stands for JavaScript Object Notati...
1,Parsing json directly using input stream,[|Human|] Can you tell me more about parsing J...,Sure thing. Here's an example code snippet in ...
2,How do you dynamically create an AWS IAM polic...,[|Human|] I'm trying to dynamically create an ...,Sure! To create a dynamic IAM policy document ...
3,How do you dynamically create an AWS IAM polic...,[|Human|] I'm trying to dynamically create an ...,"Yes, here's an example:\n\n```\nlocals {\n s3..."
4,Setting up pythonpath in OS X,[|Human|] I need help setting up pythonpath in...,Sure thing! What specifically do you need help...


toolformer-dedupe-only-dataset.json


Unnamed: 0,inputs,instruction,response
0,French Revolution,toolformer: enabled\ntoolformer access: wikipe...,Summary of the French Revolution: wikipedia('F...
1,Los Angeles,toolformer: enabled\ntoolformer access: weathe...,"Currently in Los Angeles, it is weather('tempe..."
2,Formula for calculating the area of a circle,toolformer: enabled\ntoolformer access: wolfra...,The formula for calculating the area of a circ...
3,Virtual Reality,toolformer: enabled\ntoolformer access: news\n...,Here are the latest news updates on virtual re...
4,Who is the current president of the United Sta...,toolformer: enabled\ntoolformer access: wikipe...,The current president of the United States is ...


roleplay-instruct-v2-final.json


Unnamed: 0,inputs,instruction,response
0,What's your favorite part of being an archaeol...,Roleplay as Lara Croft from the Tomb Raider se...,"The thrill of exploring the unknown, uncoverin..."
1,Do you prefer physical books or e-books?,You are a 40-year-old introverted librarian na...,"\nWell, <clears throat> I've always been fond ..."
2,,You are a timid but passionate geisha assassin...,"A-ah, me? *bashfully fans herself* I am merely..."
3,,"You are the Scout, evading capture from a purs...","Nice try, slowpoke! But ya can't catch what ya..."
4,,You are Todd Howard discussing the challenges ...,Adapting a game for VR presents unique challen...


gpt4-instruct-dedupe-only-dataset.json


Unnamed: 0,inputs,instruction,response
0,,Provide an overview of the process of human ev...,Human evolution began millions of years ago wi...
1,In a hole in the ground there lived a hobbit. ...,Classify the following excerpt as either ficti...,The excerpt is fiction. This passage is from J...
2,The committee approved the new budget.,Rewrite the given sentence in passive voice.,The new budget was approved by the committee.
3,,Explain the main differences between an alliga...,Alligators and crocodiles belong to the same o...
4,There was a boy named Romeo and a girl named J...,Translate the excerpt from English to French.,Il y avait un garçon nommé Roméo et une fille ...


codegen-instruct.json


Unnamed: 0,inputs,instruction,response
0,"width: 5, height: 8",Write a C++ program that calculates the area o...,#include <iostream>\nusing namespace std;\n\ni...
1,port: 3000,How can you implement a basic web server using...,const express = require('express');\nconst app...
2,"account_number: '12345678', initial_balance: 1000",Design a Python class to represent a bank acco...,"class BankAccount:\n def __init__(self, acc..."
3,directory: '/home/user/documents',Implement a basic shell script that prints all...,"#!/bin/sh\n\ndirectory=""/home/user/documents""\..."
4,"example.csv:\n10,20,30\n15,25,35\n12,22,32",Design a Python program that reads a CSV file ...,"import csv\n\ndef average_of_column(file_name,..."


In [43]:
# Display the concatenated frame; the notebook view elides the middle rows.
total

Unnamed: 0,inputs,instruction,response
0,Give three tips for staying healthy.,[|Human|] Give three tips for staying healthy....,1.Eat a balanced diet and make sure to include...
1,What are the three primary colors?,[|Human|] What are the three primary colors?\n...,"The three primary colors are red, blue, and ye..."
2,Describe the structure of an atom.,[|Human|] Describe the structure of an atom.\n...,"An atom is made up of a nucleus, which contain..."
3,How can we reduce air pollution?,[|Human|] How can we reduce air pollution?\n[|...,There are a number of ways to reduce air pollu...
4,Pretend you are a project manager of a constru...,[|Human|] Pretend you are a project manager of...,I had to make a difficult decision when I was ...
...,...,...,...
4530,"python_code = ""def factorial(n): return 1 if n...","Translate the following Python code, which cal...",#include<iostream>\nusing namespace std;\n\nin...
4531,"[1, 2, 3, 4, 5, 6], [4, 5, 6, 7, 8, 9]",Create a Python function that takes two lists ...,"def common_elements(list1, list2):\n result..."
4532,,Write a C++ code to create a class named 'Pers...,#include <iostream>\n#include <string>\n\nclas...
4533,https://api.example.com/data,What would be the JavaScript code to fetch dat...,fetch('https://api.example.com/data')\n .th...
