In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from accelerate import infer_auto_device_map

In [7]:
import json

In [14]:
# Dataset language directories to process (each is a sub-folder of ../dataset/).
langs = [
    "en",
    "enhi",
    "fr",
    "hi",
    "ko",
    "zh",
]
# Split files expected inside every language directory (<split>.json).
splits = [
    "fewshot",
    "valid",
    "test",
]

In [11]:
# Instruction text for each task, keyed by the two-letter prefix that the
# conversion cells read from the start of every example's "input_text".
instructions = {
    # Instruction for DST
    "DS": 'Given <state> and <history>, write a belief state in the format "(object) trait equal_to "value"". e.g. (book) author equal_to "Harper Lee", year published equal_to "1960".',
    # Instruction for API
    "AP": 'Given <knowledge> and <history>, determine whether an API call is needed to query the database. Answer "yes" or "no".',
    # Instruction for DAG
    "DA": 'Given <knowledge>, <state> and <history>, write a dialogue representation in the format "(object) action trait". e.g. (book) request genre.',
    # Instruction for RG
    "RG": 'Given <action> and <history>, write a response that contains all the necessary information.',
}

In [12]:
# Convert the English few-shot split into instruction / input / output records.
# JSON is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open("../dataset/en/fewshot.json", encoding="utf-8") as json_file:
    data = json.load(json_file)
data = data["data"]

# The first two characters of "input_text" select the instruction; a prefix
# that is not a key of `instructions` raises KeyError.
json_list = [
    {
        "instruction": instructions[example["input_text"][:2]],
        "input": example["input_text"],
        "output": example["output_text"],
    }
    for example in data
]

In [15]:
# Convert every (language, split) file into instruction / input / output
# records and write one JSON file per pair under ../processed_inst/.
for lang in langs:
    for sp in splits:
        filename = "../dataset/" + lang + "/" + sp + ".json"
        # Decode as UTF-8 explicitly; the locale default is not reliable for
        # the non-English splits.
        with open(filename, encoding="utf-8") as json_file:
            data = json.load(json_file)
        data = data["data"]

        # The first two characters of "input_text" select the instruction;
        # an unknown prefix raises KeyError.
        json_list = [
            {
                "instruction": instructions[example["input_text"][:2]],
                "input": example["input_text"],
                "output": example["output_text"],
            }
            for example in data
        ]

        # NOTE: the output name is lang + sp with no separator
        # (e.g. "enfewshot.json"); open() does not create ../processed_inst/.
        outname = "../processed_inst/" + lang + sp + ".json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 or the write can fail on non-UTF-8 locales.
        with open(outname, "w", encoding="utf-8") as f:
            json.dump(json_list, f, indent=2, ensure_ascii=False)

In [28]:
# Load the English validation split and keep only the list under "data".
# Decoded as UTF-8 explicitly rather than with the locale default.
with open('../dataset/en/valid.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
data = data["data"]

In [29]:
# Re-shape every example into an instruction-style record whose instruction
# is left empty; input and output are copied from the example unchanged.
json_list = [
    {
        "instruction": "",
        "input": example["input_text"],
        "output": example["output_text"],
    }
    for example in data
]

In [30]:
# Serialise the records (2-space indent, default ASCII escaping) to disk.
# NOTE(review): in the cell order shown, json_list was built from
# ../dataset/en/valid.json while this file is named "enhi" - confirm the
# intended source before re-running.
with open("xris_enhi.json", "w") as out_file:
    out_file.write(json.dumps(json_list, indent=2))

In [31]:
import random

# Concatenate several instruction datasets, shuffle them and write the
# combined list to multi_ko_data.json.
SHUFFLE_SEED = 42  # fixed so the shuffled order is the same on every run

result = list()
source_files = ["alpaca_data.json", "retrieve_and_generate_prompt_logs.json", "xris_ko.json"]
for source_path in source_files:
    # Each file is expected to hold a JSON list; decode explicitly as UTF-8.
    with open(source_path, 'r', encoding='utf-8') as infile:
        result.extend(json.load(infile))

# A dedicated Random instance keeps the shuffle reproducible without
# touching the state of the module-level random generator.
random.Random(SHUFFLE_SEED).shuffle(result)

with open('multi_ko_data.json', 'w') as outfile:
    json.dump(result, outfile, indent=2)

In [12]:
# Collect, for each of the first 1000 records, the text of the first dialogue
# turn exchanged between the Apprentice and the Wizard.
# NOTE(review): json_list must hold JSON *strings* here (each element is
# passed to json.loads), not the dict records built in the conversion cells.
MAX_RECORDS = 1000
SPEAKER_ACTIONS = ("Apprentice => Wizard", "Wizard => Apprentice")

texts = []
# Slicing (instead of range(1000)) avoids an IndexError when fewer than
# MAX_RECORDS records are available.
for raw_record in json_list[:MAX_RECORDS]:
    dictData = json.loads(raw_record)
    # Each record is a one-entry mapping; its first key identifies the dialogue.
    idx = list(dictData.keys())[0]
    for dialog in dictData[idx]["dialog_history"]:
        if dialog["action"] in SPEAKER_ACTIONS:
            texts.append(dialog["text"])
            break  # only the first matching turn per record is kept
    

In [22]:
# Write the collected utterances to a text file, one per line.
# The utterances are free-form dialogue text, so the file is opened as UTF-8
# explicitly; the locale default could fail on non-ASCII characters.
with open("user_examples.txt", 'w', encoding="utf-8") as file:
    for row in texts:
        file.write(row + '\n')

In [29]:
# Print every Apprentice/Wizard utterance of the second record, in order.
dictData = json.loads(json_list[1])
idx = list(dictData)[0]  # first (top-level) key of the record
for turn in dictData[idx]["dialog_history"]:
    if turn["action"] not in ("Apprentice => Wizard", "Wizard => Apprentice"):
        continue
    print(turn["text"])

What is your favorite Tom Hanks movie of them all?
My favorite has to be when he played Woody in Toy Story 2! Which one is your go-to? 
That is a good one. You know I completely forgot he was the voice in that movie. He has to have one of the most memorable voices by any actor in the last 2 to 3 decades
Absolutely, his work is amazing. Have you watched A Beautiful Day In The Neighborhood? It came out in 2019 ands won 2 Oscars though he deserved many more.
You know I don't think I've watched that one yet. I've seen trailers but it's hard to believe there aren't household movies that have won oscars. How many total oscars foes he have?
I believe it's something up there like 50, thought I'm not sure. He has too many to count. I know he won 6 for Forrest Gump. Were you aware Tom has an asteroid named after him? That's the level of success we all want. 
There so many memorable moments in his films too. I know we can all think of a movie by him and pick out our favorite moment. Which is your

In [5]:
# Concatenate the two instruction datasets (in file order, no shuffling)
# and write the combined list to wiki_alpaca_data.json.
result = list()
source_files = ["alpaca_data.json", "retrieve_and_generate_prompt_logs.json"]
for source_path in source_files:
    # Each file is expected to hold a JSON list; decode explicitly as UTF-8
    # instead of relying on the locale default.
    with open(source_path, 'r', encoding='utf-8') as infile:
        result.extend(json.load(infile))
with open('wiki_alpaca_data.json', 'w') as outfile:
    json.dump(result, outfile, indent=2)

In [5]:
import json
import random
# Load the merged dataset; decoded as UTF-8 explicitly rather than with the
# platform's locale default.
with open("wiki_alpaca_data.json", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Shuffle in place with a fixed seed so that the train/eval split derived
# from this order is the same after Restart & Run All. A dedicated Random
# instance leaves the module-level generator state untouched.
random.Random(42).shuffle(data)

In [8]:
# Hold out the last 1/20 of the records (rounded down) for evaluation;
# everything before that boundary is training data.
eval_size = len(data) // 20
train_size = len(data) - eval_size
train_data, eval_data = data[:train_size], data[train_size:]

In [9]:
# Write both subsets to disk as indented JSON, training file first.
for out_path, subset in (("train_data.json", train_data), ("eval_data.json", eval_data)):
    with open(out_path, "w") as f:
        json.dump(subset, f, indent=2)

In [10]:
# Display the held-out evaluation records; as a bare last expression the
# whole list is rendered as this cell's output.
eval_data

[{'instruction': 'You are chatting with a user. See if doing a Google search would help you better respond to them. You are both located in the U.S. Today\'s date is 4/24/2023.\n- What do you type in the search box?\n- What date do you want the search results to be? Enter "recent" if you are looking for the newest results. Enter "none" if the date is not important.',
  'input': 'User: What is your favorite thing to do in Las Vegas?\n[Search needed? Yes. You Google "favorite things to do in Las Vegas". The year of the results is "recent".]\nYou: I\'m not sure, I haven\'t been to Las Vegas yet.\nUser: Oh, that\'s too bad. Well, have you heard anything interesting about Las Vegas that you\'d like to share?\n[Search needed? Yes. You Google "interesting facts about Las Vegas". The year of the results is "none".]\nYou: I\'m sorry, I don\'t have any interesting facts about Las Vegas to share.\nUser: That\'s alright. Well, I\'ve been to Las Vegas a few times and my favorite thing to do there i