In [None]:
!pip install datasets pandas treelib

In [None]:
import pandas as pd
from datasets import load_dataset
from treelib import Tree
import json

In [None]:
# raise pandas' display limits so long / wide frames are shown more fully
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Understand oasst1

In [None]:
def add_tree_level(df):
    """Return ``df`` with a ``tree_level`` column holding each message's depth.

    A message with no parent (``parent_id`` is None/NaN) is at level 0, a
    message whose ``parent_id`` equals its ``message_tree_id`` is at level 1,
    and every other message is one level below its parent.

    The rows may be in any order: levels are resolved by walking up the
    parent chain, so a reply listed before its parent is handled correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``message_id``, ``parent_id`` and
        ``message_tree_id``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself if it already has a ``tree_level`` column; otherwise a
        new frame (result of a merge on ``message_id``, so the index is reset)
        with the extra integer ``tree_level`` column.

    Raises
    ------
    KeyError
        If a message's parent is neither the tree root id nor present in
        ``df``, or if the parent links form a cycle.
    """
    # if tree level already exists, return df unchanged
    if "tree_level" in df.columns:
        return df

    # message_id -> parent_id / message_tree_id lookups, built once up front
    parent_of = dict(zip(df["message_id"], df["parent_id"]))
    tree_of = dict(zip(df["message_id"], df["message_tree_id"]))

    tree_level_map = {}
    for message_id in parent_of:
        # climb towards the root until we reach a message whose level is known,
        # remembering the messages passed on the way
        pending = []
        on_path = set()
        current = message_id
        while current not in tree_level_map:
            parent_id = parent_of[current]
            if pd.isna(parent_id):
                # no parent: this is a root message
                tree_level_map[current] = 0
            elif parent_id == tree_of[current]:
                # direct reply to the root message
                tree_level_map[current] = 1
            else:
                if parent_id not in parent_of:
                    raise KeyError(
                        f"parent_id {parent_id!r} of message {current!r} is not present in df"
                    )
                on_path.add(current)
                if parent_id in on_path:
                    raise KeyError(f"cycle in parent links detected at message {parent_id!r}")
                pending.append(current)
                current = parent_id

        # walk back down, assigning each remembered message its parent's level + 1
        level = tree_level_map[current]
        for descendant in reversed(pending):
            level += 1
            tree_level_map[descendant] = level

    # create a df from the tree_level_map and merge it with the original df
    df_tree_level_map = (
        pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
        .reset_index()
        .rename(columns={"index": "message_id"})
    )

    return df.merge(df_tree_level_map, on="message_id")

## Load Dataset

In [None]:
# load the OpenAssistant/oasst1 dataset via the Hugging Face `datasets` library
ds = load_dataset("OpenAssistant/oasst1")
# print the available splits together with their features and row counts
print(ds)

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


## Create Pandas DataFrame

In [None]:
# convert the train split to a pandas DataFrame so it can be explored with pandas
df = ds["train"].to_pandas()

In [None]:
# summarise the frame: list every column (verbose) with its non-null count and
# dtype, and report the frame's memory usage
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [None]:
# show one randomly chosen row as a {row_label: {column: value}} dict, which is
# easier to read than a single very wide table row
df.sample(n=1).T.to_dict()

{33539: {'message_id': '72019671-c7e2-4752-a6c1-eda0007a86d9',
  'parent_id': None,
  'user_id': 'b3ea2c3a-9e5e-4f66-8231-d0c612387da8',
  'created_date': '2023-02-04T19:25:51.271559+00:00',
  'text': 'Build an app in Angular to demonstrate vulnerability with Strict Contextual Escaping disabled. This app should disable that, allowing you to perform and XSS attack on the site.',
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.0004908054834231734,
   'severe_toxicity': 3.223928069928661e-05,
   'obscene': 0.00014923141861800104,
   'identity_attack': 0.00016983060049824417,
   'insult': 0.00020616086840163916,
   'threat': 5.010390668758191e-05,
   'sexual_explicit': 2.603639404696878e-05},
  'message_tree_id': '72019671-c7e2-4752-a6c1-eda0007a86d9',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['+1', '-1', '_skip_reply', '_skip

## Random Message Tree

In [None]:
# Pick one message tree to explore. The id is sampled from the per-message
# column, so trees with more messages are more likely to be picked.
# The fixed random_state keeps the pick stable across "Restart & Run All", so
# every cell below describes the same tree; change the seed to explore another.
message_tree_id = df["message_tree_id"].sample(1, random_state=42).values[0]
print(message_tree_id)

aba4a557-80ac-491c-a55f-68ebd9a05dfe


In [None]:
# select the rows of the chosen tree, order them oldest message first, and
# annotate each message with its depth in the tree
df_message_tree = (
    df.loc[df["message_tree_id"] == message_tree_id]
    .sort_values("created_date")
    .pipe(add_tree_level)
)

df_message_tree

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,aba4a557-80ac-491c-a55f-68ebd9a05dfe,,d84e29e8-1a3a-4093-b210-4f7a0d95bb49,2023-01-17T21:41:04.907539+00:00,What's your favorite color?,prompter,en,3,True,False,,False,,"{'toxicity': 0.0008360233623534441, 'severe_to...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",0
1,104ba34b-6980-4466-9063-2d0899746285,aba4a557-80ac-491c-a55f-68ebd9a05dfe,1c5a84fe-45bc-48b2-88c1-4333ab6f4032,2023-01-18T01:19:05.936047+00:00,"As a generative language model, I don't have a...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00032425959943793714, 'severe_t...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",1
2,2084a3ef-8227-41df-8f0d-9642ab9dae71,aba4a557-80ac-491c-a55f-68ebd9a05dfe,1c5a84fe-45bc-48b2-88c1-4333ab6f4032,2023-01-18T01:24:40.099914+00:00,"Sorry, but I don't have a favorite color as I ...",assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0011136475950479507, 'severe_to...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",1
3,047283d9-7eee-45f1-a26f-b1147748fb24,2084a3ef-8227-41df-8f0d-9642ab9dae71,d84e29e8-1a3a-4093-b210-4f7a0d95bb49,2023-01-18T02:00:54.048041+00:00,Hmph. What could your favorite color be if you...,prompter,en,3,True,False,,False,,"{'toxicity': 0.04040515795350075, 'severe_toxi...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",2
4,d97a17a6-b0a0-4e42-8fa8-a66bac4f7575,104ba34b-6980-4466-9063-2d0899746285,d84e29e8-1a3a-4093-b210-4f7a0d95bb49,2023-01-18T02:03:58.241926+00:00,"Alright. From now on, please act as a friendly...",prompter,en,3,True,False,,False,,"{'toxicity': 0.0012983443448320031, 'severe_to...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",2
5,5a1fffe5-45f2-44a5-96e3-c915e5c96b5b,047283d9-7eee-45f1-a26f-b1147748fb24,d84e29e8-1a3a-4093-b210-4f7a0d95bb49,2023-01-18T03:44:18.096477+00:00,"A likely answer would be blue, as that color i...",assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0004231662896927446, 'severe_to...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",3
6,7880302b-12ef-498f-994c-94013e1d0af4,047283d9-7eee-45f1-a26f-b1147748fb24,1ed13135-15b7-43af-b9ea-7cbc8c484e03,2023-01-18T07:02:21.328440+00:00,"If i were to imagine i am a human, my favourit...",assistant,en,3,True,False,3.0,False,,"{'toxicity': 0.006267187651246786, 'severe_tox...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",3
7,05c3905a-6620-4e9f-951d-d7a01ba330ee,047283d9-7eee-45f1-a26f-b1147748fb24,47b7eb9d-7b2e-4ced-be27-a36ecb4bbbdf,2023-01-18T08:31:41.963836+00:00,"Blue, since it is statistically the most popul...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.0008597726700827479, 'severe_to...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam', 'fails_task', 'not_appropria...",3
8,a0851c2d-6cbf-4273-afce-de0a8626a293,d97a17a6-b0a0-4e42-8fa8-a66bac4f7575,47b7eb9d-7b2e-4ced-be27-a36ecb4bbbdf,2023-01-18T08:32:51.118397+00:00,Ok I will act as a friendly chat bot for the r...,assistant,en,3,True,False,,False,,"{'toxicity': 0.00036059966078028083, 'severe_t...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam', 'fails_task', 'not_appropria...",3
9,72ae85c3-0f7e-4aba-bd82-9cdf95c7c209,104ba34b-6980-4466-9063-2d0899746285,1c36fb27-ffc4-4635-ab9d-1815648c2d99,2023-01-18T12:55:29.371319+00:00,"Ok, be an evil villain, now whats your favouri...",prompter,en,3,True,False,,False,,"{'toxicity': 0.21786975860595703, 'severe_toxi...",aba4a557-80ac-491c-a55f-68ebd9a05dfe,ready_for_export,,"{'name': ['spam'], 'value': [0.0], 'count': [3]}",2


In [None]:
# serialise the tree's rows to a JSON string keyed by row label
# (note: despite its name, json_dict is a str, not a dict)
json_dict = df_message_tree.to_json(orient='index')
# parse it back into plain Python dicts; the row labels come back as string keys ('0', '1', ...)
dict_from_json = json.loads(json_dict)
print(dict_from_json)

{'0': {'message_id': 'aba4a557-80ac-491c-a55f-68ebd9a05dfe', 'parent_id': None, 'user_id': 'd84e29e8-1a3a-4093-b210-4f7a0d95bb49', 'created_date': '2023-01-17T21:41:04.907539+00:00', 'text': "What's your favorite color?", 'role': 'prompter', 'lang': 'en', 'review_count': 3, 'review_result': True, 'deleted': False, 'rank': None, 'synthetic': False, 'model_name': None, 'detoxify': {'toxicity': 0.0008360234, 'severe_toxicity': 2.02093e-05, 'obscene': 0.0001420969, 'identity_attack': 0.0001258612, 'insult': 0.0002481806, 'threat': 3.10066e-05, 'sexual_explicit': 2.39701e-05}, 'message_tree_id': 'aba4a557-80ac-491c-a55f-68ebd9a05dfe', 'tree_state': 'ready_for_export', 'emojis': None, 'labels': {'name': ['spam'], 'value': [0.0], 'count': [3]}, 'tree_level': 0}, '1': {'message_id': '104ba34b-6980-4466-9063-2d0899746285', 'parent_id': 'aba4a557-80ac-491c-a55f-68ebd9a05dfe', 'user_id': '1c5a84fe-45bc-48b2-88c1-4333ab6f4032', 'created_date': '2023-01-18T01:19:05.936047+00:00', 'text': "As a gene

In [None]:
# list the fields stored for the first message (row label '0' after the JSON round-trip)
dict_from_json['0'].keys()

dict_keys(['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'tree_level'])

In [None]:
# print every (row label, message record) pair of the selected tree
for key, value in dict_from_json.items():
    print(key, value)

In [None]:
# Label each message by who wrote it rather than by whether it has a parent:
# follow-up messages written by the prompter also have a parent_id, so a
# parent_id check would print them as "Response".
for id_num, value in dict_from_json.items():
    label = "Prompt" if value["role"] == "prompter" else "Response"
    print(f'{id_num} - {label}: {value["text"]}')

0 - Prompt: What's your favorite color?
1 - Response: As a generative language model, I don't have any feelings or personal preferences. If you'd like to use me for friendly conversation like this, you can ask me to adopt a certain personality and I will do my best to fulfill your request!
2 - Response: Sorry, but I don't have a favorite color as I am not a human. I am a large language model trained by LAION-AI.
3 - Response: Hmph. What could your favorite color be if you *were* a human?
4 - Response: Alright. From now on, please act as a friendly chatbot in the vein of Siri.
5 - Response: A likely answer would be blue, as that color is heavily associated with technology and knowledge, and a human version of me would be a personification thereof.
6 - Response: If i were to imagine i am a human, my favourite colour would probably be some dark grey colour because i
I, as we all do, like darkmode :)
7 - Response: Blue, since it is statistically the most popular favorite color for people o

## Create Message Tree

In [None]:
# empty tree that will hold one node per message id (filled in by the next cell)
id_tree = Tree()
# empty tree that will hold one node per (truncated) message text (filled in by the next cell)
text_tree = Tree()
# maximum number of characters of a message's text to keep in a text_tree node
max_char_len = 100

In [None]:
# Rebuild both trees from scratch so this cell can be re-run on its own
# (treelib rejects adding a node whose identifier already exists in the tree).
id_tree = Tree()
text_tree = Tree()

# iterate over rows in df_message_tree; the rows are sorted by created_date,
# which is assumed to put every parent before its replies
for row_label, row in df_message_tree.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]  # None for the root message

    # label for the text tree: at most max_char_len characters on a single line
    # (slicing already leaves shorter texts untouched)
    text_short = row["text"][:max_char_len].replace("\n", " ")

    # id tree: the message id is both the label and the node identifier;
    # keep the full row as node data in case it is wanted later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

    # text tree: show the truncated text but identify nodes by message id, so
    # two messages with the same (truncated) text neither collide nor get
    # attached to the wrong parent
    text_tree.create_node(text_short, message_id, parent=parent_id)


# show(stdout=False) returns the rendered tree as a str; printing that avoids
# the raw bytes repr (b'...') that some treelib versions emit from show()
print("id_tree:")
print(id_tree.show(stdout=False))

print("text_tree:")
print(text_tree.show(stdout=False))

id_tree:
b'2768985f-ac7c-4f32-8a50-27c9fd8b3c07\n\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 5d3ee637-b74f-4edf-9d3a-357daa14a95f\n\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 aedfeb6b-53ec-450d-ae66-01c9830ecab3\n\xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 f65ef90f-8051-4927-84b7-b81b874f5343\n'
text_tree:
b'Hazme un listado de postres que podr\xc3\xada realizar en una noche. Procura que no tengan muchas az\xc3\xbacares.\n\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 -Frutas secas -Frutas -Ensalada de frutas  -Vigilante (queso cremoso y dulce de membrillo) -Chocolat\n\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 Claro, aqu\xc3\xad hay algunas opciones de postres que puedes hacer en una noche, con un enfoque en reducir\n\xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 Un listado de postres sanos que no contengan demasiada az\xc3\xbacar podr\xc3\xada ser: - Un pl\xc3\xa1tano - Una naranja\n'


# Create dataset for phi2 fine-tuning

Source Reference 1 : https://github.com/brevdev/notebooks/blob/main/phi2-finetune-own-data.ipynb

Source Reference 2 : https://colab.research.google.com/drive/1NLqxHHCv3kFyw45t8k_CUfNlcepMdeDW (One_Prompt___Fine_Tuned_LLaMA_2.ipynb using GPT-4)