In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from markdownify import markdownify
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed passed to the train_test_split calls below so the splits are reproducible.
RANDOM_STATE = 42

## Individual dataset preparation

We will use the following datasets:
- OpenAssistant, Alpaca and GPT4All - to preserve the original instruction-following, chat-like behaviour
- a few additional datasets that teach the model to work with long-term memory:
    - booksum - to generate summaries of literary texts
    - govreport - the same, but for formal texts
    - qasper - to use the memory for question-answering tasks

### Open Assistant

In [3]:
# Load the OpenAssistant oasst1 dataset through the Hugging Face `datasets` loader.
dataset_oasst1 = load_dataset("OpenAssistant/oasst1")
# Bare expression: display the available splits and their columns.
dataset_oasst1

Found cached dataset parquet (/home/alex4321/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00,  4.79it/s]


DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})

In [4]:
def _oasst1_parent2children(df_oasst1):
    parent_id_2_children = {}
    for _, row in df_oasst1.iterrows():
        if row["parent_id"] not in parent_id_2_children:
            parent_id_2_children[row["parent_id"]] = []
        parent_id_2_children[row["parent_id"]].append(row["message_id"])
    return parent_id_2_children

In [5]:
def _oasst1_tree(parent_id_2_children, root):
    children = parent_id_2_children.get(root, [])
    result = {}
    for child_id in children:
        result[child_id] = _oasst1_tree(parent_id_2_children, child_id)
    return result

In [6]:
def _oasst1_tree2list(tree):
    if len(tree) == 0:
        return [[]]
    result = []
    for child_id, subtree in tree.items():
        for row in _oasst1_tree2list(subtree):
            result.append([child_id] + row)
    return result

In [7]:
def oasst1_2_messages(df_oasst1):
    """Turn a flat OpenAssistant message table into dialogue index lists.

    Every root-to-leaf path of the reply tree (roots are the messages whose
    ``parent_id`` is ``None``) becomes one dialogue. Messages that cannot be
    reached from a root - e.g. because an ancestor row is missing from
    ``df_oasst1`` - do not appear in any dialogue.

    Args:
        df_oasst1: DataFrame with ``message_id``, ``parent_id``, ``text`` and
            ``role`` columns. Its index may have gaps (e.g. after filtering).

    Returns:
        ``(indices, df)`` where ``df`` holds the ``text`` and ``role`` columns
        re-indexed ``0..len-1`` and ``indices`` is a list of dialogues, each a
        list of 0-based row positions into ``df``.
    """
    dialogue_message_ids = _oasst1_tree2list(
        _oasst1_tree(
            _oasst1_parent2children(df_oasst1),
            None
        )
    )
    # Indices must be row *positions*, not the labels of the (possibly
    # filtered) input index: downstream code offsets them by frame lengths and
    # concatenates the frames with a fresh 0..N-1 index.
    df = df_oasst1[["text", "role"]].reset_index(drop=True)
    message_id2index = {
        mid: position
        for position, mid in enumerate(df_oasst1["message_id"])
    }
    indices = [
        [
            message_id2index[mid]
            for mid in message_ids
        ]
        for message_ids in dialogue_message_ids
        if message_ids  # a table without root messages yields one empty path; drop it
    ]
    return indices, df

In [8]:
# Materialise the train split as a DataFrame.
df_oasst1_train = pd.DataFrame(dataset_oasst1["train"])
# Keep only rows whose review_result is truthy; missing values are treated as False.
# NOTE(review): boolean .loc filtering keeps the original row labels, so the index has gaps from here on.
df_oasst1_train = df_oasst1_train.loc[df_oasst1_train["review_result"].fillna(False)]
df_oasst1_train.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


In [9]:
# Materialise the validation split as a DataFrame.
df_oasst1_validation = pd.DataFrame(dataset_oasst1["validation"])
# Keep only rows whose review_result is truthy; missing values are treated as False.
# NOTE(review): boolean .loc filtering keeps the original row labels, so the index has gaps from here on.
df_oasst1_validation = df_oasst1_validation.loc[df_oasst1_validation["review_result"].fillna(False)]
df_oasst1_validation.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,68489e5c-978f-4ad7-a849-39a741fb5ae7,,845cba79-9d16-4eb8-ab63-982fe8403c62,2023-02-08T11:55:51.852473+00:00,"Напиши функцию на языке swift, которая сортиру...",prompter,ru,3,True,False,,False,,"{'toxicity': 0.0025964330416172743, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,a38b3d1c-256f-47c5-ab68-c9ab766bad84,68489e5c-978f-4ad7-a849-39a741fb5ae7,abbc8a37-2db7-4af5-9df1-b0582d2e3d93,2023-03-16T22:29:28.829265+00:00,"Вот функция, которая сортирует массив целых чи...",assistant,ru,3,True,False,0.0,False,,"{'toxicity': 0.0003634887107182294, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,9cf52b01-0582-46aa-9ad2-70827dce87ed,68489e5c-978f-4ad7-a849-39a741fb5ae7,a3e19dd4-34cd-48f0-8912-a2003b22c334,2023-02-11T09:57:39.207727+00:00,"Вот функция на языке Swift, которая сортирует ...",assistant,ru,3,True,False,1.0,False,,"{'toxicity': 0.0004027303075417876, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
3,9685d79e-16d7-4f1b-b68f-a1ff73e87bef,68489e5c-978f-4ad7-a849-39a741fb5ae7,97894afc-2204-4c7b-9b91-f1c71c10f032,2023-02-09T21:25:39.116134+00:00,func sortAndPrintArray(array: [Int]) {\n le...,assistant,ru,3,True,False,2.0,False,,"{'toxicity': 0.0003712645557243377, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['-1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,afee0474-c84a-4ae2-a9c9-8d575256312e,,0a8f6864-6cda-4cf5-bc07-346470d6d493,2023-02-07T00:54:42.098108+00:00,"Inventa un monstruo altamente complejo, descrí...",prompter,es,3,True,False,,False,,"{'toxicity': 0.029511522501707077, 'severe_tox...",afee0474-c84a-4ae2-a9c9-8d575256312e,ready_for_export,"{'name': ['+1', '-1', '_skip_reply'], 'count':...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


In [10]:
# Build the training dialogues: a list of index lists plus the text/role frame they point into.
oasst1_indices_train, df_oasst1_texts_train = oasst1_2_messages(df_oasst1_train)
# Sanity check: show the messages of the first dialogue.
df_oasst1_texts_train.loc[oasst1_indices_train[0]]

Unnamed: 0,text,role
0,Can you write a short introduction about the r...,prompter
1,"""Monopsony"" refers to a market structure where...",assistant
2,Now explain it to a dog,prompter


In [11]:
# Build the validation dialogues the same way as the training ones.
oasst1_indices_validation, df_oasst1_texts_validation = oasst1_2_messages(df_oasst1_validation)
# Sanity check: show the messages of the first dialogue.
df_oasst1_texts_validation.loc[oasst1_indices_validation[0]]

Unnamed: 0,text,role
0,"Напиши функцию на языке swift, которая сортиру...",prompter
1,"Вот функция, которая сортирует массив целых чи...",assistant


In [12]:
print(df_oasst1_texts_validation.loc[oasst1_indices_validation[0]]["text"].values[-1])

Вот функция, которая сортирует массив целых чисел и выводит его на экран:

```swift
func sortAndPrintArray(_ array: [Int]) {
  // Создаем копию массива, чтобы не изменять исходный
  var sortedArray = array
  // Сортируем массив по возрастанию
  sortedArray.sort()
  // Выводим отсортированный массив на экран
  print(sortedArray)
}
```


Ты можешь проверить работу функции, вызвав ее с любым массивом целых чисел, например:

```swift
sortAndPrintArray([5, 2, 7, 9, -3])
```

На экране появится:

```sh
[-3, 2, 5, 7, 9]
```
---
Надеюсь, что я смог тебе помочь. Нужно ли тебе что-нибудь ещё? 😊


### Alpaca

In [13]:
# Load the Alpaca dataset and materialise its "train" split as a DataFrame.
df_alpaca = pd.DataFrame(load_dataset("tatsu-lab/alpaca")["train"])
df_alpaca.head()

Found cached dataset parquet (/home/alex4321/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00,  7.15it/s]


Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Below is an instruction that describes a task....
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Below is an instruction that describes a task....
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Below is an instruction that describes a task....


In [14]:
def _alpaca_prompt(row):
    """Render one Alpaca row as the text of a prompter message.

    Rows with a non-empty ``input`` use the "instruction + input" layout.
    Rows without one use the instruction-only layout, so the prompt neither
    claims to be "paired with an input" nor ends in a dangling ``### Input:``
    header.

    Args:
        row: mapping/Series with ``instruction`` and ``input`` entries.

    Returns:
        The prompt string.
    """
    if row["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n"
            f"{row['instruction']}\n"
            "\n"
            "### Input:\n"
            f"{row['input']}"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        f"{row['instruction']}"
    )


df_alpaca["input_message"] = df_alpaca.apply(_alpaca_prompt, axis=1)
df_alpaca["output_message"] = df_alpaca["output"]
# Only the two message columns are needed from here on.
df_alpaca = df_alpaca[["input_message", "output_message"]]

In [15]:
# Flatten Alpaca into the shared layout: one row per message, plus per-dialogue index lists.
df_alpaca_texts = []
alpaca_indices = []
for _, row in df_alpaca.iterrows():
    df_alpaca_texts.append({"role": "prompter", "text": row["input_message"]})
    df_alpaca_texts.append({"role": "assistant", "text": row["output_message"]})
    # The two messages just appended form one dialogue: [prompt position, answer position].
    alpaca_indices.append([len(df_alpaca_texts) - 2, len(df_alpaca_texts) - 1])
# The list of records becomes a frame whose default 0..N-1 index matches the positions above.
df_alpaca_texts = pd.DataFrame.from_records(df_alpaca_texts)

In [16]:
# Split whole dialogues (not individual messages) into train/validation; 5% go to validation.
alpaca_indices_train, alpaca_indices_validation = train_test_split(
    alpaca_indices,
    random_state=RANDOM_STATE,
    test_size=0.05
)

In [17]:
df_alpaca_texts.loc[alpaca_indices_train[0]]

Unnamed: 0,role,text
52562,prompter,"Below is an instruction that describes a task,..."
52563,assistant,The sound of the alarm was really irritating.


In [18]:
df_alpaca_texts.loc[alpaca_indices_train[1]]

Unnamed: 0,role,text
67832,prompter,"Below is an instruction that describes a task,..."
67833,assistant,#ReduceToSucceed


### GPT4All

In [19]:
# Load the GPT4All prompt generations and materialise the "train" split as a DataFrame.
df_gpt4all = pd.DataFrame(load_dataset("nomic-ai/gpt4all_prompt_generations")["train"])
df_gpt4all.head()

Found cached dataset parquet (/home/alex4321/.cache/huggingface/datasets/nomic-ai___parquet/nomic-ai--gpt4all_prompt_generations-94ada251779e8693/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:01<00:00,  1.73s/it]


Unnamed: 0,prompt,response,source
0,<p>I installed Cassandra and Spark with SparkS...,It seems that the error is caused by the synta...,pacovaldez/stackoverflow-questions
1,"<p>I have an mnesia table with fields say f1, ...",You can use a match specification in mnesia to...,pacovaldez/stackoverflow-questions
2,<p>I have a form with an input of type field. ...,"As an AI language model, I cannot guarantee th...",pacovaldez/stackoverflow-questions
3,<p>Is there a way to change how fast the toolt...,"As an AI language model, I don't have personal...",pacovaldez/stackoverflow-questions
4,<p>I have configured the data source and set a...,One potential issue with the code is that the ...,pacovaldez/stackoverflow-questions


In [20]:
# Drop every row whose response contains the boilerplate phrase "As an AI language".
df_gpt4all = df_gpt4all.loc[~df_gpt4all["response"].str.contains("As an AI language")]
df_gpt4all.head()

Unnamed: 0,prompt,response,source
0,<p>I installed Cassandra and Spark with SparkS...,It seems that the error is caused by the synta...,pacovaldez/stackoverflow-questions
1,"<p>I have an mnesia table with fields say f1, ...",You can use a match specification in mnesia to...,pacovaldez/stackoverflow-questions
4,<p>I have configured the data source and set a...,One potential issue with the code is that the ...,pacovaldez/stackoverflow-questions
5,<p>I've installed Java 8 for development purpo...,"To switch back to Java 7 from Java 8, you need...",pacovaldez/stackoverflow-questions
6,<p>This code is an attempt to bind a command t...,"The code above does not work because the ""Esca...",pacovaldez/stackoverflow-questions


In [21]:
def _markdownify_if_possible(text):
    try:
        return markdownify(text)
    except:
        return None

In [22]:
# Convert prompts and responses from HTML to Markdown. `assign` builds a new
# frame instead of writing columns into the filtered slice produced earlier,
# which avoids pandas' chained-assignment hazard (SettingWithCopyWarning).
df_gpt4all = df_gpt4all.assign(
    prompt_markdown=df_gpt4all["prompt"].apply(_markdownify_if_possible),
    response_markdown=df_gpt4all["response"].apply(_markdownify_if_possible),
)
# Keep only rows where both conversions produced a non-empty result
# (a failed conversion is None, which is falsy just like an empty string).
df_gpt4all = df_gpt4all.loc[df_gpt4all["prompt_markdown"].apply(bool) & df_gpt4all["response_markdown"].apply(bool)]
df_gpt4all.head()

  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')


Unnamed: 0,prompt,response,source,prompt_markdown,response_markdown
0,<p>I installed Cassandra and Spark with SparkS...,It seems that the error is caused by the synta...,pacovaldez/stackoverflow-questions,I installed Cassandra and Spark with SparkSQL ...,It seems that the error is caused by the synta...
1,"<p>I have an mnesia table with fields say f1, ...",You can use a match specification in mnesia to...,pacovaldez/stackoverflow-questions,"I have an mnesia table with fields say f1, f2,...",You can use a match specification in mnesia to...
4,<p>I have configured the data source and set a...,One potential issue with the code is that the ...,pacovaldez/stackoverflow-questions,I have configured the data source and set auto...,One potential issue with the code is that the ...
5,<p>I've installed Java 8 for development purpo...,"To switch back to Java 7 from Java 8, you need...",pacovaldez/stackoverflow-questions,I've installed Java 8 for development purposes...,"To switch back to Java 7 from Java 8, you need..."
6,<p>This code is an attempt to bind a command t...,"The code above does not work because the ""Esca...",pacovaldez/stackoverflow-questions,This code is an attempt to bind a command to a...,"The code above does not work because the ""Esca..."


In [23]:
# Flatten GPT4All into the shared layout: one row per message, plus per-dialogue index lists.
df_gpt4all_texts = []
gpt4_all_indices = []
for _, row in df_gpt4all.iterrows():
    df_gpt4all_texts.append({"role": "prompter", "text": row["prompt_markdown"]})
    df_gpt4all_texts.append({"role": "assistant", "text": row["response_markdown"]})
    # The two messages just appended form one dialogue: [prompt position, answer position].
    gpt4_all_indices.append([len(df_gpt4all_texts) - 2, len(df_gpt4all_texts) - 1])
# The list of records becomes a frame whose default 0..N-1 index matches the positions above.
df_gpt4all_texts = pd.DataFrame.from_records(df_gpt4all_texts)

In [24]:
# Split whole dialogues into train/validation; 5% go to validation.
gpt4all_indices_train, gpt4all_indices_validation = train_test_split(
    gpt4_all_indices,
    random_state=RANDOM_STATE,
    test_size=0.05
)

In [25]:
df_gpt4all_texts.loc[gpt4all_indices_train[0]]

Unnamed: 0,role,text
635152,prompter,I have tried all the possible solutions availa...
635153,assistant,The error is occurring because the URL you hav...


### BookSum

In [26]:
# Load the BookSum dataset through the Hugging Face `datasets` loader.
dataset_booksum = load_dataset("kmfoda/booksum")
# Bare expression: display the available splits and their columns.
dataset_booksum

Found cached dataset csv (/home/alex4321/.cache/huggingface/datasets/kmfoda___csv/kmfoda--booksum-025141c210e07407/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 34.34it/s]


DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1484
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1431
    })
})

In [27]:
# The "test" split is folded into the training data; only "validation" is held out (see the next cell).
# reset_index(drop=True) gives the concatenated frame a fresh 0..N-1 index.
df_booksum_train = pd.concat([pd.DataFrame(dataset_booksum["train"]), pd.DataFrame(dataset_booksum["test"])]).reset_index(drop=True)
df_booksum_train.head()

Unnamed: 0,bid,is_aggregate,source,chapter_path,summary_path,book_id,summary_id,content,summary,chapter,chapter_length,summary_name,summary_url,summary_text,summary_analysis,summary_length,analysis_length
0,27681,True,cliffnotes,all_chapterized_books/27681-chapters/chapters_...,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapters 1-2,chapters 1-2,,"{""name"": ""Chapters 1-2"", ""url"": ""https://web.a...","\n ""Mine ear is open, and my heart prepared:\...",6471.0,Chapters 1-2,https://web.archive.org/web/20201101053205/htt...,"Before any characters appear, the time and geo...",These two chapters introduce the reader to the...,388.0,473.0
1,27681,False,cliffnotes,all_chapterized_books/27681-chapters/03.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 3,chapter 3,,"{""name"": ""Chapter 3"", ""url"": ""https://web.arch...","\n ""Before these fields were shorn and tilled...",3132.0,Chapter 3,https://web.archive.org/web/20201101053205/htt...,In another part of the forest by the river a f...,This chapter introduces the other three main a...,198.0,149.0
2,27681,False,cliffnotes,all_chapterized_books/27681-chapters/04.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 4,chapter 4,,"{""name"": ""Chapter 4"", ""url"": ""https://web.arch...","\n ""Well, go thy way: thou shalt not from thi...",3075.0,Chapter 4,https://web.archive.org/web/20201101053205/htt...,When the mounted party from Fort Howard approa...,Since this chapter is mostly one of surface ac...,319.0,75.0
3,27681,False,cliffnotes,all_chapterized_books/27681-chapters/05.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 5,chapter 5,,"{""name"": ""Chapter 5"", ""url"": ""https://web.arch...","\n ""In such a night\n Di...",3268.0,Chapter 5,https://web.archive.org/web/20201101053205/htt...,"The pursuit of Magua is unsuccessful, but Hawk...",Here the reader encounters the first bloodshed...,329.0,156.0
4,27681,False,cliffnotes,all_chapterized_books/27681-chapters/06.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 6,chapter 6,,"{""name"": ""Chapter 6"", ""url"": ""https://web.arch...","\n ""Those strains that once did sweet in Zion...",3873.0,Chapter 6,https://web.archive.org/web/20201101053205/htt...,Heyward and the girls are uneasy and Gamut is ...,This chapter shows Cooper in his most inventiv...,321.0,128.0


In [28]:
# Materialise the validation split as a DataFrame.
df_booksum_validation = pd.DataFrame(dataset_booksum["validation"])
df_booksum_validation.head()

Unnamed: 0,bid,is_aggregate,source,chapter_path,summary_path,book_id,summary_id,content,summary,chapter,chapter_length,summary_name,summary_url,summary_text,summary_analysis,summary_length,analysis_length
0,1023,True,gradesaver,all_chapterized_books/1023-chapters/chapters_1...,finished_summaries/gradesaver/Bleak House/sect...,Bleak House.chapters 1-4,chapters 1-4,,"{""name"": ""Chapters 1-4"", ""url"": ""https://web.a...","London. Michaelmas term lately over, and the L...",16554.0,Chapters 1-4,https://web.archive.org/web/20210421025427/htt...,"The scene opens in London on a foggy, smoggy d...",Dickens immediately plunges the readers in med...,635.0,536.0
1,1023,True,gradesaver,all_chapterized_books/1023-chapters/chapters_5...,finished_summaries/gradesaver/Bleak House/sect...,Bleak House.chapters 5-7,chapters 5-7,,"{""name"": ""Chapters 5-7"", ""url"": ""https://web.a...","Although the morning was raw, and although the...",16887.0,Chapters 5-7,https://web.archive.org/web/20210421025427/htt...,Caddy Jellyby has spent the night in Esther an...,The coincidence of Esther arriving at the home...,635.0,673.0
2,1023,True,gradesaver,all_chapterized_books/1023-chapters/chapters_8...,finished_summaries/gradesaver/Bleak House/sect...,Bleak House.chapters 8-10,chapters 8-10,,"{""name"": ""Chapters 8-10"", ""url"": ""https://web....",It was interesting when I dressed before dayli...,17028.0,Chapters 8-10,https://web.archive.org/web/20210421025427/htt...,Esther and Mr. Jarndyce discuss the Chancery s...,We find Esther settled comfortably at Bleak Ho...,609.0,363.0
3,1023,True,gradesaver,all_chapterized_books/1023-chapters/chapters_1...,finished_summaries/gradesaver/Bleak House/sect...,Bleak House.chapters 11-13,chapters 11-13,,"{""name"": ""Chapters 11-13"", ""url"": ""https://web...",A touch on the lawyer's wrinkled hand as he st...,15739.0,Chapters 11-13,https://web.archive.org/web/20210421025427/htt...,"Tulkinghorn and Krook find Nemo dead, apparent...","Nemo's death, so eerie and sordid, leaves the ...",484.0,345.0
4,1023,True,gradesaver,all_chapterized_books/1023-chapters/chapters_1...,finished_summaries/gradesaver/Bleak House/sect...,Bleak House.chapters 14-16,chapters 14-16,,"{""name"": ""Chapters 14-16"", ""url"": ""https://web...",Richard left us on the very next evening to be...,16636.0,Chapters 14-16,https://web.archive.org/web/20210421025427/htt...,Richard leaves to start his studies. He is hop...,"Again, Dickens treats the dissarray of badly p...",546.0,436.0


In [29]:
def booksum_process(df):
    """Convert BookSum rows into prompter/assistant message pairs.

    Args:
        df: DataFrame with a ``chapter`` text column and a ``summary`` column
            holding a JSON object that has a ``"summary"`` key.

    Returns:
        ``(messages_df, indices)``: a DataFrame with ``role`` and ``text``
        columns (two rows per input row) and a list of
        ``[prompt_position, answer_position]`` pairs into it.
    """
    messages = []
    dialogues = []
    for _, record in df.iterrows():
        chapter_text = record["chapter"].strip()
        summary_text = json.loads(record["summary"])["summary"].strip()
        # Position the prompt will occupy; the answer follows directly after it.
        prompt_position = len(messages)
        messages.extend([
            {"role": "prompter", "text": f"{chapter_text}\n\nGive me the summary of previous text"},
            {"role": "assistant", "text": summary_text},
        ])
        dialogues.append([prompt_position, prompt_position + 1])
    return pd.DataFrame.from_records(messages), dialogues

In [30]:
# Training messages and dialogue index pairs for BookSum.
df_booksum_texts_train, booksum_indices_train = booksum_process(df_booksum_train)

In [31]:
# Validation messages and dialogue index pairs for BookSum.
df_booksum_texts_validation, booksum_indices_validation = booksum_process(df_booksum_validation)

In [32]:
df_booksum_texts_train.loc[booksum_indices_train[0]]

Unnamed: 0,role,text
0,prompter,"""Mine ear is open, and my heart prepared:\n T..."
1,assistant,"Before any characters appear, the time and geo..."


In [33]:
len(booksum_indices_train)

11031

### GovReport

In [34]:
# Load the GovReport summarization dataset through the Hugging Face `datasets` loader.
dataset_govreport = load_dataset("ccdv/govreport-summarization")
# Bare expression: display the available splits and their columns.
dataset_govreport

No config specified, defaulting to: govreport-summarization/document
Found cached dataset govreport-summarization (/home/alex4321/.cache/huggingface/datasets/ccdv___govreport-summarization/document/1.0.0/57ca3042de9c40c218cc94084cbc80a99a161036134bfc88112c57d251443590)
100%|██████████| 3/3 [00:00<00:00, 29.56it/s]


DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})

In [35]:
# The "test" split is folded into the training data; only "validation" is held out.
df_govreport_train = pd.concat([pd.DataFrame(dataset_govreport["train"]), pd.DataFrame(dataset_govreport["test"])]).reset_index(drop=True)
df_govreport_validation = pd.DataFrame(dataset_govreport["validation"])

In [36]:
def govreport_process(df):
    """Convert GovReport rows into prompter/assistant message pairs.

    Args:
        df: DataFrame with ``report`` and ``summary`` text columns.

    Returns:
        ``(messages_df, indices)``: a DataFrame with ``role`` and ``text``
        columns (two rows per input row) and a list of
        ``[prompt_position, answer_position]`` pairs into it.
    """
    messages = []
    dialogues = []
    for _, record in df.iterrows():
        report_text = record["report"].strip()
        summary_text = record["summary"].strip()
        # Position the prompt will occupy; the answer follows directly after it.
        prompt_position = len(messages)
        messages.extend([
            {"role": "prompter", "text": f"{report_text}\n\nGive me the summary of previous text"},
            {"role": "assistant", "text": summary_text},
        ])
        dialogues.append([prompt_position, prompt_position + 1])
    return pd.DataFrame.from_records(messages), dialogues

In [37]:
# Messages and dialogue index pairs for the GovReport train and validation frames.
df_govreport_texts_train, govreport_indices_train = govreport_process(df_govreport_train)
df_govreport_texts_validation, govreport_indices_validation = govreport_process(df_govreport_validation)

In [38]:
df_govreport_texts_train.loc[govreport_indices_train[0]]

Unnamed: 0,role,text
0,prompter,The structure of the armed forces is based on ...
1,assistant,As the Department of Defense (DOD) has expande...


### QASper

In [39]:
# Load the QASPER dataset through the Hugging Face `datasets` loader.
dataset_qasper = load_dataset("allenai/qasper")
# Bare expression: display the available splits and their columns.
dataset_qasper

Found cached dataset qasper (/home/alex4321/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)
100%|██████████| 3/3 [00:00<00:00, 179.26it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 888
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 281
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 416
    })
})

In [40]:
def _qasper_article_text(data):
    section_texts = []
    for section_name, section_paragraphs in zip(data['section_name'], data['paragraphs']):
        if not section_name:
            section_name = ""
        section_name_levels = section_name.split(" ::: ")
        section_name_last = section_name_levels[-1]
        if section_name:
            section_name_prefix = "#" * len(section_name_levels)
            title = f"{section_name_prefix} {section_name_last}"
        else:
            title = ""
        section_content = "\n\n".join(section_paragraphs)
        section_texts.append(f"{title}\n\n{section_content}")
    return "\n\n".join(section_texts)

In [41]:
def _qasper_article_qa_pairs(full_text_data, qas_data):
    article_text = _qasper_article_text(full_text_data)

    question_texts = qas_data["question"]
    answers = qas_data["answers"]
    for question, question_answers in zip(question_texts, answers):
        for answer in question_answers["answer"]:
            answer_evidence = answer["highlighted_evidence"]
            answer_text = answer["free_form_answer"]
            if answer_evidence:
                answer_evidence_text = "\n".join([f"- {item}" for item in answer_evidence])
                answer_message_text = f"{answer_evidence_text}\n\n{answer_text}"
            else:
                answer_message_text = answer_text
            yield article_text, f"Keeping this in mind, answer the following question: {question}", answer_message_text.strip()

In [42]:
def quasper_qa_pairs(df):
    """Convert QASPER rows into article/question/answer message triples.

    Each paper contributes one "prompter" row with the article text, followed
    by a prompter (question) and an assistant (answer) row per QA pair. Every
    dialogue references the shared article row plus its own question and
    answer rows.

    Args:
        df: DataFrame with ``full_text`` and ``qas`` columns in the format
            expected by ``_qasper_article_qa_pairs``.

    Returns:
        ``(messages_df, indices)``: a DataFrame with ``role`` and ``text``
        columns and a list of ``[article, question, answer]`` row positions.
    """
    texts = []
    indices = []
    for _, row in df.iterrows():
        qa_triples = list(_qasper_article_qa_pairs(row["full_text"], row["qas"]))
        if not qa_triples:
            # A paper without QA pairs has no dialogue. Skipping it avoids
            # reusing the previous paper's article text (or hitting an
            # undefined variable on the very first row).
            continue
        # Every triple of one paper carries the same article text.
        article_text = qa_triples[0][0]

        index_article = len(texts)
        texts.append({"role": "prompter", "text": article_text})
        for _, question, answer in qa_triples:
            index_question = len(texts)
            texts.append({"role": "prompter", "text": question})
            texts.append({"role": "assistant", "text": answer})
            indices.append([index_article, index_question, index_question + 1])

    return pd.DataFrame.from_records(texts), indices

In [43]:
# The "test" split is folded into the training data; only "validation" is held out.
df_qasper_train = pd.concat([pd.DataFrame(dataset_qasper["train"]), pd.DataFrame(dataset_qasper["test"])]).reset_index(drop=True)
df_qasper_validation = pd.DataFrame(dataset_qasper["validation"])

In [44]:
# Messages and [article, question, answer] index triples for the QASPER train and validation frames.
df_qasper_texts_train, qasper_indices_train = quasper_qa_pairs(df_qasper_train)
df_qasper_texts_validation, qasper_indices_validation = quasper_qa_pairs(df_qasper_validation)

In [45]:
df_qasper_texts_train.loc[qasper_indices_train[0]]

Unnamed: 0,role,text
0,prompter,# Introduction\n\nAffective events BIBREF0 are...
1,prompter,"Keeping this in mind, answer the following que..."
2,assistant,- The seed lexicon consists of positive and ne...


In [46]:
print(df_qasper_texts_train.loc[qasper_indices_train[0]]["text"].values[-1])

- The seed lexicon consists of positive and negative predicates. If the predicate of an extracted event is in the seed lexicon and does not involve complex phenomena like negation, we assign the corresponding polarity score ($+1$ for positive events and $-1$ for negative events) to the event.
- It is a 

a vocabulary of positive and negative predicates that helps determine the polarity score of an event


In [47]:
print(df_qasper_texts_validation.loc[qasper_indices_validation[0]]["text"].values[-1])

- We compare our approaches with related approaches of pivoting, multilingual NMT (MNMT) BIBREF19, and cross-lingual transfer without pretraining BIBREF16. 
- The results show that our approaches consistently outperform other approaches across languages and datasets, especially surpass pivoting, which is a strong baseline in the zero-shot scenario that multilingual NMT systems often fail to beat BIBREF19, BIBREF20, BIBREF23.


In [48]:
print(df_qasper_texts_validation.loc[qasper_indices_validation[10]]["text"].values[-1])

- In order to evaluate the models trained on generated data, we manually annotated a named entities dataset comprising 53453 tokens and 2566 sentences selected from over 250 news texts from ilur.am.


In [49]:
len(qasper_indices_train)

6229

## Combine sets

In [None]:
def shift_indices(indices, offset):
    """Add ``offset`` to every position of every dialogue.

    Args:
        indices: list of dialogues, each a list of integer positions.
        offset: value added to each position.

    Returns:
        A new list of lists with the shifted positions; the input is not
        modified.
    """
    shifted = []
    for dialogue in indices:
        shifted.append(list(map(lambda position: position + offset, dialogue)))
    return shifted

In [None]:
# Accumulators filled by the cells below.
# NOTE: those cells append to these lists, so re-run this cell first if any of them is re-run.
texts = []  # message frames, in concatenation order
indices_train = []  # one list of shifted training dialogues per dataset
indices_validation = []  # one list of shifted validation dialogues per dataset
sources_train = []  # dataset name for each entry of indices_train
sources_validation = []  # dataset name for each entry of indices_validation

In [None]:
# The offset is the number of message rows collected so far, i.e. the row at which
# the frame appended next will start once all frames are concatenated.
# NOTE(review): the shift assumes the index lists hold 0-based row positions within
# their own frame - confirm this for the OpenAssistant frames, whose rows were filtered.
indices_train.append(shift_indices(
    oasst1_indices_train,
    sum(map(len, texts))
))
texts.append(df_oasst1_texts_train)
sources_train.append("openassistant")

# The validation frame follows the training frame, so its offset includes the training rows.
indices_validation.append(shift_indices(
    oasst1_indices_validation,
    sum(map(len, texts))
))
texts.append(df_oasst1_texts_validation)
sources_validation.append("openassistant")

In [None]:
# Train and validation dialogues index into the same Alpaca frame, so both use the
# same offset and the frame is appended only once, after both shifts.
indices_train.append(shift_indices(
    alpaca_indices_train,
    sum(map(len, texts))
))
sources_train.append("alpaca")

indices_validation.append(shift_indices(
    alpaca_indices_validation,
    sum(map(len, texts))
))
sources_validation.append("alpaca")

texts.append(df_alpaca_texts)

In [None]:
# Train and validation dialogues index into the same GPT4All frame, so both use the
# same offset and the frame is appended only once, after both shifts.
indices_train.append(shift_indices(
    gpt4all_indices_train,
    sum(map(len, texts))
))
sources_train.append("gpt4all")

indices_validation.append(shift_indices(
    gpt4all_indices_validation,
    sum(map(len, texts))
))
sources_validation.append("gpt4all")

texts.append(df_gpt4all_texts)

In [None]:
# BookSum has separate train and validation frames: each is shifted by the number of
# rows collected before it and then appended.
indices_train.append(shift_indices(
    booksum_indices_train,
    sum(map(len, texts))
))
texts.append(df_booksum_texts_train)
sources_train.append("booksum")

indices_validation.append(shift_indices(
    booksum_indices_validation,
    sum(map(len, texts))
))
texts.append(df_booksum_texts_validation)
sources_validation.append("booksum")

In [None]:
# GovReport has separate train and validation frames: each is shifted by the number of
# rows collected before it and then appended.
indices_train.append(shift_indices(
    govreport_indices_train,
    sum(map(len, texts))
))
texts.append(df_govreport_texts_train)
sources_train.append("govreport")

indices_validation.append(shift_indices(
    govreport_indices_validation,
    sum(map(len, texts))
))
texts.append(df_govreport_texts_validation)
sources_validation.append("govreport")

In [None]:
# QASPER has separate train and validation frames: each is shifted by the number of
# rows collected before it and then appended.
indices_train.append(shift_indices(
    qasper_indices_train,
    sum(map(len, texts))
))
texts.append(df_qasper_texts_train)
sources_train.append("qasper")

indices_validation.append(shift_indices(
    qasper_indices_validation,
    sum(map(len, texts))
))
texts.append(df_qasper_texts_validation)
sources_validation.append("qasper")

In [None]:
def get_indices_df(indices, sources):
    """Flatten per-dataset dialogue lists into one DataFrame.

    Args:
        indices: list with one entry per dataset, each a list of dialogues
            (lists of row positions).
        sources: dataset names, parallel to ``indices``.

    Returns:
        DataFrame with one row per dialogue and the columns ``indices`` (the
        dialogue's position list) and ``source`` (its dataset name).
    """
    return pd.DataFrame.from_records([
        {"indices": dialogue, "source": source_name}
        for dataset_dialogues, source_name in zip(indices, sources)
        for dialogue in dataset_dialogues
    ])

In [None]:
# One row per training dialogue: its row positions and the dataset it came from.
df_indices_train = get_indices_df(indices_train, sources_train)
df_indices_train.head()

In [None]:
# One row per validation dialogue: its row positions and the dataset it came from.
df_indices_validation = get_indices_df(indices_validation, sources_validation)
df_indices_validation.head()

In [None]:
# Stack all message frames in the order they were appended and renumber the rows 0..N-1,
# which is the numbering the shifted dialogue indices refer to.
df_texts = pd.concat(texts).reset_index(drop=True)
df_texts.head()

In [None]:
df_indices_train.groupby("source").head(1)

In [None]:
pd.set_option("display.max_colwidth", 256)

In [None]:
df_texts.loc[[0, 1, 2]]

In [None]:
df_texts.loc[[139334, 139335]]

In [None]:
df_texts.loc[[825928, 825929]]

In [None]:
df_texts.loc[[958912, 958913]]

In [None]:
df_texts.loc[[983942, 983943]]

In [None]:
df_texts.loc[[1022868, 1022869, 1022870]]

In [None]:
df_indices_validation.groupby("source").head(1)

In [None]:
df_texts.loc[[82483, 82484]]

In [None]:
df_texts.loc[[163170, 163171]]

In [None]:
df_texts.loc[[705722, 705723]]

In [None]:
df_texts.loc[[980974, 980975]]

In [None]:
df_texts.loc[[1020922, 1020923]]

In [None]:
df_texts.loc[[1036630, 1036631, 1036632]]

In [None]:
# Write the combined dataset to ./long-vicuna-set (created if missing).
os.makedirs("long-vicuna-set", exist_ok=True)
# Messages go to a gzip-compressed CSV; index=True keeps the row numbers the dialogue indices refer to.
df_texts.to_csv("long-vicuna-set/texts.gz", index=True, compression="gzip")
# The dialogue index frames are pickled, which keeps the list-valued "indices" column as Python lists.
df_indices_train.to_pickle("long-vicuna-set/indices-train.pkl")
df_indices_validation.to_pickle("long-vicuna-set/indices-validation.pkl")