## Setup Prompts

Take the data and turn it into text prompts for the transformer to generate and complete. The data is stored in the following files:

|File|Description|Columns|
|-|-|-|
|`documents.tsv`|Entire documents, including articles, books, screenplays, etc., in Toki Pona and their translations.|`name`: the name of the content. `content_type`: the type of content (such as `"biblical text"`). `tok`: the Toki Pona content string. `eng`: the English content string. `cmn`: the Chinese content string.|
|`chapters.tsv`|Chapters in Toki Pona and their translations.|`name`: the name of the content. `chapter_number`: the number of this chapter. `content_type`: the type of content (such as `"biblical text"`). `tok`: the Toki Pona content string. `eng`: the English content string. `cmn`: the Chinese content string.|
|`sentence_translations.tsv`|Translations of sentences in Toki Pona, Chinese, and English.|`tok`: the Toki Pona sentence. `eng`: the English sentence. `cmn`: the Chinese sentence.|
|`sentences.tsv`|A collection of miscellaneous sentences in Toki Pona.|`content_type`: the type of content associated with the sentence. `sentence`: the sentence in Toki Pona.|

In [1]:
!pip install pandas numpy datasets transformers torch huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multip

In [2]:
# Fetch the dataset repository, copy its processed files into the working
# directory, then delete the clone so only the processed files remain.
!git clone https://github.com/adam-mcdaniel/toki-pona-dataset
!cp toki-pona-dataset/processed/* .
!rm -Rf toki-pona-dataset

Cloning into 'toki-pona-dataset'...
remote: Enumerating objects: 2102, done.[K
remote: Counting objects: 100% (2102/2102), done.[K
remote: Compressing objects: 100% (2048/2048), done.[K
remote: Total 2102 (delta 49), reused 2092 (delta 39), pack-reused 0[K
Receiving objects: 100% (2102/2102), 5.73 MiB | 12.45 MiB/s, done.
Resolving deltas: 100% (49/49), done.


In [3]:
# Log in to the Hugging Face Hub from inside the notebook; the training cell
# is configured with push_to_hub=True and calls trainer.push_to_hub().
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
import pandas as pd
import numpy as np
import math
from random import choice
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import ClassLabel
import random
from copy import deepcopy
from IPython.display import display, HTML

# Genre labels that the collators compare against a row's `content_type`.
ENCYCLOPEDIA_ARTICLE = 'encyclopedia article'
ARTICLE = 'article'
BLOG_POST = 'blog post'
MAGAZINE = 'magazine'
BIBLE = 'biblical text'
STORY = 'story'
POEM = 'poem'
SCREENPLAY = 'screenplay'
BOOK = 'book'
CHAPTER = 'chapter'
ESSAY = 'essay'
CHAT = 'chat'
OTHER = 'other'

# Every known content type, in declaration order.
CONTENT_TYPES = [
    ENCYCLOPEDIA_ARTICLE,
    ARTICLE,
    BLOG_POST,
    MAGAZINE,
    BIBLE,
    STORY,
    POEM,
    SCREENPLAY,
    BOOK,
    CHAPTER,
    ESSAY,
    CHAT,
    OTHER,
]

# Labels for the supported text formats.
TEXT = 'text'
MARKDOWN = 'markdown'
IRC_LOG = 'irc log'

# Every known format, in declaration order.
FORMATS = [TEXT, MARKDOWN, IRC_LOG]

In [17]:
def collate(toki_pona_text, english_text, content_type, title=None):
    """Yield training examples (dicts with a single 'text' key) for one piece of content.

    Examples are emitted in this order:

    1. the raw Toki Pona text, three times;
    2. instruction-style prompts ("<header>\\n=====\\n<text>") whose headers
       depend on `content_type` (nothing for unrecognised types);
    3. when `english_text` is truthy, translation pairs joined by
       "\\n>>>>>\\n" in both directions, each emitted twice.

    `title` is accepted but not used.
    """
    # Plain language-modelling examples.
    for _ in range(3):
        yield {'text': toki_pona_text}

    # Pick a random verb for the Toki Pona and English instruction headers.
    toki_pona_verb = choice(['toki', 'sitelen'])
    english_verb = choice(['write', 'tell me', 'show me', 'write me', 'tell'])

    # Instruction headers for each content type, listed in emission order.
    headers_by_type = (
        (STORY, (f'{english_verb} a story', 'o pali e lipu musi')),
        (ARTICLE, (f'{english_verb} an article', 'o pali e lipu pi pana sona')),
        (POEM, (f'{english_verb} a poem', 'o pali e toki musi')),
        (BIBLE, ('o toki e toki sewi', f'{english_verb} from the bible')),
        (SCREENPLAY, (f'{english_verb} a screenplay', f'o {toki_pona_verb} e lipu musi')),
        (BOOK, (f'{english_verb} a book', f'o {toki_pona_verb} e lipu')),
        (MAGAZINE, (f'{english_verb} a magazine article', f'o {toki_pona_verb} e lipu tenpo')),
        (CHAT, (f'{english_verb} a chat',)),
    )
    for known_type, headers in headers_by_type:
        if content_type == known_type:
            for header in headers:
                yield {'text': f'{header}\n=====\n{toki_pona_text}'}
            # Only the first matching type contributes prompts.
            break

    # Translation examples, only when an English rendering is available.
    if english_text:
        tok_to_eng = f'{toki_pona_text}\n>>>>>\n{english_text}'
        eng_to_tok = f'{english_text}\n>>>>>\n{toki_pona_text}'
        for _ in range(2):
            yield {'text': tok_to_eng}
            yield {'text': eng_to_tok}

def collate_documents():
    """Yield every example that `collate` produces for the rows of documents.tsv."""
    # Load the tab-separated table and convert NaNs to None.
    table = pd.read_csv("documents.tsv", sep='\t').replace(np.nan, None)

    for _, record in table.iterrows():
        genre = record['content_type']  # the genre of content
        tok = record['tok']             # Toki Pona text
        eng = record['eng']             # English text

        yield from collate(tok, eng, genre)

def collate_chapters():
    """Yield every example that `collate` produces for the rows of chapters.tsv."""
    # Load the tab-separated table and convert NaNs to None.
    table = pd.read_csv("chapters.tsv", sep='\t').replace(np.nan, None)

    for _, record in table.iterrows():
        genre = record['content_type']  # the genre of content
        tok = record['tok']             # Toki Pona text
        eng = record['eng']             # English text

        yield from collate(tok, eng, genre)

def collate_sentences():
    """Yield examples for the sentence-level files.

    First, every example `collate` produces for each row of
    sentence_translations.tsv (with no content type); then one plain
    {'text': ...} example per row of sentences.tsv.
    """
    # Translated sentences: load the table and convert NaNs to None.
    translated = pd.read_csv("sentence_translations.tsv", sep='\t').replace(np.nan, None)
    for _, record in translated.iterrows():
        tok = record['tok']  # Toki Pona text
        eng = record['eng']  # English text

        yield from collate(tok, eng, None)

    # Toki Pona-only sentences are emitted as-is, one example per row.
    monolingual = pd.read_csv("sentences.tsv", sep='\t').replace(np.nan, None)
    for _, record in monolingual.iterrows():
        yield {'text': record['sentence']}


In [18]:
def collate_all():
    """Yield all training examples: documents first, then chapters, then sentences."""
    yield from collate_documents()
    yield from collate_chapters()
    yield from collate_sentences()

# One fixed seed keeps both the randomly chosen prompt verbs (picked with
# `random.choice` inside `collate`) and the train/test split reproducible.
SEED = 42
random.seed(SEED)

# Materialise every example produced by the collators into a single Dataset.
full_dataset = Dataset.from_generator(collate_all)

# Hold out 10% for evaluation. train_test_split already returns a DatasetDict
# with "train" and "test" splits, so it is used directly.
datasets = full_dataset.train_test_split(test_size=0.1, seed=SEED)
train_dataset, test_dataset = datasets["train"], datasets["test"]
print(len(train_dataset), len(test_dataset))


Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-8e9834eda48c531d/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-8e9834eda48c531d/0.0.0. Subsequent calls will reuse this data.
347703 38634


In [20]:
# Name of the pretrained checkpoint passed to `from_pretrained` for both the tokenizer and the model.
model_to_finetune = "distilgpt2"

In [None]:
# Load the tokenizer (fast implementation requested) and the causal language
# model for the checkpoint named by `model_to_finetune`.
tokenizer = AutoTokenizer.from_pretrained(model_to_finetune, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_to_finetune)

In [23]:
def split_into_sequences(inputs, max_sequence_length=128):
    """Concatenate a batch of tokenized texts and cut it into fixed-length sequences.

    Args:
        inputs: Mapping from column name to a list of per-example lists.
            It must contain an "input_ids" column.
        max_sequence_length: Number of items in every emitted sequence.
            Defaults to 128.

    Returns:
        A dict with the same columns as `inputs`, each holding a list of
        chunks of exactly `max_sequence_length` items, plus a "labels" column
        that is a shallow copy of the "input_ids" chunks. Items left over
        after the last full chunk are dropped; the number of chunks is
        derived from the length of the first column.

    Raises:
        ValueError: If `max_sequence_length` is not positive.
        KeyError: If `inputs` has no "input_ids" column.
    """
    if max_sequence_length <= 0:
        raise ValueError("max_sequence_length must be a positive integer")

    # Flatten every column into one long list spanning the whole batch.
    collated = {
        column: [item for example in inputs[column] for item in example]
        for column in inputs.keys()
    }

    # Round the flattened length of the first column down to a whole number of chunks.
    first_column = next(iter(inputs))
    usable_length = len(collated[first_column]) // max_sequence_length * max_sequence_length

    # Slice each flattened column into consecutive, equally sized chunks.
    result = {
        column: [
            flat[start:start + max_sequence_length]
            for start in range(0, usable_length, max_sequence_length)
        ]
        for column, flat in collated.items()
    }

    # For causal language modelling the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Tokenize the "text" column in batches using 4 worker processes, dropping the raw text afterwards.
tokenized = datasets.map(lambda x: tokenizer(x["text"]), batched=True, num_proc=4, remove_columns=["text"])
# Regroup the tokenized examples, 1000 at a time, into fixed-length sequences with labels.
sequences = tokenized.map(split_into_sequences, batched=True, batch_size=1000, num_proc=4)

In [None]:
# Fine-tuning configuration: evaluate once per epoch and push to the Hub.
training_args = TrainingArguments(
    "toki-pona-better",
    evaluation_strategy="epoch",
    learning_rate=0.00002,
    warmup_steps=500,
    num_train_epochs=6,
    weight_decay=0.01,
    push_to_hub=True,
    save_total_limit=10,
)
args = training_args

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=sequences["train"],
    eval_dataset=sequences["test"],
)

# Train, upload the result, then evaluate on the held-out split.
trainer.train()
trainer.push_to_hub()
eval_results = trainer.evaluate()

# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_results['eval_loss'])
print('Perplexity =', perplexity)