# Generate Toki-Pona datasets

This notebook consolidates the data in each of the different folders into a single dataset. It generates a file of sentence translations between Toki Pona, English, and optionally Chinese; a file of sentences in Toki Pona; and a file containing entire documents in each language (if available).

In [60]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob

In [4]:
# Content-type labels stored in the `content_type` column of the tables below.
ENCYCLOPEDIA_ARTICLE = 'encyclopedia article'
ARTICLE = 'article'
BLOG_POST = 'blog post'
MAGAZINE = 'magazine'
BIBLE = 'biblical text'
STORY = 'story'
POEM = 'poem'
SCREENPLAY = 'screenplay'
BOOK = 'book'
CHAPTER = 'chapter'
ESSAY = 'essay'
CHAT = 'chat'
OTHER = 'other'

# Every known content type, in declaration order.
CONTENT_TYPES = [
    ENCYCLOPEDIA_ARTICLE, ARTICLE, BLOG_POST, MAGAZINE, BIBLE, STORY, POEM,
    SCREENPLAY, BOOK, CHAPTER, ESSAY, CHAT, OTHER,
]

# Source-format labels.
TEXT = 'text'
MARKDOWN = 'markdown'
IRC_LOG = 'irc log'

# Every known source format, in declaration order.
FORMATS = [TEXT, MARKDOWN, IRC_LOG]

# Empty output tables; later cells append rows to them.
sentence_translations = pd.DataFrame(columns=['id', 'tok', 'eng', 'cmn'])
sentences = pd.DataFrame(columns=['id', 'content_type', 'sentence'])
documents = pd.DataFrame(columns=['id', 'name', 'content_type', 'tok', 'eng', 'cmn'])
chapters = pd.DataFrame(columns=['id', 'name', 'chapter_number', 'content_type', 'tok', 'eng', 'cmn'])

## Sentence translations

Go through the files in the `phrases` folder and generate a file containing the sentence translations. These files are:

|File|Language|Description|Length|
|----|--------|-----------|------|
|`common.md`|Toki Pona and English|Common phrases and responses|~100 pairs|
|`common2.tsv`|Toki Pona and English|Common sentences|~2000 pairs|
|`tatoeba-dev.eng-toki.tsv`|Toki Pona and English|Some Tatoeba translations between Toki Pona and English ([from this dataset dated to 2021](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt/blob/main/dev/tatoeba-dev.eng-toki.tsv))|~8000 pairs|
|`tatoeba-test.eng-toki.tsv`|Toki Pona and English|Some Tatoeba translations between Toki Pona and English ([from this dataset dated to 2021](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt/blob/main/test/tatoeba-test.eng-toki.tsv))|~5000 pairs|
|`translations.tsv`|Toki Pona, English, and Chinese|Tatoeba translations between Toki Pona, English, and Chinese (dated 4/14/2023)|~33000 pairs|

In [49]:
# Load the common-sentence pairs: each row is "Toki Pona<TAB>English".
# The context manager closes the file even if reading raises.
with open(os.path.expanduser("phrases/common2.tsv"), "r", encoding="utf-8") as f:
    tsv = [line.strip().split("\t") for line in f]

# Append every pair to both tables; the new row's id is the table length before the append.
for tok, eng in tsv:
    sentences.loc[len(sentences)] = [len(sentences), OTHER, tok]
    sentence_translations.loc[len(sentence_translations)] = [len(sentence_translations), tok, eng, None]

print(len(sentences), len(sentence_translations))

1907 1907


In [50]:
# Load the Tatoeba dev split. Each row has four tab-separated fields: the first two
# are ignored, the third is English and the fourth is Toki Pona.
# The context manager closes the file even if reading raises.
with open(os.path.expanduser("phrases/tatoeba-dev.eng-toki.tsv"), "r", encoding="utf-8") as f:
    tsv = [line.strip().split("\t") for line in f]

# Append every pair to both tables; the new row's id is the table length before the append.
for _, _, eng, tok in tsv:
    sentences.loc[len(sentences)] = [len(sentences), OTHER, tok]
    sentence_translations.loc[len(sentence_translations)] = [len(sentence_translations), tok, eng, None]

print(len(sentences), len(sentence_translations))

10609 10609


In [51]:
# Load the Tatoeba test split. Each row has four tab-separated fields: the first two
# are ignored, the third is English and the fourth is Toki Pona.
# The context manager closes the file even if reading raises.
with open(os.path.expanduser("phrases/tatoeba-test.eng-toki.tsv"), "r", encoding="utf-8") as f:
    tsv = [line.strip().split("\t") for line in f]

# Append every pair to both tables; the new row's id is the table length before the append.
for _, _, eng, tok in tsv:
    sentences.loc[len(sentences)] = [len(sentences), OTHER, tok]
    sentence_translations.loc[len(sentence_translations)] = [len(sentence_translations), tok, eng, None]

print(len(sentences), len(sentence_translations))

15599 15599


In [52]:
# Load the three-language export. Rows are "<ignored>\tToki Pona\tEnglish\tChinese";
# the English and Chinese fields may be empty or missing.
# The context manager closes the file even if reading raises.
with open(os.path.expanduser("phrases/translations.tsv"), "r", encoding="utf-8") as f:
    tsv = [line.strip().split("\t") for line in f]

for row in tsv:
    # strip() above removes trailing tabs, so a row with empty trailing fields comes
    # back shorter than four columns. Rows without a Toki Pona field (e.g. blank
    # lines) are skipped instead of re-using the previous row's values.
    if len(row) < 2:
        continue

    # Pad missing trailing columns with None; anything past the fourth field is ignored.
    _, tok, eng, cmn = (row + [None, None])[:4]

    # Store missing translations as None rather than as empty strings.
    eng = eng or None
    cmn = cmn or None

    sentences.loc[len(sentences)] = [len(sentences), OTHER, tok]
    sentence_translations.loc[len(sentence_translations)] = [len(sentence_translations), tok, eng, cmn]

print(len(sentences), len(sentence_translations))

48740 48740


In [53]:
# Whitespace-delimited token totals. The first line counts the `sentences` table, the
# second counts each language column of `sentence_translations`.
# NOTE: tokens are split on whitespace only, so the Chinese figure counts
# space-separated chunks — presumably not real words; confirm before quoting it.
print('Toki Pona:', sentences['sentence'].str.split().str.len().sum())
print(
    'Toki Pona:', int(sentence_translations['tok'].str.split().str.len().sum()),
    'English:', int(sentence_translations['eng'].str.split().str.len().sum()),
    # Cast like the other totals: missing values make this column's sum a float.
    'Chinese:', int(sentence_translations['cmn'].str.split().str.len().sum()),
)

Toki Pona: 402085
Toki Pona: 402085 English: 254765 Chinese: 5761.0


In [59]:
# Write both sentence tables out as tab-separated files.
sentences.to_csv(os.path.expanduser("phrases/sentences.tsv"), sep='\t', index=False)
sentence_translations.to_csv(os.path.expanduser("phrases/sentence_translations.tsv"), sep='\t', index=False)

# Read each file back in...
sentences_copy = pd.read_csv(os.path.expanduser("phrases/sentences.tsv"), sep='\t')
sentence_translations_copy = pd.read_csv(os.path.expanduser("phrases/sentence_translations.tsv"), sep='\t')

# ...and check that the round trip left the data unchanged.
assert sentences.equals(sentences_copy)
assert sentence_translations.equals(sentence_translations_copy)

## Documents and translations

Go through the files in each of the folders and add each file's entire contents to the matching language column of the documents dataset. These files are in:

|Folder|Language|Description|Length|
|------|--------|-----------|------|
|`articles`|Toki Pona and English|Articles from Lipu Kule|Unknown|
|`chat`|Toki Pona and English|Chat logs from Unknown|Unknown|
|`comments`|Toki Pona|Comments on blog posts and reviews of books|Unknown|
|`dictionary`|Toki Pona and English|Toki Pona dictionary|Unknown|
|`encyclopedia`|Toki Pona|Articles from Wikipesija. The name of the document is the subject of the article.|Unknown|
|`magazines`|Toki Pona|Entire copies of Lipu Tenpo|Unknown|
|`stories`|Toki Pona and English|Stories in Toki Pona and English.|Unknown|
|`poems`|Toki Pona|Poems in Toki Pona.|Unknown|
|`screenplays`|Toki Pona and English|Screenplays and their translations.|Unknown|
|`bible`|Toki Pona and English|Texts relating to the bible.|Unknown|
|`livejournal-blog`|Toki Pona and English|Texts from LiveJournal blogs.|Unknown|

In [38]:
# Start from an empty documents table; read_files() below appends one row per file to it.
documents = pd.DataFrame(columns=['id', 'name', 'content_type', 'tok', 'eng', 'cmn'])

def get_files(dir, ext):
    """Compare the file names found in ``{dir}/tok`` and ``{dir}/eng``.

    Args:
        dir: Folder that holds the ``tok/`` and ``eng/`` sub-folders.
        ext: Extension (or a glob pattern such as ``"*"``) matched after the dot.

    Returns:
        A tuple of three sets of base file names (directory removed, extension
        kept): names present in both folders, names found only in ``tok/``,
        and names found only in ``eng/``.
    """
    # Collect the bare file names on each side.
    tok_names = {os.path.basename(path) for path in glob(os.path.expanduser(f"{dir}/tok/*.{ext}"))}
    eng_names = {os.path.basename(path) for path in glob(os.path.expanduser(f"{dir}/eng/*.{ext}"))}

    # Set algebra: common names, then the leftovers on each side.
    return tok_names & eng_names, tok_names - eng_names, eng_names - tok_names

def read_files(dir, content_type):
    """Append every document found under ``dir`` to the global ``documents`` table.

    Files present in both ``{dir}/tok`` and ``{dir}/eng`` (matched by file name) are
    stored with both texts. Files present only in ``{dir}/tok`` are stored with
    ``eng`` set to None. Files present only in ``{dir}/eng`` are ignored. An empty
    file is stored as None, and the ``cmn`` column is always None. The document name
    is the file name (extension kept) with underscores turned into spaces.

    Files are processed in sorted name order so that the ids assigned here are the
    same on every run; iterating the sets directly would follow string hash order,
    which can change between interpreter sessions.

    Prints the size of ``documents`` once all files have been added.

    Args:
        dir: Folder containing a ``tok/`` sub-folder and optionally an ``eng/`` one.
        content_type: Value stored in the ``content_type`` column of each new row.
    """
    def read_text(path):
        # Context manager so the handle is closed as soon as the text is read.
        with open(os.path.expanduser(path), "r", encoding="utf-8") as handle:
            text = handle.read()
        return None if text == '' else text

    def display_name(filename):
        # Collapse doubled underscores (two passes), then use spaces as separators.
        return os.path.basename(filename).replace('__', '_').replace('__', '_').replace('_', ' ')

    shared_files, tok_only_files, _ = get_files(dir, "*")

    # Documents available in both languages.
    for f in sorted(shared_files):
        tok = read_text(f"{dir}/tok/{f}")
        eng = read_text(f"{dir}/eng/{f}")
        documents.loc[len(documents)] = [len(documents), display_name(f), content_type, tok, eng, None]

    # Documents available in Toki Pona only.
    for f in sorted(tok_only_files):
        tok = read_text(f"{dir}/tok/{f}")
        documents.loc[len(documents)] = [len(documents), display_name(f), content_type, tok, None, None]

    print(len(documents))


In [39]:
# Folder -> content type, in the order the folders are loaded.
for folder, folder_content_type in [
    ("articles", ARTICLE),
    ("bible", BIBLE),
    ("chat", CHAT),
    ("comments", CHAT),
    ("dictionary", OTHER),
    ("encyclopedia", ENCYCLOPEDIA_ARTICLE),
    ("jan Kipu Corpus", OTHER),
    ("livejournal-blog", BLOG_POST),
    ("magazines", MAGAZINE),
    ("poems", POEM),
    ("screenplays", SCREENPLAY),
    ("stories", STORY),
]:
    read_files(folder, folder_content_type)

52
75
116
140
141
500
1840
1928
1946
2028
2029
2091


In [40]:
# Total whitespace-delimited tokens per language across all documents.
print(
    'Toki Pona:', int(documents['tok'].str.split().str.len().sum()),
    'English:', int(documents['eng'].str.split().str.len().sum()),
    # Cast like the other totals so the figure never prints as a float.
    'Chinese:', int(documents['cmn'].str.split().str.len().sum()),
)

Toki Pona: 1167019 English: 63119 Chinese: 0


In [63]:
# Write the documents table out as a tab-separated file.
documents.to_csv(os.path.expanduser("output-documents.tsv"), sep='\t', index=False)

# Read it back, turning the NaNs that pandas loads for empty cells back into None,
# and check that the round trip left the data unchanged.
documents_copy = pd.read_csv(os.path.expanduser("output-documents.tsv"), sep='\t').replace(np.nan, None)

assert documents.equals(documents_copy)

## Chapters and Translations

Go through the files of books and screenplays and add their corresponding translated chapters to this dataset. These files are:

|Folder|Language|Description|Length|
|------|--------|-----------|------|
|`bible`|Toki Pona and English|The Gospel of John|21 chapters|
|`screenplays`|Toki Pona and English|Monty Python and the Holy Grail|24 scenes|

In [71]:
# Start from an empty chapters table; the cells below append one row per chapter or scene.
chapters = pd.DataFrame(columns=['id', 'name', 'chapter_number', 'content_type', 'tok', 'eng', 'cmn'])

In [72]:
# Split the Gospel of John into chapters and pair each Toki Pona chapter with its
# English counterpart.

# Context managers make sure both files are closed once their text has been read.
with open(os.path.expanduser("bible/tok/gospel of john.txt"), "r", encoding="utf-8") as tok_file:
    tok_gospel_of_john = tok_file.read()
# Split on heading lines that start with "kipisi". Raw strings keep `\.` from being
# treated as an (invalid) string escape.
tok_gospel_of_john_chapters = re.split(r'\nkipisi[^\.\n]+\n', tok_gospel_of_john)

with open(os.path.expanduser("bible/eng/gospel of john.txt"), "r", encoding="utf-8") as eng_file:
    eng_gospel_of_john = eng_file.read()
# Split on "Chapter ..." heading lines, trim each piece and drop the empty ones.
eng_gospel_of_john_chapters = [
    piece
    for piece in (part.strip() for part in re.split(r'Chapter[^\.\n]+\n', eng_gospel_of_john))
    if piece != ''
]

# zip() would silently drop unmatched chapters, so insist that both sides line up.
assert len(tok_gospel_of_john_chapters) == len(eng_gospel_of_john_chapters), (
    f"chapter count mismatch: {len(tok_gospel_of_john_chapters)} tok vs {len(eng_gospel_of_john_chapters)} eng"
)

for i, (tok, eng) in enumerate(zip(tok_gospel_of_john_chapters, eng_gospel_of_john_chapters)):
    chapters.loc[len(chapters)] = [len(chapters), 'gospel of john', i + 1, BIBLE, tok, eng, None]

print(len(tok_gospel_of_john_chapters), len(eng_gospel_of_john_chapters))

21 21


In [73]:
# Split the Monty Python screenplay into scenes and pair each Toki Pona scene with
# its English counterpart.

# Context managers make sure both files are closed once their text has been read.
with open(os.path.expanduser("screenplays/tok/monty python.txt"), "r", encoding="utf-8") as tok_file:
    tok_monty_python = tok_file.read()
# Split on heading lines that start with "kipisi". Raw strings keep `\.` and `\d`
# from being treated as (invalid) string escapes.
tok_monty_python_chapters = re.split(r'\nkipisi[^\.\n]+\n', tok_monty_python)

with open(os.path.expanduser("screenplays/eng/monty python.txt"), "r", encoding="utf-8") as eng_file:
    eng_monty_python = eng_file.read()
# Split on "Scene <number>" heading lines, trim each piece and drop the empty ones.
eng_monty_python_chapters = [
    piece
    for piece in (part.strip() for part in re.split(r'Scene \d+\n', eng_monty_python))
    if piece != ''
]

# zip() would silently drop unmatched scenes, so insist that both sides line up.
assert len(tok_monty_python_chapters) == len(eng_monty_python_chapters), (
    f"scene count mismatch: {len(tok_monty_python_chapters)} tok vs {len(eng_monty_python_chapters)} eng"
)

for i, (tok, eng) in enumerate(zip(tok_monty_python_chapters, eng_monty_python_chapters)):
    chapters.loc[len(chapters)] = [len(chapters), 'Monty Python and the Holy Grail', i + 1, SCREENPLAY, tok, eng, None]

print(len(tok_monty_python_chapters), len(eng_monty_python_chapters))

24 24


In [74]:
# Total whitespace-delimited tokens per language across all chapters.
print(
    'Toki Pona:', int(chapters['tok'].str.split().str.len().sum()),
    'English:', int(chapters['eng'].str.split().str.len().sum()),
    # Cast like the other totals so the figure never prints as a float.
    'Chinese:', int(chapters['cmn'].str.split().str.len().sum()),
)

Toki Pona: 34077 English: 29438 Chinese: 0


In [75]:
# Write the chapters table out as a tab-separated file.
chapters.to_csv(os.path.expanduser("chapters.tsv"), sep='\t', index=False)

# Read it back and turn the NaNs that pandas loads for empty cells back into None.
chapters_copy = pd.read_csv(os.path.expanduser("chapters.tsv"), sep='\t')
chapters_copy = chapters_copy.replace(np.nan, None)

# Compare the original with the reloaded copy (not with itself) so the check can fail.
assert chapters.equals(chapters_copy), "chapters.tsv does not round-trip to the in-memory table"