In [1]:
from pathlib import Path
import pandas as pd
import json
import random

# Seed Python's global `random` generator so that the `random.shuffle` call
# used further down to sample the MultiSim items is reproducible.
random.seed(42)

In [2]:
# Prompt templates for the text-simplification task, one per language.
# Every template wraps the instruction in "### Human: ... ### Assistant: " turn
# markers and exposes a single `{complex_sentence}` placeholder that is filled
# in with `str.format` when the dataset items are built.

# English
ts_task_description_en = """### Human: Please rewrite the following complex sentence in order to make it easier to understand.
You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones.
The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

Complex sentence: {complex_sentence}

### Assistant: """

# German
ts_task_description_de = """### Human: Bitte schreibe den folgenden komplexen Satz so um, dass er leichter zu verstehen ist.
Du kannst dies tun, indem du komplexe Wörter durch einfachere Synonyme ersetzt (d.h. Paraphrasierung), unwichtige Informationen streichst (d.h. Komprimierung) und/oder einen langen komplexen Satz in mehrere einfachere Sätze aufteilst.
Der endgültige vereinfachte Satz muss grammatikalisch einwandfrei und flüssig sein und die Hauptgedanken des ursprünglichen Satzes beibehalten, ohne dessen Bedeutung zu verändern.

Komplexer Satz: {complex_sentence}

### Assistant: """

# French
ts_task_description_fr = """### Human: Veuillez réécrire la phrase complexe suivante afin de la rendre plus facile à comprendre.
Vous pouvez le faire en remplaçant les mots complexes par des synonymes plus simples (paraphrase), en supprimant les informations non importantes (compression) et/ou en divisant une longue phrase complexe en plusieurs phrases plus simples.
La phrase simplifiée finale doit être grammaticale, fluide et conserver les idées principales de sa contrepartie originale sans en modifier le sens.

Phrase complexe: {complex_sentence}

### Assistant: """

# Italian
ts_task_description_it = """### Human: Si prega di riscrivere la seguente frase complessa per renderla più comprensibile.
Per farlo, si possono sostituire parole complesse con sinonimi più semplici (parafrasare), eliminare informazioni non importanti (compressione) e/o suddividere una frase lunga e complessa in molteplici frasi più semplici.
La frase semplificata finale deve essere grammaticalmente scorrevole e mantenere le nozioni principali della sua controparte originale senza alterarne il significato.

Frase complessa: {complex_sentence}

### Assistant: """

# Russian -- marked "to check" by the author
# NOTE(review): presumably the translation still needs to be verified -- confirm.
ts_task_description_ru = """### Human: Перепиши следующее сложное предложение, чтобы сделать его более понятным.
Для этого можно заменить сложные слова более простыми синонимами (перефразирование), удалить несущественную информацию (сжатие) и/или разбить длинное сложное предложение на несколько более простых.
Итоговое упрощенное предложение должно быть грамматически правильным, беглым и сохранять основные идеи своего оригинала без изменения его смысла.

Сложное предложение: {complex_sentence}

### Assistant: """

# Brazilian Portuguese -- marked "to check" by the author
# NOTE(review): presumably the translation still needs to be verified -- confirm.
ts_task_description_pt = """### Human: Reescreva a frase complexa a seguir para facilitar a compreensão.
Você pode fazer isso substituindo palavras complexas por sinônimos mais simples (ou seja, parafraseando), excluindo informações sem importância (ou seja, comprimindo) e/ou dividindo uma frase longa e complexa em várias frases mais simples.
A frase final simplificada precisa ser gramatical, fluente e manter as ideias principais de sua contraparte original sem alterar seu significado.

Frase complexa: {complex_sentence}

### Assistant: """

# Japanese
ts_task_description_ja = """### Human: 次の複雑な文章を、より理解しやすく書き直してください。
複雑な単語をより簡単な同義語に置き換える（パラフレーズ）、重要でない情報を削除する（圧縮）、または長く複雑な文をいくつかの簡単な文に分けることで行えます。
最終的な簡略化された文章は、文法的に正しく、流暢で、元の文章の主要な内容を変えずに保持する必要があります。

複雑な文章：{complex_sentence}

### Assistant: """

# Slovene (the first two lines of this template end with a trailing space)
ts_task_description_sl = """### Human: Prosim, prepišite naslednji zapleten stavek, da bo lažje razumljiv. 
To lahko storite tako, da nadomestite zapletene besede s preprostejšimi sopomenkami (t.j. parafraziranje), odstranite nepomembne informacije (t.j. stiskanje) in/ali razdelite dolg in zapleten stavek na več preprostejših stavkov. 
Končni poenostavljeni stavek mora biti slovnično pravilen, tekoč in ohraniti glavne ideje izvirnika brez spreminjanja njegovega pomena.

Zapleten stavek: {complex_sentence}

### Assistant: """

In [3]:

# Build the MultiSim evaluation files.
#
# For every language the dataset CSVs are read, each row is turned into a pair
# of items (same source/targets, English prompt vs. prompt in the language of
# the sentence), the pairs are shuffled and truncated, and two JSON-lines files
# are written:
#   en-{lang}.json      -> items with the English prompt
#   {lang}-{lang}.json  -> items with the prompt in the sentence's language

data_path = Path('/home/user/kew/projects/MultiSim/data')
output_path = Path('../data/multisim/')
output_path.mkdir(exist_ok=True, parents=True)

# language code -> (name of the language directory below data_path, prompt template)
langs = {
    "en": ("English", ts_task_description_en),
    "de": ("German", ts_task_description_de),
    "fr": ("French", ts_task_description_fr),
    "it": ("Italian", ts_task_description_it),
    "ru": ("Russian", ts_task_description_ru),
    "pt": ("Brazilian Portuguese", ts_task_description_pt),
    "sl": ("Slovene", ts_task_description_sl),
    "ja": ("Japanese", ts_task_description_ja),
    # "eu": ("Basque", ts_task_description_eu),
}

# Languages for which the '_train' CSVs are read as well; for all other
# languages the '_train' files are skipped.
# NOTE(review): presumably these languages have too few test/val items -- confirm.
LANGS_WITH_TRAIN = ['de', 'pt', 'sl']

# Upper bound on the number of items written per language (applied after shuffling).
MAX_ITEMS_PER_LANG = 1371

for lang in langs:
    lang_dir_name, x_template = langs[lang]
    tgt_dir = data_path / lang_dir_name

    en_outfile = output_path / f'en-{lang}.json'
    x_outfile = output_path / f'{lang}-{lang}.json'

    # list of (item with English prompt, item with `lang` prompt) pairs
    items = []

    for dataset_split in sorted(tgt_dir.glob('*.csv')):
        if '_train' in dataset_split.name and lang not in LANGS_WITH_TRAIN:
            continue

        print(f'Processing {dataset_split}')
        df = pd.read_csv(dataset_split)

        # every column except 'original' is treated as a reference simplification
        target_cols = [col for col in df.columns if col != 'original']

        for idx, row in df.iterrows():
            # keep only non-empty string cells (missing values are not strings)
            candidates = [
                row[col].strip()
                for col in target_cols
                if isinstance(row[col], str) and row[col].strip()
            ]

            # de-duplicate while keeping the column order; a plain set() would
            # order the targets by string hash, which differs between runs
            targets = list(dict.fromkeys(candidates))

            # fields shared by both prompt variants of this row
            shared_fields = {
                'source': row['original'],
                'target': targets,
                'id': idx,
                'dataset': dataset_split.name,
            }

            en_item = {
                'instruction': ts_task_description_en.format(complex_sentence=row['original']),
                **shared_fields,
            }
            x_item = {
                'instruction': x_template.format(complex_sentence=row['original']),
                **shared_fields,
            }

            items.append((en_item, x_item))

    # shuffle with the global `random` generator, then keep the first
    # MAX_ITEMS_PER_LANG pairs
    random.shuffle(items)
    items = items[:MAX_ITEMS_PER_LANG]

    outputs = [(en_outfile, [en_item for en_item, _ in items])]
    # for lang == 'en' both names point to the same file (and the items are
    # identical), so it is written once instead of being opened twice at once
    if x_outfile != en_outfile:
        outputs.append((x_outfile, [x_item for _, x_item in items]))

    for outfile, records in outputs:
        with open(outfile, 'w', encoding='utf8') as out_f:
            for record in records:
                out_f.write(f'{json.dumps(record, ensure_ascii=False)}\n')
        print(f'Wrote {len(records)} items to {outfile}')

Processing /home/user/kew/projects/MultiSim/data/English/ASSET_test.csv
Processing /home/user/kew/projects/MultiSim/data/English/ASSET_val.csv
Processing /home/user/kew/projects/MultiSim/data/English/WikiAuto_test.csv
Processing /home/user/kew/projects/MultiSim/data/English/WikiAuto_val.csv
Wrote 1371 items to ../data/multisim/en-en.json
Wrote 1371 items to ../data/multisim/en-en.json
Processing /home/user/kew/projects/MultiSim/data/German/GEOLino Corpus_test.csv
Processing /home/user/kew/projects/MultiSim/data/German/GEOLino Corpus_train.csv
Processing /home/user/kew/projects/MultiSim/data/German/GEOLino Corpus_val.csv
Processing /home/user/kew/projects/MultiSim/data/German/TextComplexityDE Parallel Corpus_test.csv
Processing /home/user/kew/projects/MultiSim/data/German/TextComplexityDE Parallel Corpus_train.csv
Processing /home/user/kew/projects/MultiSim/data/German/TextComplexityDE Parallel Corpus_val.csv
Wrote 1371 items to ../data/multisim/en-de.json
Wrote 1371 items to ../data/mu

In [4]:
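# # prepare data from multisim (disabled earlier variant of the cell above: it skips the '_train' splits for every language and keeps at most 1000 items per language)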

# data_path = Path('/home/user/kew/projects/MultiSim/data')
# output_path = Path('../data/multisim/')
# output_path.mkdir(exist_ok=True, parents=True)

# langs = {
#     "en": ("English", ts_task_description_en),
#     "de": ("German", ts_task_description_de),
#     "fr": ("French", ts_task_description_fr),
#     "it": ("Italian", ts_task_description_it),
#     "ru": ("Russian", ts_task_description_ru),
#     "pt": ("Brazilian Portuguese", ts_task_description_pt),
#     "sl": ("Slovene", ts_task_description_sl),
#     "ja": ("Japanese", ts_task_description_ja),
#     # "eu": ("Basque", ts_task_description_eu),
# }

# for lang in langs:
#     tgt_dir = data_path / langs[lang][0]

#     en_outfile = output_path / f'en-{lang}.json'
#     x_outfile = output_path / f'{lang}-{lang}.json'
    
#     c = 0

#     items = []
    
#     for dataset_split in sorted(tgt_dir.glob('*.csv')):
#         if '_train' in dataset_split.name:
#             pass
#         else:
#             print(f'Processing {dataset_split}')
#             df = pd.read_csv(dataset_split)
            
#             # # drop rows that are empty
#             # for col in df.columns:
#             #     df = df[df[col].notna()]

#             # print(f'Loaded {len(df)} rows')
#             # print(f'Columns: {df.columns}')

#             # write to json
#             for idx, row in df.iterrows():
#                 # gather targets from all columns except 'original' provided they are not empty
#                 targets = [row[col].strip() for col in df.columns if col != 'original' and isinstance(row[col], str) and row[col].strip()]

#                 # ensure targets are unique
#                 targets = list(set(targets))

#                 en_item = {
#                     'instruction': ts_task_description_en.format(complex_sentence=row['original']),
#                     'source': row['original'],
#                     'target': targets,
#                     'id': idx,
#                     'dataset': dataset_split.name,
#                 }

#                 x_item = {
#                     'instruction': langs[lang][1].format(complex_sentence=row['original']),
#                     'source': row['original'],
#                     'target': targets,
#                     'id': idx,
#                     'dataset': dataset_split.name,
#                 }

#                 items.append((en_item, x_item))
    
#     # shuffle items
#     random.shuffle(items)

#     # truncate to 1000 items
#     items = items[:1000]

#     with open(en_outfile, 'w', encoding='utf8') as en_out_f:
#         with open(x_outfile, 'w', encoding='utf8') as x_out_f:
#             for item in items:
#                 en_out_f.write(f'{json.dumps(item[0], ensure_ascii=False)}\n')
#                 x_out_f.write(f'{json.dumps(item[1], ensure_ascii=False)}\n')
#             print(f'Wrote {len(items)} items to {en_outfile}')
#             print(f'Wrote {len(items)} items to {x_outfile}')
        



In [5]:
# alector
# Merge the 'test' and 'valid' splits of the dataset in `input_dir` into one
# JSON-lines file; each item holds the English prompt, the complex sentence and
# its simplification.

input_dir = '/home/user/kew/projects/muss_original/resources/datasets/alector'

output_file = '../data/ts/alector.json'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# read data: the lines of {split}.complex and {split}.simple are paired by position
src_texts = []
tgt_texts = []
for split in ['test', 'valid']:
    # explicit UTF-8 so that decoding does not depend on the platform default
    with open(f'{input_dir}/{split}.complex', encoding='utf8') as src_f:
        src_texts.extend(line.strip() for line in src_f)
    with open(f'{input_dir}/{split}.simple', encoding='utf8') as tgt_f:
        tgt_texts.extend(line.strip() for line in tgt_f)

assert len(src_texts) == len(tgt_texts), (
    f'{len(src_texts)} complex vs. {len(tgt_texts)} simple lines in {input_dir}'
)

# write lines to jsonl
# ensure_ascii=False emits non-ASCII characters unescaped, so the output file
# is opened as UTF-8 explicitly
c = 0
with open(output_file, 'w', encoding='utf8') as out_f:
    for src, tgt in zip(src_texts, tgt_texts):
        formatted_item = {
            # .strip() also drops the trailing space after '### Assistant:'
            'instruction': ts_task_description_en.format(complex_sentence=src).strip(),
            'source': src,
            'target': tgt,
        }
        out_f.write(f'{json.dumps(formatted_item, ensure_ascii=False)}\n')
        c += 1
print(f'Wrote {c} items to {output_file}')

Wrote 472 items to ../data/ts/alector.json


In [6]:
# cefet
# Merge the 'test' and 'valid' splits of the dataset in `input_dir` into one
# JSON-lines file; each item holds the English prompt, the complex sentence and
# its simplification.

input_dir = '/home/user/kew/projects/muss_original/resources/datasets/cefet_small/'

output_file = '../data/ts/cefet.json'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# read data: the lines of {split}.complex and {split}.simple are paired by position
src_texts = []
tgt_texts = []
for split in ['test', 'valid']:
    # explicit UTF-8 so that decoding does not depend on the platform default
    with open(f'{input_dir}/{split}.complex', encoding='utf8') as src_f:
        src_texts.extend(line.strip() for line in src_f)
    with open(f'{input_dir}/{split}.simple', encoding='utf8') as tgt_f:
        tgt_texts.extend(line.strip() for line in tgt_f)

assert len(src_texts) == len(tgt_texts), (
    f'{len(src_texts)} complex vs. {len(tgt_texts)} simple lines in {input_dir}'
)

# write lines to jsonl
# ensure_ascii=False emits non-ASCII characters unescaped, so the output file
# is opened as UTF-8 explicitly
c = 0
with open(output_file, 'w', encoding='utf8') as out_f:
    for src, tgt in zip(src_texts, tgt_texts):
        formatted_item = {
            # .strip() also drops the trailing space after '### Assistant:'
            'instruction': ts_task_description_en.format(complex_sentence=src).strip(),
            'source': src,
            'target': tgt,
        }
        out_f.write(f'{json.dumps(formatted_item, ensure_ascii=False)}\n')
        c += 1
print(f'Wrote {c} items to {output_file}')

Wrote 273 items to ../data/ts/cefet.json


In [7]:
# de19 (text_complexity_de19)
# Merge the 'test' and 'valid' splits of the dataset in `input_dir` into one
# JSON-lines file; each item holds the English prompt, the complex sentence and
# its simplification.

input_dir = '/home/user/kew/projects/muss_tbd/resources/data/text_complexity_de19'

output_file = '../data/ts/de19.json'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# read data: the lines of {split}.complex and {split}.simple are paired by position
src_texts = []
tgt_texts = []
for split in ['test', 'valid']:
    # explicit UTF-8 so that decoding does not depend on the platform default
    with open(f'{input_dir}/{split}.complex', encoding='utf8') as src_f:
        src_texts.extend(line.strip() for line in src_f)
    with open(f'{input_dir}/{split}.simple', encoding='utf8') as tgt_f:
        tgt_texts.extend(line.strip() for line in tgt_f)

assert len(src_texts) == len(tgt_texts), (
    f'{len(src_texts)} complex vs. {len(tgt_texts)} simple lines in {input_dir}'
)

# write lines to jsonl
# ensure_ascii=False emits non-ASCII characters unescaped, so the output file
# is opened as UTF-8 explicitly
c = 0
with open(output_file, 'w', encoding='utf8') as out_f:
    for src, tgt in zip(src_texts, tgt_texts):
        formatted_item = {
            # .strip() also drops the trailing space after '### Assistant:'
            'instruction': ts_task_description_en.format(complex_sentence=src).strip(),
            'source': src,
            'target': tgt,
        }
        out_f.write(f'{json.dumps(formatted_item, ensure_ascii=False)}\n')
        c += 1
print(f'Wrote {c} items to {output_file}')

Wrote 249 items to ../data/ts/de19.json


In [8]:
# # apa

# input_dir = '../data/ts/apa_lha/A2-OR/'

# # for file in sorted(Path(input_dir).iterdir()):
# #     print(file)

# src_files = list(Path(input_dir).glob('*.de'))
# print(len(src_files))

# for src_file in src_files:
#     tgt_file = Path(input_dir) / f'{src_file.stem}_A2.simpde'
#     if not tgt_file.exists():
#         print(f'No target file for {tgt_file}')
#     with 