In [1]:
!pip install datasets



In [2]:
import pandas as pd
import random
import json
from datasets import load_dataset

In [3]:
# Mount Google Drive at /content/drive; the data path prefixes defined below
# point into drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# List the data directory to see which parsed and aligned files are available.
!ls drive/MyDrive/diploma/data

aligned_chekhov_75.000_sents_09_02.json		     aligned_konstitution_sents_09_02.json
aligned_chekhov_chameleon_sents_09_02.json	     aligned_land_direct_sents_09_02.json
aligned_chekhov_maria_ivanovna_sents_09_02.json      aligned_news_names_09_02.json
aligned_chekhov_person_in_case_sents_09_02.json      aligned_news_sents_09_02.json
aligned_chekhov_prishibaev_sents_09_02.json	     aligned_shchedrin_konyaga_sents_09_02.json
aligned_chekhov_self-delusion_sents_09_02.json	     aligned_shchedrin_vilage_fire_sents_09_02.json
aligned_chekhov_trifon_sents_09_02.json		     aligned_wikipedia_sents_09_02.json
aligned_furmanov_chapaev_sents_09_02.json	     all_dicts_data.tsv
aligned_gaydar_distant_countries_sents_09_02.json    all_phrases.tsv
aligned_gogol_dead_souls_sample_sents_09_02.json     e-mordovia
aligned_gorkiy_happiness_sents_09_02.json	     mdf_mono
aligned_gorkiy_hero_sents_09_02.json		     moksha_bible.tsv
aligned_gorkiy_italian_11_sents_09_02.json	     moksha_pravda.tsv
aligned_gork

In [5]:
# Directory prefix (note the trailing slash) for the parsed dictionary/phrase TSV
# files; file names are appended to it by plain string concatenation.
PARSED_DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

In [6]:
# Directory prefix (note the trailing slash) for the aligned JSON corpora; the
# train_test_splitting/ outputs are also written under it.
ALIGNED_DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

## Load sentence-parallel data

### words and phrases

In [7]:
# Load the dictionary entries and turn them into (mdf, ru) tuples.
word_df = pd.read_csv(PARSED_DATA_PATH_PREFIX + 'all_dicts_data.tsv', sep='\t')

# Fail with a descriptive message, matching the other corpus loaders in this notebook.
assert not word_df.isna().sum().sum(), "Missing values detected in all_dicts_data"

word_pairs = list(zip(word_df['mdf'], word_df['ru']))

# Sanity check: corpus size and one random example.
print(len(word_pairs))
print(random.choice(word_pairs))

3589
('фатямс', 'схватить')


In [8]:
# Load the phrase entries and turn them into (mdf, ru) tuples.
phrases_df = pd.read_csv(PARSED_DATA_PATH_PREFIX + 'all_phrases.tsv', sep='\t')

# Fail with a descriptive message, matching the other corpus loaders in this notebook.
assert not phrases_df.isna().sum().sum(), "Missing values detected in all_phrases"

phrases_pairs = list(zip(phrases_df['mdf'], phrases_df['ru']))

# Sanity check: corpus size and one random example.
print(len(phrases_pairs))
print(random.choice(phrases_pairs))

726
('Сёксень ши, илядь, ков.', 'Осенний день, вечер, месяц.')


### news

In [9]:
# Load the aligned news headlines. The text is Cyrillic, so decode explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open(ALIGNED_DATA_PATH_PREFIX + 'aligned_news_names_09_02.json', 'r', encoding='utf-8') as f:
    news_names_df = pd.DataFrame(json.load(f))

assert not news_names_df.isna().sum().sum(), "Missing values detected in news_names"

news_names_pairs = list(zip(news_names_df['mdf'], news_names_df['ru']))

# Sanity check: corpus size and one random example.
print(len(news_names_pairs))
print(random.choice(news_names_pairs))

4560
('Мордовиясь и Альфа-Банксь подписандасть сотрудничендамать колга соглашения', 'Мордовия и Альфа-Банк подписали соглашение о сотрудничестве')


In [10]:
# Load the aligned news sentences. The text is Cyrillic, so decode explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open(ALIGNED_DATA_PATH_PREFIX + 'aligned_news_sents_09_02.json', 'r', encoding='utf-8') as f:
    news_sents_df = pd.DataFrame(json.load(f))

assert not news_sents_df.isna().sum().sum(), "Missing values detected in news_sents"

news_sents_pairs = list(zip(news_sents_df['mdf'], news_sents_df['ru']))

# Sanity check: corpus size and one random example.
print(len(news_sents_pairs))
print(random.choice(news_sents_pairs))

65970
('Торжественнай церемонияда меле поладовсть кибер-баталиятне, конатнень коряс и кочкафтольхть инь вии киберспортсметтне.', 'По завершении церемонии продолжились кибер-баталии, которые и определили сильнейших киберспортсменов.')


### wikisource

In [11]:
# Names of the wikisource documents to load. Each name is interpolated into the
# file name pattern "aligned_{doc_name}_09_02.json" by the loading loop.
document_names = [
    'chekhov_75.000_sents',
    'chekhov_chameleon_sents',
    'chekhov_maria_ivanovna_sents',
    'chekhov_person_in_case_sents',
    'chekhov_prishibaev_sents',
    'chekhov_self-delusion_sents',
    'chekhov_trifon_sents',
    'furmanov_chapaev_sents',
    'gaydar_distant_countries_sents',
    'gogol_dead_souls_sample_sents',
    'gorkiy_happiness_sents',
    'gorkiy_hero_sents',
    'gorkiy_italian_11_sents',
    'gorkiy_mother_sents',
    'gorkiy_mother_part2_sents',
    'gorkiy_russian_fairy_tales_sents',
    'konstitution_sents',
    'land_direct_sents',
    'shchedrin_konyaga_sents',
    'shchedrin_vilage_fire_sents',
]

In [12]:
# Maps each document name to its list of (mdf, ru) sentence pairs.
wikisource_pairs = {}

for doc_name in document_names:
    file_path = ALIGNED_DATA_PATH_PREFIX + f"aligned_{doc_name}_09_02.json"
    # The text is Cyrillic, so decode explicitly as UTF-8 instead of relying on
    # the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        df = pd.DataFrame(json.load(f))

    assert not df.isna().sum().sum(), f"Missing values detected in {doc_name}"

    pairs = list(zip(df['mdf'], df['ru']))

    # For the constitution, drop pairs whose Russian side starts with 'Статья'.
    if doc_name == 'konstitution_sents':
        pairs = [pair for pair in pairs if not pair[1].startswith('Статья')]

    wikisource_pairs[doc_name] = pairs

    # Sanity check: per-document size and one random example.
    print(f"{doc_name}: {len(pairs)} pairs")
    print(random.choice(pairs))

    print()

chekhov_75.000_sents: 130 pairs
('Сембе вдь тинь купидоттада.', 'Все ведь вы купидоны.')

chekhov_chameleon_sents: 92 pairs
('Тон, Хрюкин, кирьдеть эздонза кальдявкс и тя тевть тяфтак тят кадонда...', 'Ты, Хрюкин, пострадал и дела этого так не оставляй...')

chekhov_maria_ivanovna_sents: 27 pairs
('Ламос ащесь од ломанць пяк мазы авать инголе.', 'Долго стоял молодой человек перед прекрасной женщиной.')

chekhov_person_in_case_sents: 186 pairs
('— Кда тон корхтат монь мархтон тяфтама вайгяльса, то мон не могу корхтамс сяда тов, — мярьгсь сон.', '— Если вы говорите со мной таким тоном, то я не могу продолжать, — сказал он.')

chekhov_prishibaev_sents: 84 pairs
('Ништа можна нолямс, штоба народсь безобразничендаль?', 'Нешто можно дозволять, чтобы народ безобразил?')

chekhov_self-delusion_sents: 33 pairs
('Брандмейстер атясь венептсь тейнза нилеце рюмкать и мярьгсь ялгакс: — Верондан!', 'Старик брандмейстер подал ему четвертую рюмку и заметил дружески: — Верю-с!')

chekhov_trifon_sents: 9

### wikipedia

In [13]:
# Load the aligned wikipedia sentences. The text is Cyrillic, so decode explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open(ALIGNED_DATA_PATH_PREFIX + 'aligned_wikipedia_sents_09_02.json', 'r', encoding='utf-8') as f:
    wiki_sents_df = pd.DataFrame(json.load(f))

# The message names this corpus (it previously said "news_sents", a copy-paste slip).
assert not wiki_sents_df.isna().sum().sum(), "Missing values detected in wikipedia_sents"

wiki_sents_pairs = list(zip(wiki_sents_df['mdf'], wiki_sents_df['ru']))

# Sanity check: corpus size and one random example.
print(len(wiki_sents_pairs))
print(random.choice(wiki_sents_pairs))

1400
('Васень лятфтаматне 1511 кизоня.', 'Впервые упоминается в 1511 году.')


### Union

In [14]:
# Merge every corpus into one sorted list of unique pairs, skipping any pair
# that has an empty side.
wikisource_flat = [pair for doc_pairs in wikisource_pairs.values() for pair in doc_pairs]
candidate_pairs = (
    word_pairs
    + phrases_pairs
    + news_names_pairs
    + news_sents_pairs
    + wikisource_flat
    + wiki_sents_pairs
)
all_pairs = sorted({pair for pair in candidate_pairs if pair[0] and pair[1]})
print(len(all_pairs))

77165


In [15]:
# Persist the merged corpus. ensure_ascii=False emits raw Cyrillic characters, so
# the file is opened as UTF-8 explicitly rather than with the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/all_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=4)

# Select dev/test data and build the train split

In [16]:
# Draw 1000 distinct news sentence pairs for dev/test. Sorting the unique pairs
# and re-seeding first makes the sample reproducible.
unique_news_sents = set(news_sents_pairs)
print(len(unique_news_sents))

random.seed(1)
news_sents_devtest = random.sample(sorted(unique_news_sents), k=1000)

62308


In [17]:
# Draw 400 distinct pairs from the Furmanov document for dev/test. Sorting the
# unique pairs and re-seeding first makes the sample reproducible.
unique_furmanov_chapaev = set(wikisource_pairs['furmanov_chapaev_sents'])
print(len(unique_furmanov_chapaev))

random.seed(1)
furmanov_chapaev_devtest = random.sample(sorted(unique_furmanov_chapaev), k=400)

756


In [18]:
# Draw 200 distinct pairs from the Gorkiy "mother" document for dev/test. Sorting
# the unique pairs and re-seeding first makes the sample reproducible.
unique_gorkiy_mother = set(wikisource_pairs['gorkiy_mother_sents'])
print(len(unique_gorkiy_mother))

random.seed(1)
gorkiy_mother_devtest = random.sample(sorted(unique_gorkiy_mother), k=200)

1477


In [19]:
# Draw 200 distinct pairs from the Gaydar document for dev/test. Sorting the
# unique pairs and re-seeding first makes the sample reproducible.
unique_gaydar_distant_countries = set(wikisource_pairs['gaydar_distant_countries_sents'])
print(len(unique_gaydar_distant_countries))

random.seed(1)
gaydar_distant_countries_devtest = random.sample(sorted(unique_gaydar_distant_countries), k=200)

1100


In [20]:
# Held-out samples per source; the first half of each goes to dev, the rest to test.
components = dict(
    news_sents=news_sents_devtest,
    gorkiy_mother=gorkiy_mother_devtest,
    furmanov_chapaev=furmanov_chapaev_devtest,
    gaydar_distant_countries=gaydar_distant_countries_devtest,
)

dev, test = {}, {}
for source, sampled in components.items():
    midpoint = len(sampled) // 2
    dev[source] = sampled[:midpoint]
    test[source] = sampled[midpoint:]

In [21]:
# Number of distinct pairs in dev, then in test, across all components.
for split in (dev, test):
    distinct_pairs = {pair for sampled in split.values() for pair in sampled}
    print(len(distinct_pairs))

900
900


In [22]:
# Everything that was not sampled into dev/test becomes training data.
devtest = set().union(*components.values())
train = sorted(set(all_pairs) - devtest)
print(len(train))

75365


# Add bible data from dataset "slone/finugorbib"

In [23]:
# Load the "all_to_rus" configuration of the slone/finugorbib dataset; the cells
# below read its 'train', 'validation' and 'test' splits.
bible_ds = load_dataset("slone/finugorbib", "all_to_rus")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [24]:
# From the train split keep the 'mdf' rows (text/other columns only), discard rows
# whose text contains 'Глава' (presumably chapter headings), and drop duplicates.
bible_train_df = (
    bible_ds['train']
    .to_pandas()
    .loc[lambda frame: frame['lang_code'] == 'mdf', ['text', 'other']]
    .loc[lambda frame: ~frame['text'].str.contains('Глава')]
    .drop_duplicates()
)

bible_train_pairs = list(zip(bible_train_df['text'], bible_train_df['other']))

# Sanity check: number of pairs and one random example.
print(len(bible_train_pairs))
print(random.choice(bible_train_pairs))

(10036, 2)

In [33]:
# Extend the training data with the bible train pairs, keeping it unique and sorted.
train = sorted({*train, *bible_train_pairs})
print(len(train))

85401


In [34]:
# From the validation split keep the 'mdf' rows (text/other columns only), discard
# rows whose text contains 'Глава' (presumably chapter headings), and drop duplicates.
bible_dev_df = (
    bible_ds['validation']
    .to_pandas()
    .loc[lambda frame: frame['lang_code'] == 'mdf', ['text', 'other']]
    .loc[lambda frame: ~frame['text'].str.contains('Глава')]
    .drop_duplicates()
)

bible_dev_pairs = list(zip(bible_dev_df['text'], bible_dev_df['other']))

# Sanity check: number of pairs and one random example.
print(len(bible_dev_pairs))
print(random.choice(bible_dev_pairs))

817
('Ошть персезе оцю, сери перяфкс, и эсонза ульсть кемгафтува ортат, и кемгафтува ортатнень видеса ащесть кемгафтува ангелхт. Ортатнень лангс тяштьфтольхть Израилень кемгафтува юропнень лемсна, эрь ортаса фкя лем.', 'Он имеет большую и высокую стену, имеет двенадцать ворот и на них двенадцать Ангелов; на воротах написаны имена двенадцати колен сынов Израилевых:')


In [35]:
# Add the bible validation pairs to dev as their own component, sorted for a
# deterministic order.
dev['bible'] = sorted(bible_dev_pairs)

In [39]:
# From the test split keep the 'mdf' rows (text/other columns only), discard rows
# whose text contains 'Глава' (presumably chapter headings), and drop duplicates.
bible_test_df = (
    bible_ds['test']
    .to_pandas()
    .loc[lambda frame: frame['lang_code'] == 'mdf', ['text', 'other']]
    .loc[lambda frame: ~frame['text'].str.contains('Глава')]
    .drop_duplicates()
)

bible_test_pairs = list(zip(bible_test_df['text'], bible_test_df['other']))

# Sanity check: number of pairs and one random example.
print(len(bible_test_pairs))
print(random.choice(bible_test_pairs))

775
('«Но ся шитнень, ся ризфта меле, ши валдсь шобдалгоды, ковське аф кармай валдоптома,', 'Но в те дни, после скорби той, солнце померкнет, и луна не даст света своего,')


In [40]:
# Add the bible *test* pairs to test as their own component, sorted for a
# deterministic order. Using bible_dev_pairs here would make the test component a
# copy of dev['bible'] and leave bible_test_pairs unused.
test['bible'] = sorted(bible_test_pairs)

# Save data

In [41]:
# Persist the dev split. ensure_ascii=False emits raw Cyrillic characters, so the
# file is opened as UTF-8 explicitly rather than with the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/dev.json', 'w', encoding='utf-8') as f:
    json.dump(dev, f, ensure_ascii=False, indent=4)

In [42]:
# Persist the test split. ensure_ascii=False emits raw Cyrillic characters, so the
# file is opened as UTF-8 explicitly rather than with the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/test.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4)

In [43]:
# Persist the train split. ensure_ascii=False emits raw Cyrillic characters, so the
# file is opened as UTF-8 explicitly rather than with the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/train.json', 'w', encoding='utf-8') as f:
    json.dump(train, f, ensure_ascii=False, indent=4)