In [2]:
import pandas as pd
import re
import random
import json
from tqdm.auto import tqdm, trange

In [7]:
# Directory prefix for the parsed source tables; the .tsv files in this notebook are read from here.
PARSED_DATA_PATH_PREFIX = '../parsing_aligned_data/results/'

In [52]:
# Directory prefix for the aligned JSON inputs; the train_test_splitting/ outputs are written under it too.
ALIGNED_DATA_PATH_PREFIX = '../aligner/v1_07_02_25/'

## Load sentence-parallel data

### bible

In [8]:
# Read the Bible table and print its shape after every cleaning step.
bible_df = pd.read_csv(f'{PARSED_DATA_PATH_PREFIX}moksha_bible.tsv', sep='\t')
print(bible_df.shape)

# Discard each row that has a missing value in any column.
bible_df = bible_df.dropna(axis='index', how='any')
print(bible_df.shape)

# Keep only rows whose 'mdf' text does not begin with 'Глава '
# (presumably chapter headings rather than verses).
bible_df = bible_df.loc[~bible_df['mdf'].str.startswith('Глава ')]
print(bible_df.shape)
print(bible_df.columns)

(11650, 4)
(11633, 4)
(11583, 4)
Index(['Unnamed: 0', 'mdf', 'ru', 'source'], dtype='object')


In [10]:
# Turn the cleaned table into a list of (mdf, ru) tuples, row by row.
bible_pairs = list(zip(bible_df['mdf'].tolist(), bible_df['ru'].tolist()))

# Sanity check: total count plus one random example.
print(len(bible_pairs))
print(random.choice(bible_pairs))

11583


### words and phrases

In [12]:
# Load the dictionary table ('mdf' / 'ru' columns are used below).
word_df = pd.read_csv(PARSED_DATA_PATH_PREFIX + 'all_dicts_data.tsv', sep='\t')

# Fail early if any cell is missing; the message names the dataset, matching
# the checks used for the news and wikisource data in this notebook.
assert not word_df.isna().sum().sum(), "Missing values detected in all_dicts_data"

# One (mdf, ru) tuple per row.
word_pairs = list(zip(word_df['mdf'], word_df['ru']))

# Sanity check: total count plus one random example.
print(len(word_pairs))
print(random.choice(word_pairs))

In [20]:
# Load the phrase table ('mdf' / 'ru' columns are used below).
phrases_df = pd.read_csv(PARSED_DATA_PATH_PREFIX + 'all_phrases.tsv', sep='\t')

# Fail early if any cell is missing; the message names the dataset, matching
# the checks used for the news and wikisource data in this notebook.
assert not phrases_df.isna().sum().sum(), "Missing values detected in all_phrases"

# One (mdf, ru) tuple per row.
phrases_pairs = list(zip(phrases_df['mdf'], phrases_df['ru']))

# Sanity check: total count plus one random example.
print(len(phrases_pairs))
print(random.choice(phrases_pairs))

### news

In [55]:
# Load the aligned news "names" file. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open(ALIGNED_DATA_PATH_PREFIX + 'aligned_news_names_06_02.json', 'r', encoding='utf-8') as f:
    news_names_df = pd.DataFrame(json.load(f))

# Every cell must be populated before pairs are built.
assert not news_names_df.isna().sum().sum(), "Missing values detected in news_names"

# One (mdf, ru) tuple per row.
news_names_pairs = list(zip(news_names_df['mdf'], news_names_df['ru']))

# Sanity check: total count plus one random example.
print(len(news_names_pairs))
print(random.choice(news_names_pairs))

In [60]:
# Load the aligned news sentences file. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open(ALIGNED_DATA_PATH_PREFIX + 'aligned_news_sents_06_02.json', 'r', encoding='utf-8') as f:
    news_sents_df = pd.DataFrame(json.load(f))

# Every cell must be populated before pairs are built.
assert not news_sents_df.isna().sum().sum(), "Missing values detected in news_sents"

# One (mdf, ru) tuple per row.
news_sents_pairs = list(zip(news_sents_df['mdf'], news_sents_df['ru']))

# Sanity check: total count plus one random example.
print(len(news_sents_pairs))
print(random.choice(news_sents_pairs))

### wikisource

In [73]:
# Base names of the wikisource documents to load. Each entry is substituted into
# the file name pattern "aligned_{doc_name}_06_02.json" and is also used as the
# key under which that document's pairs are stored in `wikisource_pairs`.
# NOTE(review): some spellings look unusual (e.g. 'konstitution', 'vilage');
# presumably they mirror the file names on disk — verify before "correcting" them.
document_names = [
    'chekhov_75.000_sents',
    'chekhov_chameleon_sents',
    'chekhov_maria_ivanovna_sents',
    'chekhov_person_in_case_sents',
    'chekhov_prishibaev_sents',
    'chekhov_self-delusion_sents',
    'chekhov_trifon_sents',
    'furmanov_chapaev_sents',
    'gaydar_distant_countries_sents',
    'gogol_dead_souls_sample_sents',
    'gorkiy_happiness_sents',
    'gorkiy_hero_sents',
    'gorkiy_italian_11_sents',
    'gorkiy_mother_sents',
    'gorkiy_mother_part2_sents',
    'gorkiy_russian_fairy_tales_sents',
    'konstitution_sents',
    'land_direct_sents',
    'shchedrin_konyaga_sents',
    'shchedrin_vilage_fire_sents',
]

In [74]:
# Maps each document name to its list of (mdf, ru) sentence pairs.
wikisource_pairs = {}

for doc_name in document_names:
    file_path = ALIGNED_DATA_PATH_PREFIX + f"aligned_{doc_name}_06_02.json"
    # Pin UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        df = pd.DataFrame(json.load(f))

    # Every cell must be populated before pairs are built.
    assert not df.isna().sum().sum(), f"Missing values detected in {doc_name}"

    pairs = list(zip(df['mdf'], df['ru']))

    # For this one document, drop pairs whose Russian side starts with 'Статья'.
    if doc_name == 'konstitution_sents':
        pairs = [pair for pair in pairs if not pair[1].startswith('Статья')]

    wikisource_pairs[doc_name] = pairs

    # Per-document sanity check: pair count plus one random example.
    print(f"{doc_name}: {len(pairs)} pairs")
    print(random.choice(pairs))

    print()

land_dicret_sents: 5 pairs
('4. Модать лиякс явондомаса оцю тефнень ланкса руководствань вятеманди, ингольпяли мянь Учредительнай пуромксть мархта окончательнайста кемокстамозост, араза тяда меле няфтьф сьора-видиень наказсь, конац тийф 242 вастонь сьора видиень накаснень коряс „Сьора-видиень Депутатонь Всероссийскай Советть Известиянзон“ мархта и пячатлаф нят „Известиятнень“ 88 номерсост.', '4. Для руководства по осуществлению великих земельных преобразований, впредь до окончательного их решения Учредительным собранием, должен повсюду служить следующий крестьянский наказ, составленный на основании 242 местных крестьянских наказов редакцией «Известий Всероссийского Совета Крестьянских Депутатов» и опубликованный в номере 88 этих «Известий».')

russian_gorkiy_sents: 191 pairs
('Шарьхкодсть фкя-фкянь, ваяфтозь дипломатснон ляйти и кармасть корхтама толк мархта:', 'Поняли друг друга, утопили дипломатов в реке и давай говорить толком:')

hero_gorkiy_sents: 82 pairs
('Мзярда хлыст мархта ло

### Union

In [81]:
# Merge every corpus into one sorted list of unique (mdf, ru) pairs,
# leaving out any pair in which either side is empty/falsy.
all_pairs = sorted({
    (mdf_text, ru_text)
    for corpus in (
        word_pairs,
        phrases_pairs,
        bible_pairs,
        news_names_pairs,
        news_sents_pairs,
        *wikisource_pairs.values(),
    )
    for mdf_text, ru_text in corpus
    if mdf_text and ru_text
})
print(len(all_pairs))

36008


In [None]:
# Persist the full pair list. `all_pairs` is written as-is: it is already a
# sorted list of unique pairs, whereas wrapping it in set() makes json.dump
# raise TypeError (sets are not JSON-serializable).
# ensure_ascii=False emits non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of relying on the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/all_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(all_pairs, f, indent=2, ensure_ascii=False)

# Select dev/test data and derive the train split

In [35]:
# Deduplicate and sort first so that the seeded sample is reproducible.
unique_bible_pairs = sorted(set(bible_pairs))
print(len(unique_bible_pairs))

# Fixed seed: draw 800 Bible pairs to be held out for dev/test.
random.seed(1)
bible_devtest = random.sample(unique_bible_pairs, 800)

11583


In [86]:
# `wikisource_pairs` is keyed by the entries of `document_names`; the matching
# entry is 'furmanov_chapaev_sents' (the key 'chapaev_sents' is not in that
# list and raises KeyError on a fresh run).
print(len(set(wikisource_pairs['furmanov_chapaev_sents'])))

# Deduplicate and sort before sampling so the seeded draw is reproducible.
random.seed(1)
chapaev_devtest = random.sample(sorted(set(wikisource_pairs['furmanov_chapaev_sents'])), k=400)

811


In [87]:
# `wikisource_pairs` is keyed by the entries of `document_names`; the matching
# entry is 'gorkiy_mother_sents' (the key 'Mother_gorkiy_sents' is not in that
# list and raises KeyError on a fresh run).
print(len(set(wikisource_pairs['gorkiy_mother_sents'])))

# Deduplicate and sort before sampling so the seeded draw is reproducible.
random.seed(1)
mother_gorkiy_devtest = random.sample(sorted(set(wikisource_pairs['gorkiy_mother_sents'])), k=200)

1739


In [88]:
# `wikisource_pairs` is keyed by the entries of `document_names`; the matching
# entry is 'gaydar_distant_countries_sents' (the key
# 'distant_countries_gaydar_sents' is not in that list and raises KeyError on
# a fresh run).
print(len(set(wikisource_pairs['gaydar_distant_countries_sents'])))

# Deduplicate and sort before sampling so the seeded draw is reproducible.
random.seed(1)
distant_countries_gaydar_devtest = random.sample(sorted(set(wikisource_pairs['gaydar_distant_countries_sents'])), k=200)

1133


In [89]:
# Held-out samples, grouped by the corpus they were drawn from.
components = dict(
    bible=bible_devtest,
    mother_gorkiy=mother_gorkiy_devtest,
    chapaev=chapaev_devtest,
    distant_countries_gaydar=distant_countries_gaydar_devtest,
)

# Cut each sample at its midpoint: the first half goes to dev, the rest to test.
dev, test = {}, {}
for corpus_name, sample in components.items():
    midpoint = len(sample) // 2
    dev[corpus_name] = sample[:midpoint]
    test[corpus_name] = sample[midpoint:]

In [90]:
# Number of distinct pairs across all corpora in dev, then in test.
print(len(set().union(*dev.values())))
print(len(set().union(*test.values())))

800
800


In [92]:
# Write the dev split. ensure_ascii=False emits non-ASCII text verbatim, so the
# file encoding is pinned to UTF-8 instead of relying on the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/dev.json', 'w', encoding='utf-8') as f:
    json.dump(dev, f, indent=2, ensure_ascii=False)

In [93]:
# Write the test split. ensure_ascii=False emits non-ASCII text verbatim, so the
# file encoding is pinned to UTF-8 instead of relying on the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/test.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, indent=2, ensure_ascii=False)

In [94]:
# Union of every held-out pair across all components (dev and test together).
devtest = set().union(*components.values())
len(devtest)

1600

In [95]:
# Train = every known pair that was not held out for dev/test, in sorted order.
train = list(set(all_pairs) - devtest)
train.sort()
print(len(train))

34428


In [97]:
# Write the train split. ensure_ascii=False emits non-ASCII text verbatim, so the
# file encoding is pinned to UTF-8 instead of relying on the platform default.
with open(ALIGNED_DATA_PATH_PREFIX + 'train_test_splitting/train.json', 'w', encoding='utf-8') as f:
    json.dump(train, f, indent=2, ensure_ascii=False)