In [None]:
!pip install razdel

In [None]:
import json

import pandas as pd
from transformers import AutoModelForPreTraining, BertTokenizer

In [None]:
# Mount Google Drive at /content/drive. The data and model paths used below are
# relative ('drive/MyDrive/...'), so they presumably rely on the working directory
# being /content — confirm when running outside a default Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory prefix for the TSV inputs and the JSON output of this notebook.
# It is joined to file names by plain string concatenation, so the trailing slash is required.
DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

# prepare data

In [None]:
# Moksha-side ("mdf") articles; its `fn` and `text` columns are used below to attach article texts.
mdf_names_df = pd.read_csv(DATA_PATH_PREFIX + 'mdf_names_df.tsv', sep='\t')

In [None]:
# Russian-side articles; its `fn` and `text` columns are used below to attach article texts.
ru_names_df = pd.read_csv(DATA_PATH_PREFIX + 'ru_names_df.tsv', sep='\t')

In [None]:
# Preview the first rows: the frame carries `name`, `text` and `fn`,
# plus two leftover "Unnamed" index columns from earlier saves.
ru_names_df.head(3)

Unnamed: 0.1,Unnamed: 0,name,text,fn
0,0,Интерактивная карта «От Уразаевки до Рузаевки»...,Интерактивная литературно-краеведческая карта ...,article_10505.json
1,1,За сутки в Мордовии выявлено 36 случаев корона...,В федеральной статистике зафиксирован новый ле...,article_12538.json
2,2,Бизнес Мордовии положительно оценил инвестицио...,Минэкономразвития России и АНО «Диалог Регионы...,article_04116.json


In [None]:
# Article pairs matched by title. `fn` is looked up in mdf_names_df and
# `closest_fns_1` in ru_names_df below to fetch the full texts of each pair.
aligned_name_pairs_df = pd.read_csv(DATA_PATH_PREFIX + 'aligned_name_pairs.tsv', sep='\t')

In [None]:
# Preview the first rows: `fn`/`name` hold the Moksha article file and title,
# `closest_fns_1`/`closest_1` the matched Russian article file and title.
aligned_name_pairs_df.head(3)

Unnamed: 0.1,Unnamed: 0,fn,closest_fns_1,closest_fns_1.1,name,closest_1
0,359,article_01123.json,article_04675.json,article_04675.json,Медицинскяй сестрань Международнай шить мархта...,Поздравление Председателя Правительства Мордов...
1,1508,article_01580.json,article_06068.json,article_06068.json,Мордовиянь Оцюнясь примазень ломаттнень кизефк...,Глава Мордовии провел прием граждан
2,4265,article_03900.json,article_16402.json,article_16402.json,Мордовиянь Оцюнясь Владимир Волков поздравлянд...,Глава Мордовии Владимир Волков поздравил журна...


In [None]:
# Attach the Moksha article text to every pair by looking up `fn` in mdf_names_df.
# An indexed lookup replaces the per-row boolean scan of the whole frame.
# The lookup stays fail-fast like the previous `.item()` call: `.loc` with a list of
# labels raises KeyError for an unknown file name, and a requested file name that occurs
# more than once yields extra rows, so the length-mismatched assignment raises ValueError.
mdf_text_by_fn = mdf_names_df.set_index('fn')['text']
aligned_name_pairs_df['mdf_text'] = (
    mdf_text_by_fn.loc[aligned_name_pairs_df['fn'].tolist()].to_numpy()
)

In [None]:
# Attach the Russian article text to every pair by looking up `closest_fns_1` in ru_names_df.
# An indexed lookup replaces the per-row boolean scan of the whole frame.
# The lookup stays fail-fast like the previous `.item()` call: `.loc` with a list of
# labels raises KeyError for an unknown file name, and a requested file name that occurs
# more than once yields extra rows, so the length-mismatched assignment raises ValueError.
ru_text_by_fn = ru_names_df.set_index('fn')['text']
aligned_name_pairs_df['ru_text'] = (
    ru_text_by_fn.loc[aligned_name_pairs_df['closest_fns_1'].tolist()].to_numpy()
)

In [None]:
# Replace every NaN (in all columns) with an empty string. NaN is truthy, so without this
# the `not ru_text or not mdf_text` check in the alignment loop would not skip missing texts.
aligned_name_pairs_df = aligned_name_pairs_df.fillna('')

# align and save

In [None]:
# Either import `align_ru_mdf` here or paste the definitions of all its functions into this cell.

In [None]:
# Sentence aligner called in the alignment loop below.
# (`import X from Y` is not valid Python; the from-import form is required.)
from ru_mdf_aligner import align_ru_mdf

In [None]:
# Model directory on Drive; used to load the tokenizer and model below
# and passed as the third argument to align_ru_mdf.
model_path = 'drive/MyDrive/diploma/labse_moksha_60k+50k+2k+3k+1k'

In [None]:
# Load the tokenizer and the model from model_path and move the model to the GPU.
# NOTE(review): the alignment loop below calls align_ru_mdf with `model_path`, not with
# `tokenizer`/`model` — confirm whether the aligner reloads the model itself or whether
# these objects are meant to be passed in.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = AutoModelForPreTraining.from_pretrained(model_path)
model.cuda()

In [None]:
# Collect the sentence pairs produced by align_ru_mdf for every article pair.
all_aligned_pairs = []

text_pairs = zip(aligned_name_pairs_df['ru_text'], aligned_name_pairs_df['mdf_text'])
for idx, (ru_text, mdf_text) in enumerate(text_pairs):
    if ru_text and mdf_text:
        aligned_pairs = align_ru_mdf(ru_text, mdf_text, model_path)
        if aligned_pairs:
            all_aligned_pairs.extend(aligned_pairs)
        else:
            # The aligner returned nothing for this article pair; log it and move on.
            print(f"0 aligned pairs: {idx}, {ru_text}, {mdf_text}")
    else:
        # One side has no text (empty after fillna), so there is nothing to align.
        print(f"empty pair: ({idx}), {ru_text}, {mdf_text}")


In [None]:
# Turn each (ru, mdf) sentence pair into a record with explicit keys for serialization.
data = [{'ru': ru, 'mdf': mdf} for ru, mdf in all_aligned_pairs]

In [None]:
import json

In [None]:
# Write the aligned sentence pairs as a JSON list of {"ru": ..., "mdf": ...} records.
# ensure_ascii=False keeps the Cyrillic text readable instead of \uXXXX escapes; that
# needs an explicit UTF-8 encoding so the file does not depend on the platform default.
# Readers must open the file as UTF-8.
with open(DATA_PATH_PREFIX + 'aligned_sents.json', "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2, ensure_ascii=False)