In [1]:
!pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [2]:
import os
import json

import pandas as pd
from transformers import AutoModelForPreTraining, BertTokenizer

In [3]:
# Colab only: mount Google Drive at /content/drive so the data and model folders
# referenced by the path constants in this notebook become readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Root folder for the project's data; aligned output files are written directly into it.
# NOTE(review): the path is relative, so it only resolves inside the Drive mount when the
# working directory is /content (presumably the Colab default) — confirm.
DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

In [10]:
# Folder whose JSON documents are aligned one by one; the trailing slash matters because
# file names are appended to it by plain string concatenation.
DATA_DIR = DATA_PATH_PREFIX + 'texts_for_align/'

# Align sentences and save the aligned pairs

In [7]:
# Path of the checkpoint directory handed to from_pretrained for both the model and the tokenizer.
# NOTE(review): judging by the folder name this is a LaBSE model adapted to Moksha — confirm.
model_path = 'drive/MyDrive/diploma/labse_moksha_40k+70k+2k_ce_0602_v2'

In [8]:
# Load the encoder and its matching tokenizer from the same checkpoint directory.
# Both names are read as globals by align_wikisource_doc, so this cell must run before it is called.
model = AutoModelForPreTraining.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(65143, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Move the model's weights to the GPU (requires a CUDA runtime).
# The trailing semicolon suppresses the long model repr in the cell output.
model.cuda();

In [None]:
def get_mdf_ru_texts(key: str, texts: "dict[str, str] | None" = None) -> tuple[str, str]:
    """Return the (Moksha, Russian) text pair that ``key`` refers to.

    Keys are either the bare names ``'mdf'`` / ``'ru'`` or suffixed names such as
    ``'mdf_3'`` / ``'ru_3'``; the suffix after the underscore links the two sides of a pair.

    Args:
        key: A key of ``texts``. Must start with ``'mdf'`` or ``'ru'``.
        texts: Mapping from keys to raw texts. When omitted, the function falls back
            to a module-level ``texts`` variable so that older call sites that relied
            on that global keep working.

    Returns:
        ``(mdf_text, ru_text)`` for a key starting with ``'mdf'``. For any other valid key
        the result is ``('', '')``, so that a pair is processed only once (through its
        ``mdf`` key).

    Raises:
        RuntimeError: If ``key`` starts with neither prefix, or contains more than
            one underscore.
        NameError: If ``texts`` is omitted and no module-level ``texts`` exists.
        KeyError: If one side of the requested pair is missing from ``texts``.
    """
    # Make sure every key is one we know how to handle.
    if not key.startswith(('mdf', 'ru')):
        raise RuntimeError(f"unexpected key {key!r}: expected a key starting with 'mdf' or 'ru'")

    # Avoid handling a pair twice: only the key starting with 'mdf' yields the texts.
    if not key.startswith('mdf'):
        return '', ''

    if texts is None:
        # Backward compatibility with callers that relied on a global ``texts``.
        try:
            texts = globals()['texts']
        except KeyError:
            raise NameError("name 'texts' is not defined; pass the texts mapping explicitly") from None

    # A composite key carries a pair index in its second part; a bare key maps to 'mdf' / 'ru'.
    key_parts = key.split('_')
    if len(key_parts) == 1:
        return texts['mdf'], texts['ru']

    if len(key_parts) == 2:
        pair_idx = key_parts[1]
        return texts[f'mdf_{pair_idx}'], texts[f'ru_{pair_idx}']

    raise RuntimeError(f"unexpected key {key!r}: expected at most one '_' separator")


def align_wikisource_doc(filename, print_non_parallel_texts:bool=False):
    """Align the sentences of every Moksha/Russian text pair stored in one JSON document.

    Args:
        filename: Path to a JSON file holding a mapping with keys of the form
            ``mdf`` / ``ru`` or ``mdf_<idx>`` / ``ru_<idx>``.
        print_non_parallel_texts: When true, print the pairs for which the aligner
            returned nothing.

    Returns:
        A list with the results of ``align_sentences`` for all pairs, concatenated in
        the order the keys appear in the file.

    Relies on the notebook-level names ``align_sentences``, ``model`` and ``tokenizer``.
    """
    # The documents contain Cyrillic text, so do not depend on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        texts = json.load(f)

    all_aligned_pairs = []

    for key in texts:
        # Called for every key so that unexpected keys still raise.
        mdf_text, ru_text = get_mdf_ru_texts(key, texts)

        # 'ru*' keys are consumed together with their 'mdf*' partner; skipping them
        # silently keeps the "empty pair" message for pairs that really lack a side.
        if not key.startswith('mdf'):
            continue

        if not ru_text or not mdf_text:
            print(f"empty pair: ({key}), {mdf_text}, {ru_text}")
            continue

        aligned_pairs = align_sentences(mdf_text, ru_text, model, tokenizer)
        all_aligned_pairs += aligned_pairs

        if print_non_parallel_texts and not aligned_pairs:
            print(f"0 aligned pairs: {key}, {mdf_text}, {ru_text}")

    return all_aligned_pairs

In [24]:
# Align every JSON document found in DATA_DIR and save the filtered sentence pairs,
# one output file per document, directly under DATA_PATH_PREFIX.
for filename in os.listdir(DATA_DIR):
    # Check the real extension: a bare 'json' suffix would also accept names like 'notjson'.
    if not filename.endswith('.json'):
        continue

    # Strip only the last extension so dotted names such as '75.000_chekhov.json' keep their stem.
    book = filename.rsplit('.', 1)[0]
    print(f"{book=}")

    all_aligned_pairs = align_wikisource_doc(DATA_DIR + filename)
    # Count reported here is before filtering with is_text_valid.
    print(f"{len(all_aligned_pairs)=}")

    # Keep a pair only when both of its sides pass is_text_valid.
    data = [
        {'mdf': mdf, 'ru': ru}
        for mdf, ru in all_aligned_pairs
        if is_text_valid(mdf) and is_text_valid(ru)
    ]

    # ensure_ascii=False writes raw Cyrillic, so the encoding must be explicit
    # rather than left to the platform default.
    with open(DATA_PATH_PREFIX + f'aligned_{book}_sents_06_02.json', "w", encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print()

book='Mother_gorkiy'
len(all_aligned_pairs)=1754

book='gogol'
len(all_aligned_pairs)=221

book='vilage_fire_shchedrin'
len(all_aligned_pairs)=144

book='75.000_chekhov'
len(all_aligned_pairs)=133

book='chapaev'
len(all_aligned_pairs)=813

book='distant_countries_gaydar'
len(all_aligned_pairs)=1135

book='konyaga_shchedrin'
len(all_aligned_pairs)=109

book='Maria_Ivanovna_chekhov'
len(all_aligned_pairs)=32

book='self-delusion_chekhov'
len(all_aligned_pairs)=36

book='trifon_chekhov'
len(all_aligned_pairs)=108

book='italian_11_gorkiy'
len(all_aligned_pairs)=89

book='happiness_gorkiy'
len(all_aligned_pairs)=54

book='chameleon_chekhov'
len(all_aligned_pairs)=92

book='hero_gorkiy'
len(all_aligned_pairs)=82

book='land_dicret'
len(all_aligned_pairs)=5

book='Mother_gorkiy_part2'
len(all_aligned_pairs)=263

book='person_in_case_chekhov'
len(all_aligned_pairs)=202

book='prishibaev_chekhov'
len(all_aligned_pairs)=88

book='russian_gorkiy'
len(all_aligned_pairs)=191

book='konstitution'
