In [None]:
!pip install hnswlib
# !pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [None]:
import json

import hnswlib
import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizerFast

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The data and model paths
# configured below are relative ('drive/MyDrive/...').
# NOTE(review): this assumes the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls drive/MyDrive/diploma/data

aligned_news_names_09_02.json  all_phrases.tsv	e-mordovia  moksha_bible.tsv  texts_for_align
all_dicts_data.tsv	       dev.json		mdf_mono    test.json	      train.json


In [None]:
from utils import clean_text, is_text_valid

In [None]:
# Root folder for every data file read or written by this notebook. It is joined to file
# names by plain string concatenation, so the trailing slash is required.
DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

In [None]:
# Checkpoint directory loaded below as both the BertModel and its BertTokenizerFast.
# NOTE(review): judging by the folder name this is a LaBSE model adapted to Moksha — confirm.
MODEL_PATH = 'drive/MyDrive/diploma/labse_moksha_40k+5k'

In [None]:
# Number of texts tokenized and pushed through the model per forward pass.
BATCH_SIZE = 128

In [None]:
# Token limit handed to the tokenizer; longer texts are truncated to this length.
MAX_LENGTH = 256

# Load data

In [None]:
# Russian-side news table. Rows with any missing value are dropped and the index is then
# rebuilt, so that row i of this frame corresponds to the i-th vector added to the hnswlib
# index. The neighbour labels returned by the search are 0-based positions; without the
# reset, looking them up by index label after dropna() would hit the wrong row (or raise
# KeyError) as soon as a single row had been dropped.
ru_names_df = (
    pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/ru_names_df.tsv', sep='\t')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
ru_names_df.shape

In [None]:
ru_names_df.head()

In [None]:
# Moksha-side (mdf) news table; rows containing any missing value are discarded.
mdf_names_path = DATA_PATH_PREFIX + 'e-mordovia/mdf_names_df.tsv'
mdf_names_df = pd.read_csv(mdf_names_path, sep='\t').dropna()

In [None]:
mdf_names_df.shape

In [None]:
mdf_names_df.head()

# Compute text embeddings

In [None]:
# The tokenizer and the encoder are restored from the same checkpoint directory.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertModel.from_pretrained(MODEL_PATH)

In [None]:
# Use the GPU when one is available and fall back to CPU otherwise, so the notebook does
# not crash on a CPU-only runtime. The trailing ';' suppresses the module repr.
model.to('cuda' if torch.cuda.is_available() else 'cpu');

In [None]:
# Encode the Russian texts batch by batch. Each text is represented by the model's pooler
# output, L2-normalised per row. Vectors are collected as plain Python lists; the next
# cell turns them into a tensor.
ru_texts = ru_names_df['text'].tolist()  # built once, not on every iteration
ru_names_embs = []

with torch.inference_mode():
    for st in range(0, len(ru_texts), BATCH_SIZE):
        toks = tokenizer(
            ru_texts[st:st + BATCH_SIZE],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )

        # Move the token tensors to whatever device the model lives on.
        model_output = model(**toks.to(model.device))
        embs = torch.nn.functional.normalize(model_output.pooler_output)

        ru_names_embs.extend(embs.tolist())

In [None]:
# Stack the collected vectors into a single float32 matrix, one row per Russian text.
ru_names_embs = torch.tensor(ru_names_embs, dtype=torch.float32)
ru_names_embs.shape

torch.Size([18183, 768])

In [None]:
# Encode the Moksha texts batch by batch, exactly like the Russian side: pooler output,
# L2-normalised per row, collected as plain Python lists for the next cell.
mdf_texts = mdf_names_df['text'].tolist()  # built once, not on every iteration
mdf_names_embs = []

# inference_mode (rather than no_grad) for consistency with the Russian-side loop.
with torch.inference_mode():
    for st in range(0, len(mdf_texts), BATCH_SIZE):
        toks = tokenizer(
            mdf_texts[st:st + BATCH_SIZE],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )

        # Move the token tensors to whatever device the model lives on.
        model_output = model(**toks.to(model.device))
        embs = torch.nn.functional.normalize(model_output.pooler_output)

        mdf_names_embs.extend(embs.tolist())

In [None]:
# Stack the collected vectors into a single float32 matrix, one row per Moksha text.
mdf_names_embs = torch.tensor(mdf_names_embs, dtype=torch.float32)
mdf_names_embs.shape

torch.Size([4560, 768])

# Get most relevant pairs with hnswlib

In [None]:
# Take the index capacity and the vector dimension from the embedding matrix instead of
# hard-coding them, so add_items can never exceed max_elements and the dimension always
# matches the encoder output.
num_elements, data_dim = ru_names_embs.shape

index = hnswlib.Index(space='l2', dim=data_dim)
index.init_index(max_elements=num_elements, ef_construction=200, M=16)
# No explicit ids are passed, so each vector is labelled with its 0-based row position.
index.add_items(ru_names_embs.numpy())

In [None]:
# For every Moksha vector fetch its two nearest Russian vectors. `labels` holds the
# 0-based insertion positions of those Russian vectors in the index.
labels, distances = index.knn_query(mdf_names_embs.numpy(), k=2)
# NOTE(review): according to the hnswlib docs the 'l2' space returns *squared* L2
# distances. The embeddings are unit-normalised, so squared L2 = 2 - 2*cos and the value
# stored below equals 2*cos - 1: a similarity score (higher = closer), not a distance and
# not the plain cosine. The thresholds applied later are calibrated on this scale —
# confirm before changing either side.
distances = 1 - distances

In [None]:
# Attach the two nearest Russian candidates (title, file name, text, score) to every
# Moksha row. hnswlib labels are 0-based insertion positions, i.e. positional row numbers
# of ru_names_df, so they must be resolved with .iloc. A label-based .loc lookup is wrong
# whenever dropna() has left gaps in the frame's index.
for i in range(2):
    # hnswlib returns unsigned labels; cast to a signed integer dtype for positional indexing.
    neighbour_rows = ru_names_df.iloc[labels[:, i].astype(np.int64)]
    mdf_names_df[f'closest_{i+1}'] = neighbour_rows['name'].values
    mdf_names_df[f'closest_fns_{i+1}'] = neighbour_rows['fn'].values
    mdf_names_df[f'closest_text_{i+1}'] = neighbour_rows['text'].values
    mdf_names_df[f'distances_{i+1}'] = distances[:, i]

In [None]:
# Work on a copy so that columns added later (e.g. 'diff') do not end up in mdf_names_df.
results = mdf_names_df.copy()

In [None]:
# Margin between the best and the second-best candidate score. The columns are widened to
# float64 first so the subtraction matches a row-by-row Python-float computation exactly.
best_score = results['distances_1'].astype('float64')
runner_up_score = results['distances_2'].astype('float64')
results['diff'] = best_score - runner_up_score

In [None]:
np.percentile(results['diff'].values, 20)

In [None]:
# Accept a pair automatically when the best candidate both clearly beats the runner-up
# (margin >= 0.1) and scores above 0.45 on its own; the output is ordered by the margin.
is_confident = (results['diff'] >= 0.1) & (results['distances_1'] > 0.45)
aligned_columns = [
    'name', 'text', 'fn',
    'closest_1', 'closest_text_1', 'closest_fns_1', 'distances_1',
    'diff'
]
aligned_pairs = results[is_confident][aligned_columns].sort_values('diff')

In [None]:
aligned_pairs.shape

(1116, 8)

# Select pairs for manual annotation

In [None]:
# Every row that was not aligned automatically goes to manual review; rows are matched
# through the 'fn' column.
auto_aligned_fns = aligned_pairs['fn']
df_for_markup = results[~results['fn'].isin(auto_aligned_fns)]

In [None]:
df_for_markup.shape

(3444, 12)

In [None]:
df_for_markup[['name', 'closest_1', 'closest_2', 'diff']].values[0]

array(['Розатнень касфнемаснон коряс од тепличнай комплекссь Кадошкинаса панжеви сентябрьста',
       'Росреестр Мордовии участвует в «Ёлке желаний»',
       'В Приволжском федеральном округе стартовала акция «Елка желаний»',
       0.017722666263580322], dtype=object)

# Save data

In [None]:
aligned_pairs.to_csv(DATA_PATH_PREFIX + 'e-mordovia/aligned_pairs_09_02.tsv', sep='\t', index=False)

In [None]:
df_for_markup.to_excel(DATA_PATH_PREFIX + 'e-mordovia/df_for_markup_09_02.xlsx', index=False)

# Process annotated samples

In [None]:
markuped = pd.read_excel(DATA_PATH_PREFIX + 'e-mordovia/markup_09_02.xlsx')

In [None]:
# Keep the title of the candidate chosen in the 'ans' column: candidate 2 when ans == 2,
# candidate 1 for every other value.
# NOTE(review): any ans other than 2 (including a missing one) selects candidate 1 —
# confirm that rows where neither candidate is correct were removed from the spreadsheet.
picked_second = markuped['ans'] == 2
markuped['closest'] = markuped['closest_2'].where(picked_second, markuped['closest_1'])

In [None]:
# Same selection as for the titles, applied to the candidate texts: candidate 2 when
# ans == 2, candidate 1 for every other value.
picked_second = markuped['ans'] == 2
markuped['closest_text'] = markuped['closest_text_2'].where(picked_second, markuped['closest_text_1'])

In [None]:
# Read back the automatically aligned pairs written by the "Save data" section.
# NOTE(review): presumably reloaded so this section can run in a later session without
# recomputing the embeddings — confirm.
aligned_pairs = pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/aligned_pairs_09_02.tsv', sep='\t')

In [None]:
# Merge automatically accepted pairs with manually confirmed ones under one column schema:
# name / closest (titles) and text / closest_text (bodies).
auto_pairs = aligned_pairs[['name', 'closest_1', 'text', 'closest_text_1']].rename(
    columns={'closest_1': 'closest', 'closest_text_1': 'closest_text'}
)
manual_pairs = markuped[['name', 'closest', 'text', 'closest_text']]

all_aligned_pairs = pd.concat([auto_pairs, manual_pairs])

# Save parallel pairs

In [None]:
# Parallel article texts, one {'mdf': ..., 'ru': ...} record per aligned pair.
data = [
    {'mdf': mdf_text, 'ru': ru_text}
    for mdf_text, ru_text in zip(all_aligned_pairs['text'], all_aligned_pairs['closest_text'])
]

# ensure_ascii=False writes the Cyrillic text verbatim, so the file encoding is pinned to
# UTF-8 instead of depending on the platform default.
with open(DATA_PATH_PREFIX + "e-mordovia/aligned_news_texts_09_02.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Parallel titles: both sides are cleaned with clean_text, and a pair is kept only when
# both cleaned titles pass is_text_valid.
data = []
for mdf_name, ru_name in zip(all_aligned_pairs['name'], all_aligned_pairs['closest']):
    cleaned_mdf = clean_text(mdf_name)
    cleaned_ru = clean_text(ru_name)

    if is_text_valid(cleaned_mdf) and is_text_valid(cleaned_ru):
        data.append({
          'mdf': cleaned_mdf,
          'ru': cleaned_ru
        })


# ensure_ascii=False writes the Cyrillic text verbatim, so the file encoding is pinned to
# UTF-8 instead of depending on the platform default.
with open(DATA_PATH_PREFIX + "aligned_news_names_09_02.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)