In [1]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-linux_x86_64.whl size=2383818 sha256=432ed6022c296a92776c973c83a12f53b81f335bbc94f40fb6b8fbff5008f93f
  Stored in directory: /root/.cache/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [2]:
import hnswlib
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForPreTraining, BertTokenizer

In [1]:
# Mount Google Drive at /content/drive; the notebook's data and model paths
# are given relative to it ('drive/MyDrive/...').
# NOTE(review): the relative paths assume the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# List the data directory to check that the expected input files/folders are present.
!ls drive/MyDrive/diploma/data

all_dicts_data.tsv  all_phrases.tsv  e-mordovia  mdf_mono  moksha_bible.tsv  texts_for_align


In [3]:
# Root folder for the data files read and written by this notebook.
# Paths are built by plain string concatenation, so the trailing slash is required.
DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

In [6]:
# Folder name of the model checkpoint to load; it is appended to 'drive/MyDrive/diploma/'
# when the tokenizer and the model are loaded.
MODEL_DIR = 'labse_moksha_40k+70k+1k+500mlm+500_ce'

In [7]:
# Number of titles tokenized and embedded per forward pass of the model.
BATCH_SIZE = 2048

# Get paper features (title embeddings)

In [8]:
# 'mdf'-side table (presumably Moksha news items — confirm); later cells read its
# 'name', 'text' and 'fn' columns.
mdf_names_df = pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/mdf_names_df.tsv', sep='\t')

In [9]:
# 'ru'-side table (presumably Russian news items — confirm); later cells read its
# 'name', 'text' and 'fn' columns when looking up nearest neighbours.
ru_names_df = pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/ru_names_df.tsv', sep='\t')

In [10]:
# The tokenizer and the model weights are both loaded from the same checkpoint folder.
checkpoint_path = 'drive/MyDrive/diploma/' + MODEL_DIR
tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
tuned_model = AutoModelForPreTraining.from_pretrained(checkpoint_path)

In [11]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on a CPU-only runtime. Later cells move inputs to
# `tuned_model.device`, so they follow this choice automatically.
# The trailing ';' suppresses the model repr in the cell output.
tuned_model.to('cuda' if torch.cuda.is_available() else 'cpu');

In [12]:
# Embed every 'ru' title with the model's BERT encoder, BATCH_SIZE titles at a time.
# The result is a plain list with one embedding (list of floats) per title.
ru_titles = ru_names_df['name'].values.tolist()
ru_names_embs = []

with torch.no_grad():  # inference only: no gradients needed
    for batch_start in range(0, len(ru_titles), BATCH_SIZE):
        # No explicit max_length is passed; truncation relies on the tokenizer's own limit.
        batch = tokenizer(
            ru_titles[batch_start:batch_start + BATCH_SIZE],
            return_tensors='pt',
            padding=True,
            truncation=True,
        ).to(tuned_model.device)

        pooled = tuned_model.bert(**batch).pooler_output
        # Normalise each row (torch defaults: p=2, dim=1) so every embedding has unit length.
        unit_embs = torch.nn.functional.normalize(pooled)

        ru_names_embs.extend(unit_embs.tolist())

In [13]:
# Pack the per-title embedding lists into a single tensor; the shape is displayed
# as a sanity check (one row per 'ru' title, one column per embedding dimension).
ru_names_embs = torch.Tensor(ru_names_embs)
ru_names_embs.shape

torch.Size([18210, 768])

In [14]:
# Embed every 'mdf' title with the model's BERT encoder, BATCH_SIZE titles at a time.
# The result is a plain list with one embedding (list of floats) per title.
mdf_titles = mdf_names_df['name'].values.tolist()
mdf_names_embs = []

with torch.no_grad():  # inference only: no gradients needed
    for batch_start in range(0, len(mdf_titles), BATCH_SIZE):
        # No explicit max_length is passed; truncation relies on the tokenizer's own limit.
        batch = tokenizer(
            mdf_titles[batch_start:batch_start + BATCH_SIZE],
            return_tensors='pt',
            padding=True,
            truncation=True,
        ).to(tuned_model.device)

        pooled = tuned_model.bert(**batch).pooler_output
        # Normalise each row (torch defaults: p=2, dim=1) so every embedding has unit length.
        unit_embs = torch.nn.functional.normalize(pooled)

        mdf_names_embs.extend(unit_embs.tolist())

In [None]:
# Pack the per-title embedding lists into a single tensor; the shape is displayed
# as a sanity check (one row per 'mdf' title, one column per embedding dimension).
mdf_names_embs = torch.Tensor(mdf_names_embs)
mdf_names_embs.shape

## Get most relevant pairs with hnswlib

In [16]:
# Build an HNSW nearest-neighbour index over the 'ru' title embeddings.
# Dimension and capacity are taken from the tensor itself, so a model with a
# different hidden size or a larger corpus does not need manual edits
# (add_items fails once max_elements is exceeded).
data_dim = ru_names_embs.shape[1]
num_elements = ru_names_embs.shape[0]

index = hnswlib.Index(space='l2', dim=data_dim)
index.init_index(max_elements=num_elements, ef_construction=200, M=16)
# No explicit ids are passed, so items are labelled by their insertion order
# (assumed to be 0..n-1, i.e. the row position in ru_names_df — confirm against hnswlib docs).
index.add_items(ru_names_embs.numpy())

In [None]:
# For every 'mdf' title, fetch the two nearest 'ru' titles from the index.
labels, distances = index.knn_query(mdf_names_embs.numpy(), k=2)
# Flip the values so that larger means closer. The variable keeps the name
# `distances`, but from here on it is used as a similarity-like score.
# NOTE(review): the index is created with space='l2', which hnswlib documents as
# squared L2 distance. For the unit-normalised embeddings used here that would make
# this score equal to 2*cos - 1 rather than the cosine similarity itself. The
# thresholds applied later were tuned on this score, so do not switch the metric
# without re-tuning them — confirm.
distances = 1 - distances

In [18]:
# Attach the best and second-best 'ru' candidates (name, fn, text) and their scores
# to each 'mdf' row. Neighbour labels are used as row positions in ru_names_df,
# which matches its default 0..n-1 index.
ru_fields = {'closest': 'name', 'closest_fns': 'fn', 'closest_text': 'text'}

for rank in (1, 2):
    neighbour_ids = labels[:, rank - 1]
    for prefix, ru_column in ru_fields.items():
        mdf_names_df[f'{prefix}_{rank}'] = ru_names_df[ru_column].to_numpy()[neighbour_ids].tolist()
    mdf_names_df[f'distances_{rank}'] = distances[:, rank - 1]

## get the most confident pairs of parallel texts

In [19]:
# Work on a copy so the columns added during filtering do not end up in mdf_names_df.
results = mdf_names_df.copy()

In [20]:
# Margin between the best and the second-best candidate score for each row;
# it is used below as a confidence signal for the top match.
results['diff'] = results.apply(lambda x: x['distances_1'] - x['distances_2'], axis=1)

In [21]:
# Median of the margin column.
# NOTE(review): an earlier inline note here said 0.138, which does not match the
# recorded output of this cell (~0.0506); the note was presumably from a previous run.
np.percentile(results['diff'].values, 50)

0.05055758357048035

In [22]:
# Keep only confident matches: the best candidate must beat the runner-up by at
# least 0.05 and have a score above 0.45. Rows are ordered by increasing margin.
is_confident = (results['diff'] >= 0.05) & (results['distances_1'] > 0.45)
kept_columns = [
    'name', 'text', 'fn',
    'closest_1', 'closest_text_1', 'closest_fns_1', 'distances_1',
    'diff',
]
aligned_pairs = results.loc[is_confident, kept_columns].sort_values('diff')

In [23]:
# (rows, columns) of the confidently aligned pairs.
aligned_pairs.shape

(1116, 8)

In [24]:
# Everything that was not auto-aligned. Rows are excluded by their 'fn' value, so
# every row sharing an 'fn' with an aligned pair is dropped as well.
df_for_markup = results[~results['fn'].isin(aligned_pairs['fn'].values)]

In [25]:
# (rows, columns) of the rows left over after auto-alignment.
df_for_markup.shape

(3444, 12)

In [26]:
# Peek at the first leftover row: its title, the two nearest candidates and the margin.
df_for_markup[['name', 'closest_1', 'closest_2', 'diff']].values[0]

array(['Розатнень касфнемаснон коряс од тепличнай комплекссь Кадошкинаса панжеви сентябрьста',
       'Росреестр Мордовии участвует в «Ёлке желаний»',
       'В Приволжском федеральном округе стартовала акция «Елка желаний»',
       0.017722666263580322], dtype=object)

In [27]:
# Sanity check: the aligned and leftover subsets together cover `results` exactly.
# This fails if an 'fn' value is shared by an aligned row and a non-aligned row,
# because the leftover set is built by excluding 'fn' values.
assert df_for_markup.shape[0] + aligned_pairs.shape[0] == results.shape[0]

## save data

In [28]:
import json


# One record per confidently aligned pair: the 'mdf' text and the text of its best 'ru' match.
data = [
    {'text': mdf_text, 'closest_text_1': ru_text}
    for mdf_text, ru_text in zip(aligned_pairs['text'], aligned_pairs['closest_text_1'])
]

# ensure_ascii=False writes the Cyrillic text verbatim, so the file encoding is
# fixed to UTF-8 instead of depending on the platform's default locale.
with open(DATA_PATH_PREFIX + "e-mordovia/aligned_news_pairs_06_02.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
import json
import re


def _clean_name(raw_name):
    """Return a title with non-breaking spaces replaced, whitespace runs collapsed and '* ' markers removed."""
    # Collapsing every whitespace run to one space already turns line breaks
    # inside a title into spaces, so no separate newline handling is needed.
    collapsed = re.sub(r'\s+', ' ', raw_name.replace('\xa0', ' ')).strip()
    return collapsed.replace('* ', '').strip()


# Pair each 'mdf' title with the 'ru' title it was matched to. The 'ru' side must
# come from 'closest_1' (the matched title); reading 'name' for both sides would
# write the same 'mdf' title twice.
data = [
    {'mdf': _clean_name(mdf_title), 'ru': _clean_name(ru_title)}
    for mdf_title, ru_title in zip(aligned_pairs['name'], aligned_pairs['closest_1'])
]

# NOTE(review): this file is written directly under DATA_PATH_PREFIX, while the other
# outputs of this notebook go to the 'e-mordovia/' subfolder — confirm this is intended.
# ensure_ascii=False writes the Cyrillic text verbatim, so the encoding is fixed to UTF-8.
with open(DATA_PATH_PREFIX + "aligned_news_names_06_02.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [29]:
# Persist the confidently aligned pairs, including their score and margin, as a TSV file.
aligned_pairs.to_csv(DATA_PATH_PREFIX + 'e-mordovia/aligned_pairs_06_02.tsv', sep='\t', index=False)

In [None]:
# Export the rows that were not auto-aligned to an Excel sheet (presumably for manual markup).
df_for_markup.to_excel(DATA_PATH_PREFIX + 'e-mordovia/df_for_markup_06_02.xlsx', index=False)