In [None]:
!pip install hnswlib

In [None]:
import hnswlib

In [None]:
import json
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizerFast

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the relative 'drive/MyDrive/...' paths used below presumably
# resolve against /content as the working directory — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Common path prefix for every data file this notebook reads or writes.
DATA_PATH_PREFIX = 'drive/MyDrive/diploma/data/'

In [None]:
# Encoder checkpoints passed to BertModel / BertTokenizerFast.from_pretrained below.
# MDF_MODEL is a directory on the mounted Drive.
# NOTE(review): MYV_MODEL looks like a Hugging Face Hub model id — confirm.
MDF_MODEL = 'drive/MyDrive/diploma/labse_moksha_v3_500+3500_64bs_700_without_CE_teacher_2e-5_48bs_64mlm'
MYV_MODEL = "slone/LaBSE-en-ru-myv-v2"

In [None]:
# Number of texts tokenized and encoded per model forward pass.
BATCH_SIZE = 128

In [None]:
# `max_length` given to the tokenizer; longer texts are truncated.
MAX_LENGTH = 256

In [None]:
# Language pair to process; leave exactly one assignment active.
# The first code names the language whose articles are indexed and returned
# as `closest_*` columns, the second names the language used as queries.
# lang_pair = 'MDF-RU'
# lang_pair = 'MYV-RU'
lang_pair = 'MYV-MDF'

In [None]:
# Map the generic column names produced by the matching step onto
# language-prefixed names used in the exported spreadsheets.
# For a pair "A-B", A prefixes the `closest_*` columns and B prefixes the
# plain columns. The key order is significant: the five plain columns come
# first and the five `closest_*` columns last, and a later cell slices the
# values by position.
SUPPORTED_LANG_PAIRS = ('MDF-RU', 'MYV-RU', 'MYV-MDF')
ARTICLE_FIELDS = ('name', 'text', 'fn', 'date', 'link')

if lang_pair not in SUPPORTED_LANG_PAIRS:
    # Fail here with a clear message instead of a NameError on
    # `new_columns_dict` further down the notebook.
    raise ValueError(
        f'Unsupported lang_pair {lang_pair!r}; expected one of {SUPPORTED_LANG_PAIRS}'
    )

closest_lang_prefix, plain_lang_prefix = lang_pair.lower().split('-')

new_columns_dict = {
    **{field: f'{plain_lang_prefix}_{field}' for field in ARTICLE_FIELDS},
    **{f'closest_{field}': f'{closest_lang_prefix}_{field}' for field in ARTICLE_FIELDS},
}

In [None]:
# Split the renamed columns by position: the first five belong to the
# query-side ("large") language, the last five to the indexed ("small") one.
renamed_columns = list(new_columns_dict.values())
large_lang_columns = renamed_columns[:5]
small_lang_columns = renamed_columns[5:]

# Renamed text and file-name columns of the small language.
small_lang_text_column, small_lang_fn = (
    new_columns_dict[key] for key in ('closest_text', 'closest_fn')
)

# Load data

In [None]:
# Read the Russian article table, discard rows with any missing value and
# renumber the remaining rows from 0.
ru_names_df = (
    pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/ru_names_df.tsv', sep='\t')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) left after dropping incomplete rows.
ru_names_df.shape

In [None]:
# Preview the first rows of the table.
ru_names_df.head()

In [None]:
# Read the MDF article table, discard rows with any missing value and
# renumber the remaining rows from 0.
mdf_names_df = (
    pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/mdf_names_df.tsv', sep='\t')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) left after dropping incomplete rows.
mdf_names_df.shape

In [None]:
# Preview the first rows of the table.
mdf_names_df.head()

In [None]:
# Read the MYV article table, discard rows with any missing value and
# renumber the remaining rows from 0.
myv_names_df = (
    pd.read_csv(DATA_PATH_PREFIX + 'e-mordovia/myv_names_df.tsv', sep='\t')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) left after dropping incomplete rows.
myv_names_df.shape

In [None]:
# Preview the first rows of the table.
myv_names_df.head()

# Get closest samples

## Load models

In [None]:
if 'MDF' in lang_pair:
    # Tokenizer and encoder both come from the MDF_MODEL checkpoint;
    # the encoder is moved to the GPU right after loading.
    mdf_tokenizer = BertTokenizerFast.from_pretrained(MDF_MODEL)
    mdf_model = BertModel.from_pretrained(MDF_MODEL).cuda()

In [None]:
if 'MYV' in lang_pair:
    # Tokenizer and encoder both come from the MYV_MODEL checkpoint;
    # the encoder is moved to the GPU right after loading.
    myv_tokenizer = BertTokenizerFast.from_pretrained(MYV_MODEL)
    myv_model = BertModel.from_pretrained(MYV_MODEL).cuda()

## Get features

In [None]:
if 'RU' in lang_pair:
    # Russian texts are encoded with the same model as the other language of
    # the pair (the MDF model when the pair contains MDF, otherwise MYV).
    if 'MDF' in lang_pair:
        ru_tokenizer = mdf_tokenizer
        ru_model = mdf_model
    else:
        ru_tokenizer = myv_tokenizer
        ru_model = myv_model

    # Build the Python list of texts once instead of once per batch.
    ru_texts = ru_names_df['text'].values.tolist()
    ru_batches = []

    with torch.inference_mode():
        for st in range(0, len(ru_texts), BATCH_SIZE):
            toks = ru_tokenizer(
                ru_texts[st:st + BATCH_SIZE],
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH
            )
            model_output = ru_model(**toks.to(ru_model.device))
            # Normalise each pooled embedding to unit length.
            embs = torch.nn.functional.normalize(model_output.pooler_output)

            # Keep results as CPU tensors; no round trip through Python lists.
            ru_batches.append(embs.cpu())

        # One (n_texts, hidden_size) tensor; an empty tensor when there are no texts.
        ru_names_embs = torch.cat(ru_batches) if ru_batches else torch.Tensor([])

    # A bare expression inside `if` is not displayed by the notebook, so print it.
    print(ru_names_embs.shape)

In [None]:
if 'MDF' in lang_pair:
    # Build the Python list of texts once instead of once per batch.
    mdf_texts = mdf_names_df['text'].values.tolist()
    mdf_batches = []

    with torch.no_grad():
        for st in range(0, len(mdf_texts), BATCH_SIZE):
            toks = mdf_tokenizer(
                mdf_texts[st:st + BATCH_SIZE],
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH
            )

            model_output = mdf_model(**toks.to(mdf_model.device))
            # Normalise each pooled embedding to unit length.
            embs = torch.nn.functional.normalize(model_output.pooler_output)

            # Keep results as CPU tensors; no round trip through Python lists.
            mdf_batches.append(embs.cpu())

        # One (n_texts, hidden_size) tensor; an empty tensor when there are no texts.
        mdf_names_embs = torch.cat(mdf_batches) if mdf_batches else torch.Tensor([])

    # A bare expression inside `if` is not displayed by the notebook, so print it.
    print(mdf_names_embs.shape)

In [None]:
if 'MYV' in lang_pair:
    # Build the Python list of texts once instead of once per batch.
    myv_texts = myv_names_df['text'].values.tolist()
    myv_batches = []

    with torch.no_grad():
        for st in range(0, len(myv_texts), BATCH_SIZE):
            toks = myv_tokenizer(
                myv_texts[st:st + BATCH_SIZE],
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH
            )

            model_output = myv_model(**toks.to(myv_model.device))
            # Normalise each pooled embedding to unit length.
            embs = torch.nn.functional.normalize(model_output.pooler_output)

            # Keep results as CPU tensors; no round trip through Python lists.
            myv_batches.append(embs.cpu())

        # One (n_texts, hidden_size) tensor; an empty tensor when there are no texts.
        myv_names_embs = torch.cat(myv_batches) if myv_batches else torch.Tensor([])

    # A bare expression inside `if` is not displayed by the notebook, so print it.
    print(myv_names_embs.shape)

## Get most relevant pairs with hnswlib

In [None]:
def get_closest_articles(small_lang_df, small_lang_embs, large_lang_df, large_lang_embs, k=10):
    """Pair every large-language article with its k nearest small-language articles.

    The small-language embeddings are indexed with hnswlib and queried with
    the large-language embeddings.

    Args:
        small_lang_df: DataFrame with 'name', 'text', 'fn', 'date' and 'link'
            columns; row i must correspond to row i of `small_lang_embs`.
        small_lang_embs: 2-D tensor exposing `.numpy()`, one row per
            small-language article.
        large_lang_df: DataFrame of query articles; row i must correspond to
            row i of `large_lang_embs`.
        large_lang_embs: 2-D tensor exposing `.numpy()`, one row per query.
        k: neighbours requested per query; clamped to the number of indexed
            articles so that small collections do not make the query fail.

    Returns:
        A copy of `large_lang_df` repeated once per neighbour, with the
        neighbour's fields in `closest_name`, `closest_text`, `closest_fn`,
        `closest_date`, `closest_link`, and a `distance` column holding
        `1 - d`, where `d` is the distance reported by the index. The index
        is reset to 0..n-1.

    Note:
        Despite its name, `distance` is a similarity score (larger = closer).
        hnswlib documents its 'l2' space as squared Euclidean distance, so for
        unit-length embeddings the score equals `2 * cosine - 1` rather than
        the cosine itself.
    """
    small_vectors = small_lang_embs.numpy()
    large_vectors = large_lang_embs.numpy()
    n_items = small_vectors.shape[0]

    # Take the dimensionality from the data, and let the capacity grow past
    # the former fixed limit of 30000 when more items are supplied.
    index = hnswlib.Index(space='l2', dim=small_vectors.shape[1])
    index.init_index(max_elements=max(30000, n_items), ef_construction=200, M=16)
    index.add_items(small_vectors)

    labels, distances = index.knn_query(large_vectors, k=min(k, n_items))
    scores = 1 - distances

    # Items were added without explicit ids, so the labels are 0-based row
    # positions. Look them up by position: this stays correct even when
    # `small_lang_df` does not have a default RangeIndex.
    positions = labels.astype('int64')

    results = large_lang_df.copy()
    fields = ('name', 'text', 'fn', 'date', 'link')
    for field in fields:
        results[f'closest_{field}'] = small_lang_df[field].to_numpy()[positions].tolist()
    results['distance'] = scores.tolist()

    # One output row per (query article, neighbour) pair.
    list_columns = [f'closest_{field}' for field in fields] + ['distance']
    return results.explode(list_columns).reset_index(drop=True)


In [None]:
# Run the nearest-neighbour search for the selected pair. The first two
# arguments are the indexed language, the last two the query language.
if lang_pair == 'MDF-RU':
    exploded_results = get_closest_articles(mdf_names_df, mdf_names_embs, ru_names_df, ru_names_embs)
elif lang_pair == 'MYV-RU':
    exploded_results = get_closest_articles(myv_names_df, myv_names_embs, ru_names_df, ru_names_embs)
elif lang_pair == 'MYV-MDF':
    exploded_results = get_closest_articles(myv_names_df, myv_names_embs, mdf_names_df, mdf_names_embs)

# Filtering using scores

mdf-ru:
- сначала фильтровалось по 0.45. затем были провалидированы сэмплы для случаев, когда к одному тексту матчится только один русский текст
  - провалидировал для 5% (0.65) - есть правильные, но их не очень много
  - провалидировал для 2.5% (0.604) - много ложных

In [None]:
# Exploratory check: how many candidate pairs survive a given score threshold.
# for threshold in [0.6, 0.65, 0.7]:
#     print(exploded_results[exploded_results['distance'] > threshold].shape)

# NOTE(review): this reports the count for 0.3, while the filter applied in
# the next cell uses 0.25 — confirm which value is intended.
print(exploded_results[exploded_results['distance'] > 0.3].shape)

In [None]:
# Keep only candidate pairs whose score exceeds 0.25. `.copy()` makes the
# result an independent frame, so the column assignments in later cells do
# not write into a slice (pandas SettingWithCopyWarning).
exploded_results = exploded_results[exploded_results['distance'] > 0.25].copy()

In [None]:
# Rows remaining after the score filter.
exploded_results.shape

In [None]:
# Parse both date columns (values that do not match 'dd-mm-YYYY' become NaT),
# compute the gap in days, then store the dates back as 'dd-mm-YYYY' strings.
parsed_date = pd.to_datetime(exploded_results['date'], format='%d-%m-%Y', errors='coerce')
parsed_closest_date = pd.to_datetime(exploded_results['closest_date'], format='%d-%m-%Y', errors='coerce')

# `.dt.strftime` leaves missing dates as NaN, whereas calling `strftime` on
# an individual NaT raises. `assign` returns a new frame, so nothing is
# written into a filtered slice of an earlier frame.
exploded_results = exploded_results.assign(
    date=parsed_date.dt.strftime('%d-%m-%Y'),
    closest_date=parsed_closest_date.dt.strftime('%d-%m-%Y'),
    days_diff=(parsed_date - parsed_closest_date).dt.days,
)

In [None]:
# Keep pairs published strictly less than 30 days apart. Rows whose
# `days_diff` is missing fail the comparison and are dropped as well.
# `.copy()` gives later cells an independent frame to add columns to.
exploded_results = exploded_results[exploded_results['days_diff'].abs() < 30].copy()

In [None]:
# Rows remaining after the date-window filter.
exploded_results.shape

In [None]:
# A sentence boundary is a position right after '.', '!' or '?' that is
# followed (after optional whitespace) by an uppercase Cyrillic letter or the
# end of the text, and is not preceded by two dotted uppercase initials.
sentence_end_pattern = re.compile(r"(?<![А-ЯЁ]\.[А-ЯЁ]\.)(?<=[.!?])(?=\s*(?:[А-ЯЁЁ]|$))")

# Number of such boundaries in each query-side text.
exploded_results['text_sentences_count'] = exploded_results['text'].map(
    lambda article_text: sum(1 for _ in sentence_end_pattern.finditer(article_text))
)

In [None]:
# A sentence boundary is a position right after '.', '!' or '?' that is
# followed (after optional whitespace) by an uppercase Cyrillic letter or the
# end of the text, and is not preceded by two dotted uppercase initials.
closest_sentence_end_pattern = re.compile(r"(?<![А-ЯЁ]\.[А-ЯЁ]\.)(?<=[.!?])(?=\s*(?:[А-ЯЁЁ]|$))")

# Number of such boundaries in each matched (`closest_text`) text.
exploded_results['closest_text_sentences_count'] = exploded_results['closest_text'].map(
    lambda article_text: sum(1 for _ in closest_sentence_end_pattern.finditer(article_text))
)

In [None]:
# Relative difference between the two sentence counts:
# |a - b| / max(a, b), i.e. 0 when equal and close to 1 when very different.
sentence_counts = exploded_results[['text_sentences_count', 'closest_text_sentences_count']]
larger_sentence_count = sentence_counts.max(axis=1)
sentence_count_gap = (
    sentence_counts['closest_text_sentences_count'] - sentence_counts['text_sentences_count']
).abs()

# When neither text has a detected sentence boundary the denominator is 0;
# treat that pair as "no difference" instead of dividing by zero.
exploded_results['sentences_count_diff'] = (
    sentence_count_gap / larger_sentence_count.where(larger_sentence_count > 0)
).fillna(0.0)

In [None]:
# Character count of each query-side text.
exploded_results['text_length'] = exploded_results['text'].map(len)

In [None]:
# Character count of each matched (`closest_text`) text.
exploded_results['closest_text_length'] = exploded_results['closest_text'].map(len)

In [None]:
# Relative difference between the two text lengths:
# |a - b| / max(a, b), i.e. 0 when equal and close to 1 when very different.
text_lengths = exploded_results[['text_length', 'closest_text_length']]
larger_length = text_lengths.max(axis=1)
length_gap = (text_lengths['text_length'] - text_lengths['closest_text_length']).abs()

# Two empty texts would give a zero denominator; treat them as identical
# in length instead of dividing by zero.
exploded_results['length_diff'] = (
    length_gap / larger_length.where(larger_length > 0)
).fillna(0.0)

In [None]:
# Drop a pair only when it differs strongly on BOTH measures: sentence count
# (relative difference above 0.55) and text length (above 0.3).
differs_in_sentences = exploded_results['sentences_count_diff'] > 0.55
differs_in_length = exploded_results['length_diff'] > 0.3
exploded_results = exploded_results[~(differs_in_sentences & differs_in_length)]

In [None]:
# Rows remaining after the sentence-count / length filter.
exploded_results.shape

In [None]:
# The helper columns were only needed for filtering; remove them again.
filter_helper_columns = [
    'text_sentences_count', 'closest_text_sentences_count', 'sentences_count_diff',
    'text_length', 'closest_text_length', 'length_diff',
]
exploded_results = exploded_results.drop(columns=filter_helper_columns)

# Save for markup

Here we save all samples into two files.

The 1st file contains samples whose `small lang` text was matched exactly once; the 2nd file contains those matched twice or more.

In [None]:
# One row per distinct `closest_*` article; every other column becomes the
# list of values of the rows that matched it.
closest_key_columns = ['closest_name', 'closest_text', 'closest_fn', 'closest_date', 'closest_link']
results_gb = (
    exploded_results
    .groupby(closest_key_columns)
    .agg(list)
    .reset_index()
)

In [None]:
# Distribution of the number of candidates per `closest_*` article.
results_gb['text'].apply(len).value_counts()

In [None]:
# Groups with exactly one candidate, expanded back to one row per pair,
# ordered by ascending score and renamed to the language-specific columns.
candidate_columns = ['name', 'text', 'fn', 'date', 'link', 'distance', 'days_diff']
has_single_candidate = results_gb['text'].map(len) == 1

once_matched_samples = (
    results_gb[has_single_candidate]
    .explode(candidate_columns)
    .sort_values('distance')
    .rename(columns=new_columns_dict)
)

once_matched_samples.to_excel(DATA_PATH_PREFIX + f'e-mordovia/once_matched_samples_{lang_pair}.xlsx', index=False)

In [None]:
# Groups with two or more candidates. The previous `> 2` left groups with
# exactly two candidates out of both exported files.
# Rows are ordered by `closest_text`, then by descending score, so the best
# candidate of every group comes first.
several_matched_samples = results_gb[
    results_gb['text'].apply(len) > 1
].explode(
    ['name', 'text', 'fn', 'date', 'link', 'distance', 'days_diff']
).sort_values(['closest_text', 'distance'], ascending=[True, False])

several_matched_samples = several_matched_samples.rename(columns=new_columns_dict)

# The row index is written too (no `index=False` here); a later cell drops
# the 'Unnamed: 0' column that appears when this file is read back.
several_matched_samples.to_excel(DATA_PATH_PREFIX + f'e-mordovia/several_matched_samples_{lang_pair}.xlsx')

# Process semi-markup

At this point some samples have been labelled manually; the remaining groups can be labelled automatically when the best candidate's score exceeds the second-best by more than `delta`.

In [None]:
# The table used from here on must have an 'answer' column.
# To continue from a partially labelled spreadsheet, load it instead:
# semi_processed_several_matched = pd.read_excel(DATA_PATH_PREFIX + f'e-mordovia/several_matched_samples_{lang_pair}_semi_processed.xlsx')

import numpy as np

# Start from the unlabelled candidates: a copy with every 'answer' missing.
semi_processed_several_matched = several_matched_samples.assign(answer=np.nan)

In [None]:
# 'Unnamed: 0' only exists when the table was read back from a spreadsheet
# that was saved with its index. `errors='ignore'` keeps this cell working
# for the in-memory table and when the cell is run a second time.
semi_processed_several_matched = semi_processed_several_matched.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# (rows, columns) of the table to be labelled.
semi_processed_several_matched.shape

In [None]:
# Rows that have no 'answer' yet.
answer_is_missing = semi_processed_several_matched['answer'].isna()
unchecked = semi_processed_several_matched[answer_is_missing]

In [None]:
# Number of rows still waiting for an answer.
unchecked.shape

In [None]:
# One row per small-language article; the remaining columns are collected
# into per-group lists.
unchecked_gb = (
    unchecked
    .groupby(small_lang_columns)
    .agg(list)
    .reset_index()
)

In [None]:
# Minimum gap between the first and second score of a group for the first
# candidate to be accepted automatically.
delta = 0.1

In [None]:
# Auto-label a group when its first score beats the second by more than
# `delta`: the first candidate gets answer 1, all others 0.
# Groups with a single candidate have no runner-up and are left unlabelled
# (previously they raised IndexError).
# NOTE(review): assumes each group's scores are ordered best-first, as
# produced by the descending sort before export — confirm for tables loaded
# from a hand-edited spreadsheet.
has_clear_winner = unchecked_gb['distance'].apply(
    lambda scores: len(scores) > 1 and scores[0] - scores[1] > delta
).astype(bool)

# Skip the assignment entirely when no group qualifies.
if has_clear_winner.any():
    unchecked_gb.loc[has_clear_winner, 'answer'] = unchecked_gb.loc[
        has_clear_winner, 'distance'
    ].apply(lambda scores: [1] + [0] * (len(scores) - 1))

In [None]:
# Expand the grouped rows back to one row per pair and put them together with
# the rows that already had an answer; order by small-language text, best
# score first.
auto_processed_rows = unchecked_gb.explode(large_lang_columns + ['distance', 'days_diff', 'answer'])
already_answered_rows = semi_processed_several_matched[
    ~semi_processed_several_matched['answer'].isna()
]

several_matched = pd.concat(
    [auto_processed_rows, already_answered_rows]
).sort_values([small_lang_text_column, 'distance'], ascending=[True, False])

In [None]:
# (total rows, rows that still have no answer)
several_matched.shape[0], several_matched['answer'].isna().sum()

In [None]:
# Save the partially auto-labelled table (row index included) so the
# remaining rows can be labelled by hand.
several_matched.to_excel(DATA_PATH_PREFIX + f'e-mordovia/several_matched_samples_{lang_pair}_semi_processed_2.0.xlsx')

# process markup

At this point all samples have been labelled.

In [None]:
# Load the labelled single-candidate table.
# NOTE(review): expects an 'answer' column (used in the next cell) — confirm
# the spreadsheet has one.
once_matched = pd.read_excel(DATA_PATH_PREFIX + f'e-mordovia/once_matched_samples_{lang_pair}_processed.xlsx')
once_matched.shape

In [None]:
# Keep only the pairs whose answer is 1.
once_matched = once_matched.loc[once_matched['answer'].eq(1)]
once_matched.shape

In [None]:
# Load the labelled multi-candidate table; this replaces the in-memory
# `several_matched` built earlier in the notebook.
several_matched = pd.read_excel(DATA_PATH_PREFIX + f'e-mordovia/several_matched_samples_{lang_pair}_processed.xlsx')
several_matched.shape

In [None]:
# Keep only the pairs whose answer is 1.
several_matched = several_matched.loc[several_matched['answer'].eq(1)]
several_matched.shape

In [None]:
# All accepted pairs from both tables; the original row labels are kept.
all_matched = pd.concat([once_matched, several_matched])

In [None]:
# Total number of accepted pairs.
all_matched.shape

In [None]:
# Reverse mapping: language-specific column name -> generic column name.
inverse_new_columns_dict = dict(zip(new_columns_dict.values(), new_columns_dict.keys()))

In [None]:
# Switch back to the generic (`name`, `closest_name`, ...) column names.
all_matched = all_matched.rename(columns=inverse_new_columns_dict)

In [None]:
# Keep the ten article columns plus the score; everything else is dropped.
kept_columns = [*inverse_new_columns_dict.values(), 'distance']
all_matched = all_matched[kept_columns]

In [None]:
# One row per `closest_*` article with its accepted candidates as lists.
accepted_key_columns = ['closest_name', 'closest_text', 'closest_fn', 'closest_date', 'closest_link']
all_matched_gb = (
    all_matched
    .groupby(accepted_key_columns)
    .agg(list)
    .reset_index()
)

In [None]:
# Number of distinct `closest_*` articles with at least one accepted pair.
all_matched_gb.shape

In [None]:
# Build {closest_fn: {'article': {...}, 'candidates': [{...}, ...]}}.
# 'article' holds the `closest_*` fields without the prefix; each candidate
# holds the plain fields of one accepted pair plus its 'distance'.
article_fields = ('name', 'text', 'fn', 'date', 'link')
candidate_fields = article_fields + ('distance',)

dict_with_matches = {}

for _, grouped_row in all_matched_gb.iterrows():
    candidate_values = zip(*(grouped_row[field] for field in candidate_fields))

    dict_with_matches[grouped_row['closest_fn']] = {
        'article': {field: grouped_row[f'closest_{field}'] for field in article_fields},
        'candidates': [dict(zip(candidate_fields, values)) for values in candidate_values],
    }

In [None]:
import json

# The JSON is written with non-ASCII characters unescaped, so fix the file
# encoding to UTF-8 instead of relying on the platform default.
with open(DATA_PATH_PREFIX + f'e-mordovia/{lang_pair}.json', 'w', encoding='utf-8') as f:
    json.dump(dict_with_matches, f, ensure_ascii=False)

## split train/dev/test

In [None]:
import numpy as np

In [None]:
# Articles dated before this go to the train split.
# NOTE(review): the name suggests the 90th percentile of article dates —
# confirm how the date was chosen.
percentile_90 = pd.Timestamp('2024-02-01')

In [None]:
# Articles dated before this (and not earlier than the train cut-off) go to
# the dev split; the rest go to test.
# NOTE(review): the name suggests the 95th percentile of article dates —
# confirm how the date was chosen.
percentile_95 = pd.Timestamp('2024-05-25')

In [None]:
import json

# The file contains unescaped non-ASCII text; read it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(DATA_PATH_PREFIX + f'e-mordovia/{lang_pair}.json', 'r', encoding='utf-8') as f:
    dict_with_matches = json.load(f)

In [None]:
# Chronological split by the date of the 'article' entry:
# before percentile_90 -> train, before percentile_95 -> dev, otherwise test.
# A date that cannot be parsed becomes NaT, fails both comparisons and
# therefore ends up in test.
train_matches, val_matches, test_matches = {}, {}, {}

for fn, data in dict_with_matches.items():
    article_date = pd.to_datetime(data['article']['date'], format='%d-%m-%Y', errors='coerce')

    if article_date < percentile_90:
        split_bucket = train_matches
    elif article_date < percentile_95:
        split_bucket = val_matches
    else:
        split_bucket = test_matches

    split_bucket[fn] = data

In [None]:
# Number of entries in the train split.
len(train_matches)

In [None]:
# Number of entries in the dev split.
len(val_matches)

In [None]:
# Number of entries in the test split.
len(test_matches)

In [None]:
# Write the train split; UTF-8 is set explicitly because the JSON keeps
# non-ASCII characters unescaped.
with open(DATA_PATH_PREFIX + f'e-mordovia/{lang_pair}_train.json', 'w', encoding='utf-8') as f:
    json.dump(train_matches, f, ensure_ascii=False)

In [None]:
# Write the dev split; UTF-8 is set explicitly because the JSON keeps
# non-ASCII characters unescaped.
with open(DATA_PATH_PREFIX + f'e-mordovia/{lang_pair}_dev.json', 'w', encoding='utf-8') as f:
    json.dump(val_matches, f, ensure_ascii=False)

In [None]:
# Write the test split; UTF-8 is set explicitly because the JSON keeps
# non-ASCII characters unescaped.
with open(DATA_PATH_PREFIX + f'e-mordovia/{lang_pair}_test.json', 'w', encoding='utf-8') as f:
    json.dump(test_matches, f, ensure_ascii=False)