In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Directory holding the input CSV files.
input_dir = '/root/kaggle/input_dir/'

# Read the three source tables in one pass: topics, contents and the
# topic -> content correlations (one row per topic, ids space-separated).
topic_df, content_df, corr_df = (
    pd.read_csv(input_dir + csv_name)
    for csv_name in ('topics.csv', 'content.csv', 'correlations.csv')
)

In [3]:
# Turn the space-separated id string of every topic into a list, then emit
# one (topic_id, content_id) row per list element with a fresh 0..n-1 index.
content_id_lists = pd.Series(
    [ids.split() for ids in corr_df['content_ids']],
    index=corr_df.index,
)
corr_df = (
    corr_df
    .assign(content_ids=content_id_lists)
    .explode('content_ids')
    .reset_index(drop=True)
)

In [4]:
# Build the text representation of each topic ("title [SEP] description",
# missing fields replaced by empty strings) and keep only the columns needed
# for pairing.
# NOTE: topic_df is overwritten with the reduced frame, so this cell cannot be
# re-run without reloading the CSVs first.
topic_df = (
    topic_df
    .fillna('')
    .assign(topic_full_text=lambda t: t['title'] + ' [SEP] ' + t['description'])
    [['id', 'topic_full_text', 'language']]
)

# Attach the topic text/language to every positive (topic, content) pair.
df = (
    corr_df
    .merge(topic_df, how='left', left_on='topic_id', right_on='id')
    [['topic_id', 'content_ids', 'topic_full_text', 'language']]
    .rename(columns={'language': 'topic_language'})
)

In [5]:
# Build the text representation of each content item
# ("title [SEP] description [SEP] text", missing fields as empty strings)
# and keep only the columns needed for pairing.
# NOTE: content_df is overwritten with the reduced frame, so this cell cannot
# be re-run without reloading the CSVs first.
content_df = (
    content_df
    .fillna('')
    .assign(
        content_full_text=lambda c: (
            c['title'] + ' [SEP] ' + c['description'] + ' [SEP] ' + c['text']
        )
    )
    [['id', 'content_full_text', 'language']]
)

# Attach the content text/language to every pair; all rows built so far come
# from the correlations table, so they are the positive examples (label 1).
df = (
    df
    .merge(content_df, how='left', left_on='content_ids', right_on='id')
    .rename(columns={'language': 'content_language'})
    .assign(label=1)
)

In [6]:
# Preview the first rows of the merged positive (topic, content) pairs.
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language,id,content_full_text,content_language,label
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите [SEP] Изследване на...,bg,c_1108dd0c7a5d,Молив като резистор [SEP] Моливът причинява пр...,bg,1
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите [SEP] Изследване на...,bg,c_376c5a8eb028,Да чуем променливото съпротивление [SEP] Тук ч...,bg,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите [SEP] Изследване на...,bg,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив...,bg,1
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите [SEP] Изследване на...,bg,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,bg,1
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função [SEP] Entenda ...,pt,c_639ea2ef9c95,Dados e resultados de funções: gráficos [SEP] ...,pt,1


## Random negative sampling by language

In [8]:
# Placeholder for the negative samples; rebound to a DataFrame once the
# sampling below has finished.
neg_df = []
# Upper bound on the number of negatives drawn per positive (topic, content) row.
sample_n = 20
# How many of those draws come from contents in the topic's own language; the
# remaining sample_n - sample_from_same_lamguage come from other languages.
sample_from_same_lamguage  = 16
# Local directory the mdeberta-v3-base tokenizer is loaded from.
path = r'/root/kaggle/input_dir/model/mdeberta-v3-base'
# NOTE(review): the tokenizer is not referenced by the sampling code visible in
# this notebook — confirm it is still needed here.
tokenizer = AutoTokenizer.from_pretrained(path)
def negative_smaple(x, candidates, n=None, random_state=None):
    """Draw random candidate pairs written in the same language as a topic.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to one topic. The topic language is read from the
        first row of its ``topic_language`` column.
    candidates : pandas.DataFrame
        Pool to sample from; must provide the columns ``topic_full_text``,
        ``content_full_text`` and ``content_language``.
    n : int, optional
        Number of rows to draw. Defaults to the module-level ``sample_n``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        The ``topic_full_text`` / ``content_full_text`` columns of at most
        ``n`` same-language candidates (fewer when the pool is smaller).
    """
    if n is None:
        n = sample_n
    # Positional access: `x` is typically a slice of a larger frame, so its
    # index need not contain the label 0 and `x[...][0]` would raise KeyError.
    topic_language = x['topic_language'].iloc[0]
    same_language = candidates[candidates['content_language'] == topic_language]
    # Cap the request so a small language pool yields fewer rows instead of
    # making `sample` raise ValueError.
    n = min(n, len(same_language))
    return same_language[['topic_full_text', 'content_full_text']].sample(
        n=n, random_state=random_state
    )


# Random negative sampling.
# For every positive (topic, content) row, draw a few contents written in the
# topic's language and a few written in any other language, drop the ones that
# are actually positives of that topic, and label the rest 0.
# Reads `df`, `sample_n` and `sample_from_same_lamguage` from earlier cells and
# leaves the result in `neg_df`.
NEG_SAMPLING_SEED = 42  # fixed seed so a re-run reproduces the same negatives
PAIR_COLUMNS = ['topic_full_text', 'content_full_text']


def _draw_negatives(pool, n, positive_texts, topic_id, topic_text, rng):
    """Sample up to `n` rows of `pool` and turn them into label-0 pairs of one topic."""
    # Cap by the pool size: `sample` raises ValueError when asked for more
    # rows than exist (the other-language branch used to skip this cap).
    drawn = pool.sample(n=min(len(pool), n), random_state=rng)
    # Positives are removed after the draw, so fewer than `n` rows may remain.
    drawn = drawn[~drawn['content_full_text'].isin(positive_texts)]
    # `assign` returns a new frame, avoiding writes into a filtered slice.
    return drawn.assign(topic_full_text=topic_text, topic_id=topic_id, label=0)


def _language_pools(pairs, language, cache):
    """Return the (same-language, other-language) candidate frames, built once per language."""
    # NaN never equals itself, so give a missing language one stable cache key.
    key = None if pd.isna(language) else language
    if key not in cache:
        same_mask = pairs['content_language'] == language
        cache[key] = (
            pairs.loc[same_mask, PAIR_COLUMNS],
            pairs.loc[~same_mask, PAIR_COLUMNS],
        )
    return cache[key]


neg_rng = np.random.RandomState(NEG_SAMPLING_SEED)
n_same_language = sample_from_same_lamguage
n_other_language = sample_n - sample_from_same_lamguage
pool_cache = {}
neg_parts = []

# groupby(sort=False) visits the topics in order of first appearance and hands
# over each topic's rows directly, instead of re-filtering `df` per topic.
for topic_id, sub_df in tqdm(df.groupby('topic_id', sort=False)):
    topic_language = sub_df['topic_language'].iloc[0]
    same_pool, other_pool = _language_pools(df, topic_language, pool_cache)
    positive_texts = set(sub_df['content_full_text'])
    topic_texts = sub_df['topic_full_text'].to_list()

    # One independent draw per positive row: first all same-language draws of
    # the topic, then all other-language draws.
    neg_parts.extend(
        _draw_negatives(same_pool, n_same_language, positive_texts, topic_id, text, neg_rng)
        for text in topic_texts
    )
    neg_parts.extend(
        _draw_negatives(other_pool, n_other_language, positive_texts, topic_id, text, neg_rng)
        for text in topic_texts
    )

if neg_parts:
    # Separate draws can return the same pair; keep a single copy of each.
    neg_df = pd.concat(neg_parts).drop_duplicates()
else:
    # No positive pairs at all: still expose an empty frame with the usual columns.
    neg_df = pd.DataFrame(columns=PAIR_COLUMNS + ['topic_id', 'label'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 8/61517 [00:03<6:25:36,  2.66it/s]


KeyboardInterrupt: 

In [38]:
# Display the sampled negative pairs (columns: topic text, content text, topic_id, label).
neg_df

Unnamed: 0,topic_full_text,content_full_text,topic_id,label
215841,Откриването на резисторите [SEP] Изследване на...,"Отношения, представени с лентови диаграми [SEP...",t_00004da3a1b2,0
101092,Откриването на резисторите [SEP] Изследване на...,Ъгли - основни понятия [SEP] Определи кой ъгъл...,t_00004da3a1b2,0
250766,Откриването на резисторите [SEP] Изследване на...,Умножение на две обикновени дроби с помощта на...,t_00004da3a1b2,0
103993,Откриването на резисторите [SEP] Изследване на...,Последователно свързани резистори [SEP] Резист...,t_00004da3a1b2,0
49535,Откриването на резисторите [SEP] Изследване на...,Артериолосклероза - част 2 [SEP] Виж как хипер...,t_00004da3a1b2,0
...,...,...,...,...
194656,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,7. المتغيرات [SEP] [SEP],t_fffe811a6da9,0
67934,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,Level 2 : Remainder and factor theorem [SEP] ...,t_fffe811a6da9,0
203306,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,Problemas de subtração até 10 [SEP] Sal resolv...,t_fffe811a6da9,0
165147,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,Tatouage Henné : Motif Simple 4-9 [SEP] [SEP]...,t_fffe811a6da9,0


In [39]:
# Write the sampled negatives to a Parquet file (relative path, so it lands in
# the current working directory).
# NOTE(review): the recorded run of the sampling cell ends in KeyboardInterrupt —
# confirm neg_df comes from a complete run before relying on this file.
neg_df.to_parquet('random_negative_for_recall_exp4.parquet')