In [1]:
import os,sys,gc

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from glob import glob
import pickle

from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

#Logging
import logging

# Notebook-wide logger: every record is written both to 'logs.log' and to the
# cell output, using the same '[time][level] message' format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another pair of handlers on it: each message
# would be emitted twice and the previous file handle would stay open.
# Drop and close whatever handlers are already attached before adding new ones.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# File sink.
file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Console / cell-output sink.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)



# Data Load

In [4]:
#PATH
# Directories holding the training split and the test split.
TRAIN_DIR, TEST_DIR = (
    Path('/home/data/train_large'),
    Path('/home/data/ebnerd_testset'),
)

In [5]:

# Load the article table from parquet into a pandas DataFrame.
articles = pd.read_parquet('/home/data/articles.parquet')

In [6]:
#text=title + subtitle + body
# String concatenation with a missing value (NaN/None) yields NaN for the whole
# row, which would put a non-string entry into the text column. Treat a missing
# title, subtitle or body as an empty string so every row gets a real string.
articles['text'] = (
    articles['title'].fillna('')
    + ' ' + articles['subtitle'].fillna('')
    + ' ' + articles['body'].fillna('')
)

In [7]:
# Pull the ids and the concatenated texts out as plain Python lists;
# both come from the same frame, so they are in the same row order.
article_id_list, article_text_list = (
    articles[column].tolist() for column in ('article_id', 'text')
)

In [8]:
# Number of rows in the article table.
len(articles)

125541

In [9]:
# Number of texts that will be encoded.
len(article_text_list)

125541

In [10]:
#!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [12]:
# Model identifier in '<organisation>/<model>' form.
model_name = "intfloat/multilingual-e5-large-instruct"
# Short tag used in output paths: the part after the last '/'.
# Derived from model_name so the two cannot drift apart when the model is changed.
model_tag = model_name.rsplit('/', 1)[-1]

In [13]:
# Instantiate the sentence-transformer identified by model_name.
model = SentenceTransformer(model_name)

In [14]:
# Encode every article text in batches of 128, showing a progress bar.
encode_options = {'batch_size': 128, 'show_progress_bar': True}
embeddings = model.encode(article_text_list, **encode_options)

Batches:   0%|          | 0/981 [00:00<?, ?it/s]

In [15]:
# Wrap the embeddings in a DataFrame and label each column 'vector_<position>'.
vec_df = pd.DataFrame(embeddings).add_prefix('vector_')

In [16]:
# Attach the article ids as the first column, stored as int32.
#
# The id column is built explicitly and any 'article_id' column left over from a
# previous run of this cell is dropped first, so re-running the cell gives the
# same frame instead of a second, duplicate 'article_id' column.
vector_columns = [column for column in vec_df.columns if column != 'article_id']
# Building the Series on vec_df's index raises ValueError if the number of ids
# does not match the number of embedding rows.
article_ids = pd.Series(
    article_id_list, index=vec_df.index, name='article_id'
).astype('int32')
vec_df = pd.concat([article_ids, vec_df[vector_columns]], axis=1)
vec_df

Unnamed: 0,article_id,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,vector_8,...,vector_1014,vector_1015,vector_1016,vector_1017,vector_1018,vector_1019,vector_1020,vector_1021,vector_1022,vector_1023
0,3000022,0.041592,0.036939,-0.028729,-0.047646,0.039175,-0.014492,-0.020138,0.045008,0.009312,...,-0.024985,-0.028937,-0.016903,-0.020801,-0.005191,0.021026,0.011858,-0.052957,-0.016096,0.014601
1,3000063,0.033269,0.016943,-0.024200,-0.029367,0.020860,-0.019021,-0.029731,0.032847,0.036841,...,-0.013013,-0.033814,-0.003223,-0.022727,-0.000225,0.025412,-0.026295,-0.045524,-0.006492,0.015536
2,3000613,0.031512,0.012187,-0.029119,-0.044338,0.028903,-0.012179,-0.014930,0.017124,0.041693,...,-0.010633,-0.021796,0.007381,-0.006376,-0.020152,0.018552,0.017019,-0.043418,-0.005032,0.018726
3,3000700,0.032644,0.017983,-0.025564,-0.014386,0.020997,-0.007014,-0.015845,0.023249,0.026944,...,-0.009483,-0.026856,-0.013847,-0.030667,-0.003847,0.038637,0.044005,-0.030810,-0.020270,0.034830
4,3000840,0.034178,0.066137,-0.036457,-0.043256,0.025989,-0.028042,-0.020350,0.015908,0.025811,...,-0.014931,-0.020730,0.017001,-0.008465,0.010567,0.000845,0.017410,-0.058425,-0.019666,0.003741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125536,9803505,0.047599,0.027253,-0.019702,-0.050925,0.029644,0.020419,-0.019647,0.026762,0.030372,...,-0.039689,-0.033923,0.016548,-0.031708,0.005191,-0.012108,-0.023511,-0.046316,-0.015116,-0.002053
125537,9803510,0.023637,0.032511,-0.022845,-0.049562,0.022435,-0.011798,-0.039580,0.027108,0.051269,...,-0.012995,-0.047795,0.003099,-0.041378,0.020139,0.024213,0.000836,-0.062128,-0.056860,0.035667
125538,9803525,0.014563,0.028312,-0.025777,-0.026197,0.003721,-0.004705,-0.019182,0.048098,0.042244,...,-0.014981,-0.034015,-0.007030,-0.049283,-0.008183,0.033857,-0.006020,-0.045291,-0.018446,0.006979
125539,9803560,0.036995,0.024908,-0.015726,-0.050903,0.041055,-0.012985,-0.010060,0.042424,0.029701,...,-0.027996,-0.013542,-0.008119,-0.016129,0.025896,0.034681,0.024347,-0.029181,-0.014498,0.032150


In [17]:
# Output directory for this model's artifacts, named after model_tag.
SAVE_DIR = f'/home/data/{model_tag}'

In [18]:
#make dir
# Create the output directory (and any missing parents); no error if it already exists.
Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)

In [20]:
#save vec_df
# Destination file: <SAVE_DIR>/<model_tag>_vec_df.parquet
vec_parquet_path = f'{SAVE_DIR}/{model_tag}_vec_df.parquet'
vec_df.to_parquet(vec_parquet_path)

In [21]:
# (rows, columns) of the saved frame: one row per article, id column plus vector columns.
vec_df.shape

(125541, 1025)

In [22]:
# Shape of a separately stored parquet file of contrastive vectors.
# NOTE(review): presumably shown to compare its row count with vec_df — confirm intent.
pd.read_parquet('/home/data/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet').shape

(125541, 2)