In [1]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub

In [2]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path
from collections import defaultdict

import pandas as pd
import numpy as np

In [3]:
# Embedding configuration shared by the cells below.
EMB_NAME = "smaller_LaBSE_15lang"  # column prefix and output sub-directory name
EMB_SIZE = 768    # number of columns allocated per embedding row
BATCH_SIZE = 128  # number of texts sent to the model per call

In [4]:
# Load the smaller LaBSE (15 languages) encoder and its matching preprocessor
# from tfhub.dev, each wrapped as a Keras layer.
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Build a Keras model that maps a batch of raw strings to one L2-normalized
# vector per string.
# `shape=()` declares one scalar string per example.
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
# The preprocessor turns the raw strings into the dict of inputs fed to the encoder.
encoder_inputs = preprocessor(sentences)
# The encoder returns a dict; only its "pooled_output" entry is used here.
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # unit-length rows, so a dot product equals cosine similarity
# `model` is the callable used by later cells to embed batches of text.
model = tf.keras.Model(sentences, normalized_sentence_representation)
model.summary()

2022-07-22 13:58:26.247176: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-22 13:58:26.279620: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-22 13:58:26.279784: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-22 13:58:26.280549: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sentences (InputLayer)         [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_type_ids':   0           ['sentences[0][0]']              
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [5]:
# Read the original train and test CSVs and index each frame by its `id` column.
train = (
    pd.read_csv(cfg.ORIG_TRAIN_PATH)
    .set_index('id')
)
test = (
    pd.read_csv(cfg.ORIG_TEST_PATH)
    .set_index('id')
)

In [6]:
def get_embedding(text: pd.Series, emb_name='') -> pd.DataFrame:
    """Encode every entry of `text` with the global `model`, one batch at a time.

    Parameters
    ----------
    text : pd.Series
        Texts to embed. Its index is reused as the index of the result.
    emb_name : str, optional
        Prefix for the output column names (`{emb_name}_0`, `{emb_name}_1`, ...).

    Returns
    -------
    pd.DataFrame
        Frame of shape `(len(text), EMB_SIZE)` holding one embedding per row,
        aligned with `text.index`.
    """
    n = len(text)
    # Preallocate the full result so each batch is written in place.
    embeddings = np.zeros(shape=(n, EMB_SIZE))
    # No explicit `total`: tqdm reads it from len(range(...)), which also counts
    # the final partial batch. `n // BATCH_SIZE` under-reported the number of
    # batches whenever n was not a multiple of BATCH_SIZE.
    for start in tqdm(range(0, n, BATCH_SIZE)):
        stop = start + BATCH_SIZE
        # Local name `batch` avoids shadowing the global `sentences` input tensor.
        batch = tf.constant(text.iloc[start:stop].tolist())
        # Slicing past the end is clipped, so the last (short) batch fits as well.
        embeddings[start:stop, :] = model(batch)
    embeddings = pd.DataFrame(
        embeddings,
        columns=[f'{emb_name}_{c}' for c in range(EMB_SIZE)],
        index=text.index)
    return embeddings

In [7]:
# Embed the text column of both splits; output columns are prefixed with EMB_NAME.
train_embeddings = get_embedding(text=train[cfg.TEXT_COL], emb_name=EMB_NAME)
test_embeddings = get_embedding(text=test[cfg.TEXT_COL], emb_name=EMB_NAME)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [8]:
# Output directory for this embedding: <cfg.DATA_PATH>/<EMB_NAME>.
emb_path = os.path.join(cfg.DATA_PATH, EMB_NAME)
# NOTE(review): `check_path` is imported from the project `helper` module;
# presumably it creates the directory when it is missing — confirm.
check_path(emb_path)

In [9]:
# Save both embedding frames as pickle files inside `emb_path`.
train_pickle_path = os.path.join(emb_path, 'train.pkl')
test_pickle_path = os.path.join(emb_path, 'test.pkl')
train_embeddings.to_pickle(train_pickle_path)
test_embeddings.to_pickle(test_pickle_path)