# Import dependencies

In [1]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm

import tensorflow as tf
import tensorflow_addons as tfa

from transformers import BertConfig, BertModel, RobertaModel
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaModel


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# Load Model

In [24]:
# Load the RoBERTa tokenizer from the local CodeBERT checkpoint directory.
tokenizer_dir = "./codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_dir)

In [8]:
# Load the TensorFlow RoBERTa model weights from the local CodeBERT checkpoint directory.
model_dir = "./codebert-base/"
model = TFRobertaModel.from_pretrained(model_dir)

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ./codebert-base/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [33]:
# Embed one sample lambda term end-to-end as a quick check of the pipeline.
le_tokens = tokenizer.tokenize("(λx.(((λy.y) (λz.(λa.(λb.a)))) (λc.c)))")

# Surround the term's tokens with the tokenizer's CLS and EOS special tokens.
tokens = [tokenizer.cls_token, *le_tokens, tokenizer.eos_token]

# Nesting the id list once more builds the (1, seq_len) int32 batch directly,
# which is the same tensor as a 1-D constant expanded along axis 0.
tokens_ids = tf.constant([tokenizer.convert_tokens_to_ids(tokens)], dtype=tf.int32)

context_embeddings = model(tokens_ids)

# Display the first model output (one embedding vector per input token).
context_embeddings[0]

<tf.Tensor: shape=(1, 30, 768), dtype=float32, numpy=
array([[[-0.06790983,  0.16003864, -0.08113418, ..., -0.16317725,
         -0.47541702,  0.40393323],
        [-0.49890435,  0.0894645 , -0.03886295, ...,  0.36657208,
         -0.54184794,  0.2876726 ],
        [-0.37728477, -0.04498858,  0.25325608, ..., -0.7006783 ,
         -0.53978413,  0.34478602],
        ...,
        [-0.42253712, -0.1807437 ,  0.18520138, ..., -0.792792  ,
         -0.53018826,  0.37685454],
        [-0.28178942, -0.02010319, -0.01168358, ..., -0.11536087,
         -0.4393902 ,  0.8512744 ],
        [-0.06779005,  0.16013034, -0.08106291, ..., -0.16288999,
         -0.47560245,  0.4040114 ]]], dtype=float32)>

In [41]:
# Convert the first model output to NumPy and drop the batch axis (batch size is 1),
# leaving a 2-D matrix with one row per token.
batch_embeddings = np.array(context_embeddings[0])
batch_embeddings[0]

array([[-0.06790983,  0.16003864, -0.08113418, ..., -0.16317725,
        -0.47541702,  0.40393323],
       [-0.49890435,  0.0894645 , -0.03886295, ...,  0.36657208,
        -0.54184794,  0.2876726 ],
       [-0.37728477, -0.04498858,  0.25325608, ..., -0.7006783 ,
        -0.53978413,  0.34478602],
       ...,
       [-0.42253712, -0.1807437 ,  0.18520138, ..., -0.792792  ,
        -0.53018826,  0.37685454],
       [-0.28178942, -0.02010319, -0.01168358, ..., -0.11536087,
        -0.4393902 ,  0.8512744 ],
       [-0.06779005,  0.16013034, -0.08106291, ..., -0.16288999,
        -0.47560245,  0.4040114 ]], dtype=float32)

In [42]:
# Table view of the embeddings for the single input in the batch:
# one row per token position, one column per embedding dimension.
embedding_matrix = np.array(context_embeddings[0])[0]
data_pd = pd.DataFrame(embedding_matrix)

In [43]:
# Preview the first rows of the embedding table built from the model output.
data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.06791,0.160039,-0.081134,0.205889,0.178483,-0.302993,-0.000539,0.058517,0.14841,-0.055308,...,0.189717,-0.148141,-0.36075,0.850089,-0.254381,0.081455,0.470585,-0.163177,-0.475417,0.403933
1,-0.498904,0.089465,-0.038863,0.471839,0.549624,-0.0204,-0.081514,0.001333,0.167236,0.42453,...,0.079803,-0.224443,-0.45626,0.687729,-0.450908,0.326975,0.926596,0.366572,-0.541848,0.287673
2,-0.377285,-0.044989,0.253256,0.346877,-0.179055,-0.113545,-0.208201,0.444694,-0.172227,0.245694,...,0.217623,0.176998,-0.444811,0.802039,-0.721672,0.29054,0.887376,-0.700678,-0.539784,0.344786
3,0.10651,0.293196,-0.173621,0.339994,0.551803,0.320187,0.032758,-0.033515,0.124311,0.267653,...,0.13829,-0.078387,-0.224901,0.692784,-0.549351,0.138192,0.662575,0.655478,-0.375942,0.214235
4,-0.173781,-0.160438,-0.471957,0.479068,0.807183,-0.481921,0.157651,-0.364095,0.079632,0.050947,...,0.015211,-0.149803,-0.965889,0.999199,-0.666296,0.372833,0.841991,0.150965,-0.766177,0.294903


In [46]:
# Smoke-test the CSV export with the sample term's embeddings.
# `DataFrame.to_csv` does not create missing parent directories, so create the
# target directory first; otherwise this cell fails on a fresh checkout.
test_csv_path = "term_embeddings/embeddings_4k/test.csv"
os.makedirs(os.path.dirname(test_csv_path), exist_ok=True)
data_pd.to_csv(test_csv_path)

In [36]:
# Print the Keras layer / parameter-count summary of the loaded model.
model.summary()

Model: "tf_roberta_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 124645632 
 )                                                               
                                                                 
Total params: 124,645,632
Trainable params: 124,645,632
Non-trainable params: 0
_________________________________________________________________


# Load Data

In [45]:
# Prepare the ~4k terms: read the table that holds, per term, its string form
# (`vars_terms`) and the step-count columns `RI_steps_num` / `LO_steps_num`.
all_data = pd.read_csv(
    "../tests_17_ann_and_terms/article_v2_2_term_var_to_best_strategy/prepare_data/data_steps/steps_vars_term_str.csv",
    delimiter=',',
)

# Basic statistics of the loaded table.
# NOTE: no de-duplication is performed here; every CSV row is kept in file order,
# so a term's position in `terms_data` equals its row position in `all_data`.
print(f"Count all terms: {len(all_data)}")
# `Series.max()` is vectorised and skips NaN, unlike the builtin `max()`.
print(f"max RI steps count: {all_data['RI_steps_num'].max()}")
print(f"max LO steps count: {all_data['LO_steps_num'].max()}")

# Plain Python list of term strings, consumed by the embedding loop.
terms_data = all_data["vars_terms"].tolist()

Count all terms: 4282
max RI steps count: 1000
max LO steps count: 219


In [47]:
# Embed every term with the loaded model and write one CSV per term.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output directory exists before the (long-running) loop starts.
embeddings_dir = "term_embeddings/embeddings_4k"
os.makedirs(embeddings_dir, exist_ok=True)

# NOTE(review): no truncation is applied, so this assumes every tokenised term
# fits within the model's maximum sequence length — TODO confirm for the dataset.
# tqdm wraps `terms_data` itself (not the `enumerate` generator) so it can read
# the total length and show a real progress bar with an ETA.
for term_inx, term_str in enumerate(tqdm(terms_data)):
    # Tokenise the term and surround it with the CLS / EOS special tokens.
    le_tokens = tokenizer.tokenize(term_str)
    tokens = [tokenizer.cls_token] + le_tokens + [tokenizer.eos_token]

    # Keep the Python id list and the batched tensor under separate names.
    term_token_ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_ids = tf.expand_dims(tf.constant(term_token_ids, dtype=tf.int32), axis=0)

    context_embeddings = model(tokens_ids)

    # First model output, batch entry 0: one row per token of this term.
    data_pd = pd.DataFrame(np.array(context_embeddings[0])[0])
    data_pd.to_csv(f"{embeddings_dir}/term_embeddings_{term_inx}.csv")

4282it [21:25,  3.33it/s]
