In [1]:
import sys
import torch
# Put the local ./modules directory on the import path. The entry is relative,
# so it is resolved against the kernel's current working directory.
sys.path.append('./modules')

# Project-local helpers.
# NOTE(review): `composition` is not referenced by name in the cells below;
# presumably importing it is what makes the custom 'composition' pipeline task
# available -- confirm in the `composition` module.
from modules import composition, comp_func
from transformers import AutoModel, AutoTokenizer, pipeline

In [2]:
# Checkpoint name shared by the tokenizer and the encoder model.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
# Build the 'composition' pipeline around the model and tokenizer loaded above.
pipe = pipeline(
    'composition',
    model=model,
    tokenizer=tokenizer,
)

In [4]:
# Composition function ids:
#   - 'sum'         : Vector sum
#   - 'avg'         : Global average
#   - 'icds_avg'    : Average by ICDS function
#   - 'ind'         : F independent
#   - 'inf'         : F information
#   - 'jnt'         : F joint

# Run the pipeline on a sentence supplied as a list of separate words.
output = pipe(
    ['Esto', 'es', 'una', 'frase', 'por', 'palabras'],
    comp_fun='sum',
)

In [5]:
# Word embeddings: wrap the pipeline result in a tensor for display.
# The tensor shown below has one row per word passed to the pipeline.
torch.tensor(output)

tensor([[-1.0533, -0.8754, -1.0761,  ..., -0.8785, -0.7937,  1.8503],
        [-0.8066, -0.7124, -0.5085,  ...,  0.1398,  0.6337,  0.0990],
        [-0.5062, -0.5618, -0.2220,  ...,  0.3578,  0.7608,  0.1234],
        [-0.3052, -1.0076,  0.2574,  ...,  0.3287, -0.3511,  1.3365],
        [-0.2550, -0.4451, -0.1506,  ..., -0.2118, -0.3686,  0.3944],
        [-1.7845, -0.3357,  0.0446,  ...,  0.0039, -0.0847, -0.1124]])

In [6]:
# Compose the sentence from its constituent words.
# NOTE(review): this rebinds `output` to the composed result, so the cell is not
# idempotent -- re-running it feeds the composed value back into `compose`.
# Re-run the cell that builds the word embeddings first.
output = comp_func.compose(output, 'sum')

In [7]:
# Sentence embedding: the word vectors combined by the 'sum' composition.
torch.tensor(output)

tensor([-4.7109e+00, -3.9379e+00, -1.6552e+00, -3.6846e+00,  5.4309e+00,
         3.9793e+00,  3.8690e+00,  8.6284e+00, -9.4452e-02, -6.7026e+00,
        -1.3813e-01, -2.5763e-01,  3.0247e+00,  3.3740e+00, -2.3671e+00,
         2.0679e+00,  2.2972e+00,  2.8383e-01,  2.3354e-01,  3.4180e+00,
        -2.8978e-01, -1.2357e+00, -3.9823e+00, -1.8527e+00, -2.1564e+00,
        -2.0728e-01, -4.5952e+00,  1.3271e+00, -1.7247e+00,  2.1600e+00,
        -1.3423e+00,  2.1043e+00, -3.6210e+00,  2.5394e+00, -6.6394e+00,
        -2.0691e+00, -9.4389e-01, -8.1239e-01, -6.6980e+00,  1.6718e+00,
        -2.7455e+00, -5.2327e+00,  3.3301e+00, -1.0494e+00,  1.8382e+00,
         1.1879e+00,  7.1531e+00, -3.4663e+00, -5.6718e+00, -9.8421e-01,
        -9.0064e+00,  3.3539e+00, -1.9148e+00,  2.0225e+00, -6.2418e+00,
        -4.4303e-01,  1.3803e+00,  3.9536e-01,  9.2532e-01, -9.2413e-01,
        -2.0344e+00,  9.0041e-01,  3.4059e+00, -3.6552e+00,  4.5468e-01,
         1.1735e+00,  6.1559e+00,  2.3163e-01, -6.0

In [8]:
# Embed a full sentence: pass the raw string and let the pipeline compose
# the model's representation with the 'inf' function.
output = pipe(
    'Esto es una frase completa',
    comp_fun='inf',
)

In [9]:
# Sentence embedding returned by the pipeline with comp_fun='inf'.
torch.tensor(output)

tensor([ 3.9394e-03, -9.5840e-01, -3.0607e-01,  1.7099e-01,  7.9895e-01,
         7.6236e-01,  2.2116e-01,  2.0945e+00,  2.2012e-01, -5.9956e-01,
         1.2060e-01, -2.5963e-01, -5.0639e-01,  7.2868e-01, -7.2644e-01,
         1.5829e+00,  1.2074e+00, -4.5237e-01, -2.7408e-01,  3.2341e-03,
         6.0819e-01, -8.1564e-01, -4.0112e-01,  3.0185e-01, -6.9799e-01,
        -1.5917e+00, -3.4921e-01,  1.8520e-01, -1.0317e+00,  1.6200e-02,
        -1.8156e-01, -3.7866e-02, -8.5027e-01,  2.2983e-01,  6.8841e-01,
         9.5154e-01,  5.2237e-01, -2.0291e-01, -4.9816e-01,  7.1809e-02,
        -7.4031e-01,  3.4859e-02, -5.9720e-01,  3.8922e-01,  2.7011e-01,
         7.0095e-01,  1.6967e+00, -1.2597e+00,  2.7921e-01, -3.3780e-02,
        -1.8919e+00,  1.9514e+00, -1.0660e+00,  2.5329e-01, -8.4682e-01,
         1.4362e-01,  1.5951e-01, -8.6518e-01, -1.9013e-02,  3.7576e-02,
        -3.5827e-01,  2.8558e-01,  7.1183e-01, -3.5001e-01, -4.4252e-01,
        -7.7150e-01,  1.0382e+00, -3.7554e-01, -1.6

In [None]:
# Example with OpenAI Ada-2 embeddings.
# Make sure to create a .env file in the root directory with your API key:
#   - Add: OPENAI_API_KEY = "your api key"
# NOTE(review): this cell was saved without an execution count (In [None]),
# so the tensor displayed by the following cell may not come from this exact
# code -- re-run both cells to refresh it.

import modules.openai_embeddings as oai 

output = oai.openai_embed('text')

In [10]:
# torch is already imported in the first cell; repeating it is harmless and
# keeps this cell runnable on its own.
import torch
# Wrap the embedding returned by `oai.openai_embed` in a tensor for display.
torch.tensor(output)

tensor([[-0.0099, -0.0152,  0.0086,  ..., -0.0212, -0.0083, -0.0408]])