In [2]:
import csv

import numpy as np
# NOTE(review): `nps` looks like a typo for `np`; kept in case other code relies on it.
import numpy as nps
import pandas as pd
import sklearn
import tensorflow as tf
import tensorflow_datasets as tfds
from datasets import load_dataset
from numpy import random
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.layers import Layer
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer,create_optimizer,TFAutoModel

In [3]:
# Load the query / product / label rows from the local CSV file with the
# Hugging Face `datasets` CSV loader.
dataset = load_dataset('csv', data_files='data.csv')

Found cached dataset csv (C:/Users/Admin/.cache/huggingface/datasets/csv/default-813752fff6601379/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Inspect the loaded DatasetDict: available splits, column names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'product', 'label'],
        num_rows: 3000
    })
})

In [5]:
# Peek at the first training row to see the raw text fields and the string label.
dataset['train'][0]

{'query': '# 2 pencils not sharpened',
 'product': 'Ticonderoga Beginner Pencils, Wood-Cased #2 HB Soft, With Eraser, Yellow, 12-Pack (13308)',
 'label': 'exact'}

In [6]:
# setting label to numbers

def get_label(label):
    """Translate a relevance label string into its numeric training target.

    'exact' -> 1.0, 'substitute' -> 0.7, 'complement' -> 0.5.
    Any other value (including None or differently-cased text) yields 0.0.
    """
    graded_targets = (
        ('exact', 1.0),
        ('substitute', 0.7),
        ('complement', 0.5),
    )
    # Compare with `==` in declaration order so matching follows the same
    # equality semantics as an if/elif chain.
    for name, target in graded_targets:
        if label == name:
            return target
    return 0.0

In [7]:
# Checkpoint id used for the tokenizer here and for the encoder loaded further down,
# so both come from the same pre-trained model.
model_id="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
# Goal: fine-tune the pre-trained encoder on this data. Each example supplies a query,
# a product title and a relevance label, so preprocessing tokenizes the query and the
# product title separately and converts the label to a number.


# Rows per training batch (passed to `to_tf_dataset`).
BATCH_SIZE=128
# Every text is padded or truncated to exactly this many tokens.
MAX_LENGTH=64

def preprocess(dataset):
    """Tokenize one example's query and product title and encode its label.

    Meant to be applied row by row (e.g. through `Dataset.map`), so `dataset`
    is a single example mapping with 'query', 'product' and 'label' keys.
    The example is updated in place and returned.

    For each field in ('query', 'product') this adds `input_ids_<field>`,
    `token_type_ids_<field>` and `attention_mask_<field>`. Each value is the
    tokenizer output wrapped in a one-element list, i.e. shape
    (1, MAX_LENGTH); `train_step` relies on that extra axis when it slices
    `[:, 0, :]`. 'label' is replaced by its numeric score from `get_label`.
    """
    # A missing product title falls back to the query text so the tokenizer
    # always receives a string. `is None` is the correct identity test for a
    # missing value (`== None` can be overridden by the operand's __eq__).
    if dataset['product'] is None:
        dataset['product']=dataset['query']

    for field in ('query','product'):
        tokenized_output=tokenizer(dataset[field],max_length=MAX_LENGTH,padding='max_length',truncation=True)
        for key in ('input_ids','token_type_ids','attention_mask'):
            # Keep the one-element outer list: downstream code indexes it away.
            dataset[f'{key}_{field}']=[tokenized_output[key]]

    dataset['label']=get_label(dataset['label'])
    return dataset

In [11]:
# Run `preprocess` over every row, adding the token columns and replacing the
# string labels with numeric scores.
prep_dataset=dataset.map(preprocess)

Loading cached processed dataset at C:\Users\Admin\.cache\huggingface\datasets\csv\default-813752fff6601379\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-0e7629e05b7100b2.arrow


In [12]:
# Spot-check one processed row; each token column carries an extra length-1
# nesting level around the token ids.
prep_dataset['train'][21:22]

{'query': ['# 2 pencils not sharpened'],
 'product': ['BIC Evolution Cased Pencil, #2 Lead, Gray Barrel, 24-Count (PGEBP241-BLK)'],
 'label': [0.7],
 'input_ids_query': [[[0,
    468,
    116,
    5551,
    13003,
    7,
    959,
    189173,
    33,
    297,
    2,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1]]],
 'token_type_ids_query': [[[0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,


In [13]:
# Convert the processed train split into a shuffled, batched tf.data.Dataset whose
# elements are dicts holding the six token columns plus the numeric label.
tf_dataset = prep_dataset["train"].to_tf_dataset(
    columns=['input_ids_query', 'token_type_ids_query', 'attention_mask_query','input_ids_product', 'token_type_ids_product', 'attention_mask_product', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [14]:
# Pull a single batch to eyeball the tensor shapes and dtypes.
for first_batch in tf_dataset.take(1):
    print(first_batch)

{'label': <tf.Tensor: shape=(128,), dtype=float32, numpy=
array([0. , 0.7, 0. , 0. , 0.7, 0.7, 0.7, 0.7, 0. , 1. , 0.7, 0.7, 0. ,
       0.5, 0.7, 0. , 0.5, 0. , 0. , 1. , 0. , 1. , 0.7, 0. , 1. , 0. ,
       0.7, 0. , 0. , 0.7, 0.7, 0. , 0.7, 0. , 0.7, 0.5, 0. , 0.7, 1. ,
       1. , 0.7, 1. , 1. , 0. , 0. , 0.7, 0. , 1. , 1. , 0.7, 1. , 0. ,
       0. , 1. , 0. , 0. , 0.7, 1. , 0. , 0. , 1. , 0.7, 0. , 1. , 0.7,
       0. , 0.7, 1. , 0. , 1. , 1. , 1. , 0.7, 1. , 0.7, 0.7, 1. , 1. ,
       0. , 1. , 0.7, 0. , 0. , 0.7, 0. , 0.7, 0. , 0. , 0. , 1. , 0.7,
       1. , 0. , 0. , 0. , 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0. , 0.7, 0. ,
       1. , 1. , 1. , 0. , 1. , 1. , 0. , 0. , 0.7, 1. , 0.7, 0.7, 0. ,
       0.7, 0. , 0.7, 1. , 0. , 1. , 0.7, 0. , 0. , 0. , 0. ],
      dtype=float32)>, 'input_ids_query': <tf.Tensor: shape=(128, 1, 64), dtype=int64, numpy=
array([[[    0,  4880,   166, ...,     1,     1,     1]],

       [[    0, 81730,  2276, ...,     1,     1,     1]],

       [[    0,   9

In [15]:
# modelling: load the pre-trained encoder for the same checkpoint as the tokenizer
# and print its layer / parameter summary.
model = TFAutoModel.from_pretrained(model_id)
model.summary()

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  117653760 
                                                                 
Total params: 117653760 (448.81 MB)
Trainable params: 117653760 (448.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# Override train_step because each example has two inputs: the query and the product are
# encoded by the same model, mean-pooled into sentence embeddings, and combined to predict the label.

class SentenceTransformer(tf.keras.Model):
    """Bi-encoder fine-tuning wrapper with a custom training step.

    The same encoder `model` embeds both the query and the product. Each
    side's token embeddings are mean-pooled, the two sentence vectors are
    combined as (u, v, |u - v|), and a single sigmoid unit maps that to a
    score which `loss_fn` compares with the label.
    """

    def __init__(self,model):
        """Store the encoder to fine-tune and create the 1-unit sigmoid head."""
        super(SentenceTransformer,self).__init__()
        self.model=model
        self.dense=Dense(1,activation='sigmoid')

    def compile(self,optimizer,loss_fn):
        """Attach the optimizer and loss used by `train_step`.

        Args:
            optimizer: optimizer whose `apply_gradients` is called each step.
            loss_fn: callable invoked as `loss_fn(labels, predictions)`.
        """
        super(SentenceTransformer,self).compile()
        self.optimizer=optimizer
        self.loss_fn=loss_fn
        # Running mean of the per-step loss; this is what `fit` reports.
        self.loss_metric=tf.keras.metrics.Mean(name='loss')

    @property
    def metrics(self):
        """Metrics tracked by this model (only the running loss)."""
        return [self.loss_metric]

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over axis 1, ignoring masked-out tokens.

        Args:
            model_output: encoder output; element 0 is used as the per-token
                embeddings (assumed (batch, seq, dim) - TODO confirm).
            attention_mask: per-token mask, expanded on a trailing axis and
                broadcast to the embeddings' shape.

        Returns:
            Masked sum of the embeddings divided by the mask sum, with the
            denominator clipped to at least 1e-9 to avoid division by zero.
        """
        token_embeddings = model_output[0]

        # expand to get attention mask the same shape as embeddings
        input_mask_expanded = tf.cast(
            tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
            tf.float32)
        # padded tokens get weight 0 by multiplying with the attention mask
        masked_sum = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1)
        mask_total = tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)
        return masked_sum/mask_total

    def train_step(self,train_data):
        """Run one optimisation step on a batch.

        Args:
            train_data: dict of batched tensors with keys `input_ids_*`,
                `token_type_ids_*` and `attention_mask_*` for `query` and
                `product`, plus `label`. The token tensors are sliced with
                `[:, 0, :]` to drop their length-1 middle axis.

        Returns:
            dict with the running mean of the loss under key 'loss'.
        """
        query={'input_ids':train_data['input_ids_query'][:,0,:],
           'token_type_ids':train_data['token_type_ids_query'][:,0,:],
           'attention_mask':train_data['attention_mask_query'][:,0,:]}

        product={'input_ids':train_data['input_ids_product'][:,0,:],
             'token_type_ids':train_data['token_type_ids_product'][:,0,:],
             'attention_mask':train_data['attention_mask_product'][:,0,:]}

        # Give the labels the same (batch, 1) layout as the sigmoid output so
        # the loss never depends on implicit squeezing or broadcasting.
        labels=tf.reshape(train_data['label'],(-1,1))

        with tf.GradientTape() as recorder:
            # training=True runs the encoder in training mode (e.g. dropout
            # active) while fine-tuning instead of its inference default.
            query_predictions=self.model(query,training=True)
            pred_query=self.mean_pooling(query_predictions,query['attention_mask'])

            product_predictions=self.model(product,training=True)
            pred_product=self.mean_pooling(product_predictions,product['attention_mask'])

            # u,v, |u-v|
            pred_concat=tf.concat([pred_query,pred_product,tf.abs(pred_query-pred_product)],axis=-1)

            predictions=self.dense(pred_concat)
            loss=self.loss_fn(labels,predictions)

        # Differentiate with respect to the encoder AND the dense head. Using
        # only the encoder's weights would leave the head at its random
        # initialisation for the whole run.
        trainable_weights=self.model.trainable_weights+self.dense.trainable_weights
        partial_derivatives = recorder.gradient(loss,trainable_weights)
        self.optimizer.apply_gradients(zip(partial_derivatives, trainable_weights))

        self.loss_metric.update_state(loss)
        return {'loss':self.loss_metric.result(),}

In [17]:
# Print the encoder summary again before wrapping it for fine-tuning.
model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  117653760 
                                                                 
Total params: 117653760 (448.81 MB)
Trainable params: 117653760 (448.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Wrap the encoder and configure training: Adam at a 2e-5 learning rate with a
# binary cross-entropy loss.
stransformer=SentenceTransformer(model)
stransformer.compile(optimizer=Adam(learning_rate=2e-5),loss_fn=BinaryCrossentropy())

In [19]:
# Fine-tune for EPOCHS passes over the batched dataset; the object returned by
# `fit` is kept in `history`.
EPOCHS=2
history=stransformer.fit(tf_dataset,epochs=EPOCHS,)

Epoch 1/2
Epoch 2/2


In [63]:
# TODO: `model_path` is not defined in the cells shown here; set it before enabling this line.
# stransformer.model.save_weights(model_path)

In [20]:
# create embeddings for the product titles and store
# Step 1: read the product catalogue CSV into a DataFrame.
filepath_catalogue='product_catalogue-v0.3.csv'
df_catalogue=pd.read_csv(filepath_catalogue)

In [21]:
# Keep only the first 3000 catalogue rows and confirm the resulting shape.
df_catalogue = df_catalogue.iloc[:3000]
df_catalogue.shape

(3000, 7)

In [22]:
# Look at one example product title.
df_catalogue['product_title'][10]

'Wood-Cased #2 HB Pencils, Shuttle Art 600 Pack Sharpened Yellow Pencils with Erasers, Bulk Pack Graphite Pencils for School and Teacher Supplies, Writhing, Drawing and Sketching'

In [23]:
# Collect every product title as a plain Python string. Iterating the column directly
# (instead of looking each row up by the integer label 0..n-1) does not depend on the
# frame's index, so it keeps working if rows are ever filtered or re-ordered.
# str() also turns a missing title (NaN) into the text 'nan' so the tokenizer gets a string.
product_titles=[str(title) for title in df_catalogue['product_title']]
print(product_titles[:2])

['Amazon Basics Woodcased #2 Pencils, Unsharpened, HB Lead - Box of 144, Bulk Box', 'BAZIC Pencil #2 HB Pencils, Latex Free Eraser, Wood Free Yellow Unsharpened Pencils for Exam School Office (12/Pack), 1-Pack']


In [24]:
# Accumulator for the per-batch embedding tensors produced by the inference loop.
embeddings=[]

In [25]:
# Number of titles encoded per forward pass at inference time.
INFERENCE_BATCH_SIZE=640
# Floor division: this counts only complete batches, so a partial final batch
# is not included in the number shown.
len(product_titles)//INFERENCE_BATCH_SIZE

4

In [27]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over axis 1, ignoring masked-out tokens.

    Args:
        model_output: encoder output; element 0 is used as the per-token
            embeddings (assumed (batch, seq, dim) - TODO confirm).
        attention_mask: per-token mask; it is given a trailing axis and
            broadcast to the embeddings' shape before use.

    Returns:
        The masked sum of the embeddings divided by the mask sum, where the
        denominator is clipped to at least 1e-9 to avoid division by zero.
    """
    token_embeddings = model_output[0]

    # Bring the mask to the embeddings' shape and to float32 so it can be
    # used as a multiplicative weight.
    mask = tf.expand_dims(attention_mask, -1)
    mask = tf.broadcast_to(mask, tf.shape(token_embeddings))
    mask = tf.cast(mask, tf.float32)

    # Masked-out (padding) positions contribute 0 to the numerator.
    weighted_sum = tf.math.reduce_sum(token_embeddings * mask, axis=1)
    mask_total = tf.math.reduce_sum(mask, axis=1)
    return weighted_sum / tf.clip_by_value(mask_total, 1e-9, tf.float32.max)

In [28]:
# Encode the catalogue in batches of INFERENCE_BATCH_SIZE titles.
# Stepping through start offsets (instead of counting only the complete batches with
# len // batch_size) also encodes the final, shorter batch, so no titles are skipped.
# The last tensor may therefore have fewer rows than the others: join the list with
# np.concatenate(..., axis=0) rather than stacking it into one 3-D array.
# Start from an empty list so re-running this cell does not append duplicates.
embeddings=[]
for i,start in enumerate(range(0,len(product_titles),INFERENCE_BATCH_SIZE)):
    tokenized_output=tokenizer(
      product_titles[start:start+INFERENCE_BATCH_SIZE],max_length=MAX_LENGTH,padding='max_length',truncation=True,return_tensors="tf")
    model_output=model(tokenized_output)
    embedding=mean_pooling(model_output,tokenized_output['attention_mask'])
    embeddings.append(embedding)
    # Light progress indicator: prints the batch index every 100 batches.
    if i%100==0:
        print(i)

0


In [29]:
# Display the list of per-batch embedding tensors (one row per title).
embeddings

[<tf.Tensor: shape=(640, 384), dtype=float32, numpy=
 array([[-0.29282224, -0.07676274, -0.3253273 , ...,  0.26357576,
         -0.10390438,  0.2429624 ],
        [-0.4563171 ,  0.22972861, -0.13598211, ...,  0.35998857,
          0.04396171,  0.2383835 ],
        [-0.06812565, -0.02659825, -0.0476185 , ...,  0.4345699 ,
         -0.03786996,  0.18956815],
        ...,
        [-0.37099892,  0.47778252,  0.36435744, ..., -0.18578033,
         -0.16245577, -0.17742285],
        [-0.3159326 ,  0.24955328,  0.39163116, ..., -0.17924504,
         -0.15031905, -0.15297084],
        [-0.35707855,  0.2573044 ,  0.26153773, ..., -0.39928892,
         -0.24573238,  0.09266718]], dtype=float32)>,
 <tf.Tensor: shape=(640, 384), dtype=float32, numpy=
 array([[-0.11752909,  0.34652114,  0.16854785, ..., -0.22907712,
         -0.1429412 ,  0.02594221],
        [ 0.15281056,  0.29636747,  0.1012983 , ..., -0.21139735,
         -0.16036694, -0.20210999],
        [-0.29285634,  0.06421579,  0.2307576 ,

In [75]:
# save the embeddings and titles (uncomment to regenerate the .npz files loaded below)
#np.savez_compressed('embeddings.npz', embeddings)
#np.savez_compressed('product_titles.npz',product_titles)

In [31]:
# use embeddings
# Read the saved embeddings back as a NumPy array. The context manager closes the .npz
# archive once the array has been copied out; 'arr_0' is the key this code reads, the
# default name NumPy gives an array saved positionally.
with np.load('embeddings.npz') as loaded_embedding:
    loaded_embedding_array=np.array(loaded_embedding['arr_0'])

In [32]:
# Read the saved product titles; the context manager closes the .npz archive after
# the array has been copied out.
with np.load('product_titles.npz') as loaded_titles:
    loaded_titles_array=np.array(loaded_titles['arr_0'])

In [33]:
# Check the layout of the loaded embeddings before flattening the batch axis.
loaded_embedding_array.shape

(4, 640, 384)

In [34]:
# Flatten the stacked batches into one (n_rows, dim) matrix.
# Using the last axis (shape[-1]) instead of shape[2] keeps this cell re-runnable: on a
# second run the array is already 2-D and the reshape is a no-op rather than an IndexError.
loaded_embedding_array=loaded_embedding_array.reshape(-1,loaded_embedding_array.shape[-1])
print(loaded_embedding_array.shape)

(2560, 384)


In [35]:
#tokenizer = AutoTokenizer.from_pretrained(model_id)

In [36]:
# simulate a single user input: tokenize the query the same way as the catalogue titles
inputs = tokenizer(["Keyboard"],max_length=MAX_LENGTH,padding='max_length',truncation=True,return_tensors="tf")

# NOTE: despite its name, `logits` holds the encoder's output object; mean_pooling
# reads its first element as the per-token embeddings.
logits = model(**inputs)
out_embedding=mean_pooling(logits,inputs['attention_mask'])
print(out_embedding.shape)

(1, 384)


In [37]:
# cosine similarity, numerator: dot product of every catalogue embedding (u)
# with the query embedding (v)
u_dot_v=loaded_embedding_array@np.array(out_embedding).T
print(u_dot_v.shape)

(2560, 1)


In [38]:
# cosine similarity, denominator part 1: Euclidean norm of each catalogue embedding (u)
u_magnitude=np.sqrt(np.square(loaded_embedding_array).sum(axis=-1))
print(u_magnitude.shape)
print(u_magnitude)

(2560,)
[4.968757  4.7849097 4.9656568 ... 4.640017  4.803453  4.5058026]


In [39]:
# cosine similarity - norm of b (the query embedding)
v_magnitude=np.sqrt(np.sum(out_embedding*out_embedding,axis=-1))
print(v_magnitude.shape)
print(v_magnitude)

(1,)
[6.89353]


In [40]:
# cosine similarity = (u . v) / (|u| * |v|)
# The transpose turns the dot products into a single row so they line up element-wise
# with the vector of catalogue norms; v_magnitude holds the single query norm.
cosine_similarity=u_dot_v.T/(u_magnitude*v_magnitude)
print(cosine_similarity)

[[0.22148265 0.25732487 0.21718223 ... 0.10515758 0.08786255 0.10089629]]


In [41]:
# sorting to get the catalogue row indices in ascending order of similarity
# (np.argsort sorts ascending, so the best matches are at the end)
sorted_indices=np.argsort(cosine_similarity,axis=-1)
print(sorted_indices)

[[1880 2259  356 ... 1625 1986  107]]


In [44]:
# Walk the sorted indices from the end (highest similarity first) and print the
# titles of the 3 best-matching products.
for rank in range(3):
    print(rank,loaded_titles_array[sorted_indices[:,sorted_indices.shape[1]-rank-1]])

0 ['CHOORO Piano Keyboard Pendant Keychain Piano Zipper Pull Music Jewelry Gift for Pianist/Piano Teacher/Music Lovers (Necklace)']
1 ['Wise 8WD12 Aluminum Offset Piano Hinge, 11"']
2 ['M SANMERSEN Kids Piano Mat, 39.5" X 14" Musical Mat Keyboard Music Mat with 8 Instrument Sounds Touch Play Dancing Mat Gift Toys for Boys Girls']
