In [25]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import os
import pickle
import numpy as np
import json
# Set LOKY_MAX_CPU_COUNT for this process before any model work starts.
# NOTE(review): this variable is presumably read by joblib's loky backend to
# cap the CPU count it detects; nothing in this section uses joblib directly,
# so confirm which dependency needs it.
os.environ["LOKY_MAX_CPU_COUNT"] = "4" 

In [26]:
# Dataset location, expressed relative to the directory the notebook runs in.
datafile_json = 'datafile/data.json'

# Resolve the dataset path against the current working directory.
current_directory = os.getcwd()
datafile_json_path = os.path.join(current_directory, datafile_json)

There is a limit on the number of tokens that the sentence transformer can embed, so longer entries have to be summarized first. This is done with a summarization model from the transformers library. The following function, `summarize_elements`, is in charge of summarizing the elements and returning the sentences for further embedding.

In [27]:
# Summarization pipeline backed by the facebook/bart-large-cnn model. It is
# created once at cell level and used as a global by summarize_elements, both
# for its tokenizer (token counting) and for the summarization itself.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_elements(elements, max_tokens=128):
    """Build one sentence per element, summarizing over-long descriptions.

    Each element's name and description are joined with a space. If the
    joined text fits within ``max_tokens`` it is used unchanged; otherwise
    the description is summarized so that the name plus the summary stays
    within the budget.

    Token counts come from the summarizer's own tokenizer.
    NOTE(review): the embedding model may tokenize differently, so the budget
    is an approximation of what that model will see — confirm it is adequate.

    Args:
        elements: Iterable of dict-like objects. The ``'name'`` and
            ``'description'`` keys are read with ``.get``; missing or
            ``None`` values are treated as empty strings.
        max_tokens: Token budget for each resulting sentence. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A list of strings, one per element, in input order.
    """

    def count_tokens(text):
        # Looked up lazily so an empty input never touches the summarizer.
        return len(summarizer.tokenizer(text)['input_ids'])

    summarized_elements = []

    for element in elements:
        # `or ''` also covers JSON nulls, which would otherwise be rendered
        # as the literal text "None" in the f-string below.
        name = element.get('name') or ''
        description = element.get('description') or ''
        combined_text = f"{name} {description}"

        if count_tokens(combined_text) <= max_tokens:
            summarized_elements.append(combined_text)
            continue

        remaining_length = max_tokens - count_tokens(name)

        # If the name alone uses up the budget (or there is nothing to
        # summarize), a summarizer call would get a non-positive max_length
        # or empty input; fall back to the name only.
        if remaining_length < 1 or not description.strip():
            summarized_elements.append(name)
            continue

        # truncation=True keeps descriptions longer than the summarizer's
        # input limit from raising instead of being summarized.
        summary = summarizer(
            description,
            max_length=remaining_length,
            min_length=1,
            truncation=True,
        )
        summarized_part = summary[0].get('summary_text', '')
        summarized_elements.append(f"{name} {summarized_part}")

    return summarized_elements

In [29]:
# Output artifacts of this cell. Both must exist for the generation step to
# be skipped on later runs.
embeddings_file = 'embeddings/embeddings.npy'
sentences_file = 'embeddings/sentences.npy'

if os.path.exists(embeddings_file) and os.path.exists(sentences_file):
    print("The embeddings have already been generated. Delete the files if you wish to generate them again.")
else:
    # Create the output directories up front: otherwise a missing folder is
    # only discovered after the expensive summarization and encoding steps.
    for output_path in (embeddings_file, sentences_file):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    with open(datafile_json_path, 'r', encoding='utf-8') as file:
        elements = json.load(file)

    # Shorten each element to a sentence that fits the embedding budget,
    # then embed all sentences in one call.
    sentences = summarize_elements(elements)
    print("Success!")
    embeddings = model.encode(sentences)

    # NOTE(review): this pickles the raw `elements` (not the summarized
    # `sentences`) into a file with a .npy extension. Kept as-is because the
    # code that reads this file is not visible here — confirm it is intended.
    with open(sentences_file, 'wb') as f:
        pickle.dump(elements, f)

    np.save(embeddings_file, embeddings)

The embeddings have already been generated. Delete the files if you wish to generate them again.
