In [3]:
#import openai
#import requests
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import concurrent.futures
from functools import partial


# Load the Sentence Transformer checkpoint used to embed every project.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')


# Read the project records from disk. The encoding is pinned to UTF-8 so the
# file decodes the same way regardless of the platform's default locale.
# NOTE(review): the code below indexes each record by "title", "keywords" and
# "abstract", so this is presumably a JSON array of objects -- confirm.
with open("digital_agriculture_projects.json", "r", encoding="utf-8") as f:
    projects = json.load(f)

# Build the combined text for one document and encode it with the model
def generate_embeddings(doc, model):
    """Encode the title, keywords and abstract of a single document.

    The three fields are joined with ";" into one string, which is stored
    back on the document under "full_text" and then handed to
    ``model.encode``.

    Args:
        doc: Mapping with "title", "keywords" and "abstract" entries. It is
            mutated in place to gain a "full_text" entry.
        model: Object exposing an ``encode`` method that accepts the
            combined text.

    Returns:
        Whatever ``model.encode`` returns for the combined text.
    """
    fields = (doc["title"], doc["keywords"], doc["abstract"])
    combined = ";".join(fields)
    doc["full_text"] = combined
    return model.encode(combined)

# Embed every document using a pool of worker threads
def process_documents(projects, model):
    """Attach an "embeddings" entry to each project and return the projects.

    Each project is passed to ``generate_embeddings`` on a thread pool.
    Results are consumed in input order, stored on the matching project
    under "embeddings", and a progress line is printed after each one.

    Args:
        projects: Sequence of documents accepted by ``generate_embeddings``;
            each element is mutated in place.
        model: Forwarded to ``generate_embeddings`` as its ``model`` argument.

    Returns:
        The same ``projects`` object that was passed in.
    """
    # Bind the model once so the pool only has to supply the document.
    embed_one = partial(generate_embeddings, model=model)
    total = len(projects)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Executor.map yields results in the order of its inputs, so pairing
        # them with the projects by position is safe.
        vectors = pool.map(embed_one, projects)
        for position, (project, vector) in enumerate(zip(projects, vectors), start=1):
            project["embeddings"] = vector
            # "\r" rewrites the same console line instead of scrolling.
            print(f"Processing document {position} of {total}", end="\r")

    return projects

# Compute an embedding for every loaded project.
projects_with_embeddings = process_documents(projects, model)


# Save the documents to a file.
# NOTE(review): the records are presumably dicts, which NumPy can only store
# as a pickled object array; reading the file back would then need
# np.load(..., allow_pickle=True) -- confirm.
output_path = "documents_with_embeddings.npy"
np.save(output_path, projects_with_embeddings, allow_pickle=True)

Processing document 388 of 437