In [7]:
import re
import sqlite3
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import os, json
import requests

# Sentence boundary pattern, compiled once. It splits on:
#   * one or more spaces that follow a '.', '?', ';' or '!' (itself preceded
#     by any character and, before that, a character that is not an
#     uppercase ASCII letter) and that are followed by an uppercase letter;
#   * optional spaces directly after ';;';
#   * any single ';'.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[^A-Z].[.?;!]) +(?=[A-Z])|(?<=;;) *|;')


def tokenize_sentences(text):
    """Split ``text`` into sentence-like chunks.

    Args:
        text: The string to split.

    Returns:
        The list of pieces produced by splitting ``text`` on the sentence
        boundary pattern. Pieces may be empty strings.
    """
    return _SENTENCE_BOUNDARY.split(text)

# Function to compute TF-IDF weighted embeddings
def compute_tfidf_weighted_embeddings(sentences, model):
    """Embed each sentence as the TF-IDF weighted mean of its word embeddings.

    A TF-IDF model is fitted on ``sentences``. Every vocabulary word is
    encoded once with ``model`` and each sentence embedding is the average of
    the embeddings of the words it contains, weighted by their TF-IDF scores.

    Args:
        sentences: Sequence of strings, one entry per sentence.
        model: Object providing ``encode(list_of_str)`` (returning one vector
            per input string) and ``get_sentence_embedding_dimension()``.

    Returns:
        A list with one NumPy vector per input sentence. A sentence that
        contains no token gets a zero vector of the model's dimension.
    """
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences).tocsr()
    except ValueError as err:
        # scikit-learn refuses to fit when no sentence yields a token.
        # Any other ValueError is a genuine problem and is re-raised.
        if "empty vocabulary" not in str(err):
            raise
        dim = model.get_sentence_embedding_dimension()
        return [np.zeros(dim) for _ in sentences]

    # Column index -> word, so each row's words come straight from the sparse
    # structure instead of scanning the whole vocabulary for every row.
    vocabulary = vectorizer.vocabulary_
    words_by_column = [None] * len(vocabulary)
    for word, column in vocabulary.items():
        words_by_column[column] = word

    # Encode every distinct word a single time and reuse it across sentences.
    vocab_embeddings = np.asarray(model.encode(words_by_column))

    total_rows = tfidf_matrix.shape[0]
    indptr = tfidf_matrix.indptr
    sentence_embeddings = []
    for row_idx in range(total_rows):
        start, end = indptr[row_idx], indptr[row_idx + 1]
        columns = tfidf_matrix.indices[start:end]
        weights = tfidf_matrix.data[start:end]

        # Keep only strictly positive weights.
        positive = weights > 0
        columns, weights = columns[positive], weights[positive]

        if columns.size:
            weighted_embedding = np.average(
                vocab_embeddings[columns], axis=0, weights=weights
            )
        else:
            # Sentence without any token: fall back to a zero vector.
            weighted_embedding = np.zeros(model.get_sentence_embedding_dimension())

        sentence_embeddings.append(weighted_embedding)
        # Print progress every 500 sentences
        if len(sentence_embeddings) % 500 == 0:
            print(f"Processed row {len(sentence_embeddings)} of {total_rows}")

    return sentence_embeddings

def clean_text(text):
    """Strip quote/bracket characters from ``text`` and normalize whitespace.

    Removes every ``'``, ``"``, ``[`` and ``]`` character, then collapses each
    run of whitespace (including newlines and tabs) into a single space and
    trims leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # One pass deletes all unwanted characters.
    without_marks = text.translate(str.maketrans("", "", "'\"[]"))
    # split() with no argument splits on any whitespace run, so newlines are
    # handled here and no separate newline replacement is needed.
    return " ".join(without_marks.split())

def retrieve_full_text(document, timeout=30):
    """Load a document's text into ``document["full_text"]``.

    The text is read from ``<text_folder>/<ids joined by '-'>.txt`` when that
    file exists. Otherwise it is downloaded from ``document["txturl"]``,
    passed through ``clean_text``, stored in the document and written to that
    file for later runs.

    If the download fails (network error, timeout or non-200 status) a message
    is printed and ``document["full_text"]`` is left unset.

    Args:
        document: Mapping with at least the keys ``"ids"`` (iterable of
            strings) and ``"txturl"``; it is modified in place.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    # Define the local file path
    filename = "-".join(document["ids"])
    local_file_path = os.path.join(text_folder, f"{filename}.txt")

    # Use the cached copy when there is one. The encoding is pinned to UTF-8
    # so that reads and writes agree regardless of the platform default.
    if os.path.isfile(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            document["full_text"] = file.read()
        return

    text_url = document["txturl"]
    try:
        # Without a timeout a stalled server would block the worker forever.
        response = requests.get(text_url, timeout=timeout)
    except requests.RequestException:
        print(f"Failed to download the text from the URL: {text_url}")
        return

    if response.status_code != 200:
        print(f"Failed to download the text from the URL: {text_url}")
        return

    document["full_text"] = clean_text(response.text)

    # Save the full_text to the local file
    with open(local_file_path, "w", encoding="utf-8") as file:
        file.write(document["full_text"])
    return


# Function to process a project
def process_project(project, annoy_index):
    """Embed one project's sentences and store them in SQLite and Annoy.

    The project's text is fetched with ``retrieve_full_text``, split with
    ``tokenize_sentences`` and embedded with
    ``compute_tfidf_weighted_embeddings`` using the module-level ``model``.
    Each sentence receives a unique id taken from the module-level ``counter``
    (guarded by ``counter_lock``); the row is inserted into the ``embeddings``
    table of ``embeddings.db`` and the vector is added to ``annoy_index``
    under the same id.

    Args:
        project: Mapping with the keys ``"ids"`` and ``"title"`` plus whatever
            ``retrieve_full_text`` needs; ``"full_text"`` is filled in here.
        annoy_index: Index object whose ``add_item(id, vector)`` is called for
            every sentence.

    Returns:
        None. A project whose text could not be retrieved is skipped.
    """
    global counter

    retrieve_full_text(project)

    # retrieve_full_text leaves "full_text" unset when the download fails;
    # skip such a project instead of failing with KeyError.
    full_text = project.get("full_text")
    if full_text is None:
        print("Skipping project without text", project["title"])
        return

    sentences = tokenize_sentences(full_text)
    sentence_embeddings = compute_tfidf_weighted_embeddings(sentences, model)
    project_id = ",".join(project['ids'])

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it does not close itself, so close it explicitly.
    conn = sqlite3.connect('embeddings.db')
    try:
        with conn:
            c = conn.cursor()
            for sentence, embedding in zip(sentences, sentence_embeddings):
                # Reserve a globally unique id shared by the DB row and Annoy.
                with counter_lock:
                    idx = counter
                    counter += 1
                c.execute(
                    "INSERT INTO embeddings (id, project_id, sentence, embedding) VALUES (?, ?, ?, ?)",
                    (idx, project_id, sentence, embedding.tobytes()),
                )
                annoy_index.add_item(idx, embedding)  # Add the embeddings directly to the Annoy index
    finally:
        conn.close()
    print("Processed project", project["title"])

# Initialize a SentenceTransformer model
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
embedding_dim = model.get_sentence_embedding_dimension()

# Delete the database if it exists so every run starts from an empty table
try:
    os.remove('embeddings.db')
except FileNotFoundError:
    pass  # Nothing to delete on a first run.

# Create a folder to store text files
text_folder = "text_files"
os.makedirs(text_folder, exist_ok=True)

# Create the SQLite database. The connection's context manager only scopes
# the transaction (commit/rollback), so the connection is closed explicitly.
conn = sqlite3.connect('embeddings.db')
try:
    with conn:
        conn.execute('CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY, project_id TEXT, sentence TEXT, embedding BLOB)')
finally:
    conn.close()

# Initialize the Annoy index
annoy_index = AnnoyIndex(embedding_dim, 'angular')

# Load the projects (read as UTF-8 rather than the platform default encoding)
with open("digital_agriculture_projects.json", "r", encoding="utf-8") as f:
    projects = json.load(f)
# Only the first five projects are processed.
projects = projects[:5]

# Initialize counter and lock: `counter` is the next free sentence id and
# `counter_lock` guards it across worker threads.
counter = 1
counter_lock = Lock()

# Process the projects on a pool of 4 worker threads; every worker writes its
# sentence embeddings to the database and adds them to the shared Annoy index.
with ThreadPoolExecutor(max_workers=4) as executor:
    shared_index_args = [annoy_index] * len(projects)
    results = executor.map(process_project, projects, shared_index_args)
    # Drain the iterator so that an exception raised in a worker is
    # re-raised here; the return values themselves are not used.
    for _ in results:
        pass

# Build the Annoy index with 10 trees
annoy_index.build(10)

# Save the Annoy index
annoy_index.save('embeddings.ann')

Processed project Disclosable Restructuring Paper - Agriculture Cluster Development Project - P145037
Processed row 500 of 2330
Processed row 500 of 1328
Processed row 500 of 602
Processed project Montenegro - Institutional Development and Agriculture Strengthening Project (MIDAS)
Processed row 500 of 1328
Processed row 1000 of 2330
Processed row 1000 of 1328
Processed row 1000 of 1328
Processed project Kenya - Climate Smart Agriculture Project : Environmental Assessment (Vol. 7) : Environmental and Social Impact Assessment for Sertonje Borehole Sub-project, Mugurin Sub Location, Simotwe Location, Kisanana Ward, Mogotio Sub County, Baringo County
Processed row 1500 of 2330
Processed project Kenya - National Climate Smart Agriculture Project : Environmental Assessment (Vol. 2) : Pest Management Plan for Livestock Vaccination Campaign for Management of East Coast Fever, Bomet County
Processed row 2000 of 2330
Processed project Myanmar - National Food and Agriculture System Project


True