In [18]:
import sqlite3
import re
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from concurrent.futures import ThreadPoolExecutor
from threading import Lock


# Initialize the sentence-transformer model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

#Delete the database if it exists
if os.path.exists('embeddings.db'):
    os.remove('embeddings.db')

# Connect to the SQLite database
conn = sqlite3.connect('embeddings.db')
c = conn.cursor()

# Create table if not exists
c.execute('''CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY, project_id TEXT, sentence TEXT, embedding BLOB)''')


def tokenize_sentences(text):
    return re.split(r'(?<=[^A-Z].[.?;!]) +(?=[A-Z])|(?<=;;) *|;', text)


def compute_tfidf_weighted_embeddings(sentences, model):
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    tfidf_word_index = vectorizer.vocabulary_

    sentence_embeddings = []
    for row in tfidf_matrix:
        words = [word for word in tfidf_word_index if row[0, tfidf_word_index[word]] > 0]
        word_weights = [row[0, tfidf_word_index[word]] for word in words]
        word_embeddings = model.encode(words)
        
        if sum(word_weights) > 0:
            weighted_embedding = np.average(word_embeddings, axis=0, weights=word_weights)
        elif word_embeddings.size > 0:
            weighted_embedding = np.mean(word_embeddings, axis=0)
        else:
            weighted_embedding = np.zeros(model.get_sentence_embedding_dimension())
        
        sentence_embeddings.append(weighted_embedding)

    return sentence_embeddings



def process_project(project, annoy_index):
    global counter

    print("Processing project", project["title"])
    project["full_text"] = project["title"] + ";" + project["keywords"] + ";" + project["abstract"]

    sentences = tokenize_sentences(project["full_text"])
    sentence_embeddings = compute_tfidf_weighted_embeddings(sentences, model)

    with sqlite3.connect('embeddings.db') as conn:
        c = conn.cursor()
        for sentence, embedding in zip(sentences, sentence_embeddings):
            with counter_lock:
                idx = counter
                counter += 1
            c.execute("INSERT INTO embeddings (id, project_id, sentence, embedding) VALUES (?, ?, ?, ?)", (idx, ",".join(project['ids']), sentence, embedding.tobytes()))
            annoy_index.add_item(idx, embedding)  # Add the embeddings directly to the Annoy index
        conn.commit()


# Load the projects
with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)

counter = 1
counter_lock = Lock()

# Process texts and save embeddings into the database using 8 threads
with ThreadPoolExecutor(max_workers=8) as executor:
    for _ in executor.map(process_project, projects[:5], [annoy_index] * len(projects[:5])):
        pass

# Build the Annoy index with 10 trees
annoy_index.build(10)

# Save the Annoy index
annoy_index.save('embeddings.ann')





Processing projectProcessing project Myanmar - National Food and Agriculture System Project
 Disclosable Restructuring Paper - Agriculture Cluster Development Project - P145037
Processing project Montenegro - Institutional Development and Agriculture Strengthening Project (MIDAS)
Processing project Kenya - Climate Smart Agriculture Project : Environmental Assessment (Vol. 7) : Environmental and Social Impact Assessment for Sertonje Borehole Sub-project, Mugurin Sub Location, Simotwe Location, Kisanana Ward, Mogotio Sub County, Baringo County
Processing project Kenya - National Climate Smart Agriculture Project : Environmental Assessment (Vol. 2) : Pest Management Plan for Livestock Vaccination Campaign for Management of East Coast Fever, Bomet County


You can't add an item to a loaded index
You can't add an item to a loaded index
You can't add an item to a loaded index
You can't add an item to a loaded index
You can't add an item to a loaded index


Exception: You can't add an item to a loaded index

In [28]:
import re
import sqlite3
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex

# Function to tokenize sentences
def tokenize_sentences(text):
    return re.split(r'(?<=[^A-Z].[.?;!]) +(?=[A-Z])|(?<=;;) *|;', text)

# Function to compute TF-IDF weighted embeddings
def compute_tfidf_weighted_embeddings(sentences, model):
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    tfidf_word_index = vectorizer.vocabulary_

    sentence_embeddings = []
    for row in tfidf_matrix:
        words = [word for word in tfidf_word_index if row[0, tfidf_word_index[word]] > 0]
        word_weights = [row[0, tfidf_word_index[word]] for word in words]
        word_embeddings = model.encode(words)
        
        if sum(word_weights) > 0:
            weighted_embedding = np.average(word_embeddings, axis=0, weights=word_weights)
        elif word_embeddings.size > 0:
            weighted_embedding = np.mean(word_embeddings, axis=0)
        else:
            weighted_embedding = np.zeros(model.get_sentence_embedding_dimension())
        
        sentence_embeddings.append(weighted_embedding)

    return sentence_embeddings

# Function to process a project
def process_project(project, annoy_index):
    global counter

    print("Processing project", project["title"])
    project["full_text"] = project["abstract"]
    #print(project["full_text"])

    sentences = tokenize_sentences(project["full_text"])
    sentence_embeddings = compute_tfidf_weighted_embeddings(sentences, model)

    with sqlite3.connect('embeddings.db') as conn:
        c = conn.cursor()
        for sentence, embedding in zip(sentences, sentence_embeddings):
            with counter_lock:
                idx = counter
                counter += 1
            c.execute("INSERT INTO embeddings (id, project_id, sentence, embedding) VALUES (?, ?, ?, ?)", (idx, ",".join(project['ids']), sentence, embedding.tobytes()))
            annoy_index.add_item(idx, embedding)  # Add the embeddings directly to the Annoy index
        conn.commit()

# Initialize a SentenceTransformer model
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
embedding_dim = model.get_sentence_embedding_dimension()

#Delete the database if it exists
if os.path.exists('embeddings.db'):
    os.remove('embeddings.db')
    
# Create the SQLite database
with sqlite3.connect('embeddings.db') as conn:
    c = conn.cursor()
    c.execute('CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY, project_id TEXT, sentence TEXT, embedding BLOB)')
    conn.commit()

# Initialize the Annoy index
annoy_index = AnnoyIndex(embedding_dim, 'angular')

# Load the projects
with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)
projects=projects[:5]

# Initialize counter and lock
counter = 1
counter_lock = Lock()

# Process texts and save embeddings into the database using 8 threads
with ThreadPoolExecutor(max_workers=8) as executor:
    for _ in executor.map(process_project, projects, [annoy_index] * len(projects)):
        pass

# Build the Annoy index with 10 trees
annoy_index.build(10)

# Save the Annoy index
annoy_index.save('embeddings.ann')

Processing projectProcessing project Myanmar - National Food and Agriculture System Project
 Disclosable Restructuring Paper - Agriculture Cluster Development Project - P145037
Processing project Montenegro - Institutional Development and Agriculture Strengthening Project (MIDAS)
Processing project Kenya - Climate Smart Agriculture Project : Environmental Assessment (Vol. 7) : Environmental and Social Impact Assessment for Sertonje Borehole Sub-project, Mugurin Sub Location, Simotwe Location, Kisanana Ward, Mogotio Sub County, Baringo County
Processing project Kenya - National Climate Smart Agriculture Project : Environmental Assessment (Vol. 2) : Pest Management Plan for Livestock Vaccination Campaign for Management of East Coast Fever, Bomet County


True

In [29]:

# Load the Annoy index
annoy_index = AnnoyIndex(embedding_dim, 'angular')
annoy_index.load('embeddings.ann')

# Perform a search using the Annoy index
query = "Remote sensing for fertilizer management"
query_embedding = model.encode(query)

n_nearest_neighbors = 5
nearest_neighbors = annoy_index.get_nns_by_vector(query_embedding, n_nearest_neighbors)

# Print the search results
with sqlite3.connect('embeddings.db') as conn:
    c = conn.cursor()
    for neighbor_id in nearest_neighbors:
        c.execute("SELECT project_id, sentence FROM embeddings WHERE id=?", (neighbor_id,))
        project_id, sentence = c.fetchone()
        print(f"Project ID: {project_id}\nSentence: {sentence}\n")

Project ID: P164448
Sentence:  (c) Strengthening Agriculture Extension Services through Digital Technologies and (d) Improving Irrigation and Drainage Infrastructure. 2.

Project ID: P145037
Sentence: The objective of the Agriculture Cluster Development Project is to raise on-farm productivity, production, and marketable volumes of selected agricultural commodities in specified geographic clusters.

Project ID: P164448
Sentence: Agriculture Productivity Enhancement and Diversification component will focus on: (a) Strengthening Agricultural Research and Development System

Project ID: P164448
Sentence: Project Management, Coordination, and Monitoring and Evaluation component will support effective project management systems for financial management

Project ID: P154784
Sentence:  dust from excavations and earth moving vehicles as well as materials delivery



In [21]:
import sqlite3
import numpy as np
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex

def search(query, model, annoy_index, top_k=5):
    embedding = model.encode(query)
    nearest_ids = annoy_index.get_nns_by_vector(embedding, top_k)
    
    with sqlite3.connect('embeddings.db') as conn:
        c = conn.cursor()
        c.execute("SELECT project_id, sentence FROM embeddings WHERE id IN ({})".format(','.join(map(str, nearest_ids))))
        results = c.fetchall()

    return results

# Initialize the sentence-transformer model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Load the Annoy index
embedding_dim = model.get_sentence_embedding_dimension()
annoy_index = AnnoyIndex(embedding_dim, 'angular')


# Perform a search
query = "Remote sensing for fertilizer management"
results = search(query, model, annoy_index, top_k=5)

# Print the search results
for project_id, sentence in results:
    print(f"Project ID: {project_id}\nSentence: {sentence}\n")
