In [1]:
#if needed 
#!set_db.sh

In [2]:
import re
import numpy as np
import psycopg2
from psycopg2.extensions import register_adapter, AsIs
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import os, json
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import datetime


#cross project import
from get_full_text import clean_text,retrieve_full_text 

# Default token budget per chunk; used as the default of chunk_splitter's
# `max_tokens` parameter (tokens are counted with nltk's word_tokenize).
MAX_TOKENS = 150
# Dimension of the VECTOR column declared for the `embeddings` table.
# NOTE(review): presumably matches the output size of the SentenceTransformer
# model loaded further down — confirm if the model is ever changed.
EMBEDDING_SIZE = 768
special_splitter="#!#" #used to always split text into chunks on that token.

def log(text):
    """Print ``text`` to stdout, prefixed with the current local time as HH:MM:SS."""
    timestamp = datetime.datetime.now().strftime('%H:%M:%S')
    print(f"{timestamp} - {text}")

def chunk_splitter(text, max_tokens=MAX_TOKENS,special_splitter=special_splitter):
    """Split ``text`` into chunks of whole sentences within a token budget.

    The text is first cut at every occurrence of ``special_splitter``; a chunk
    never spans two of those segments. Inside a segment, sentences (from
    ``sent_tokenize``) are accumulated while the running ``word_tokenize``
    token count stays at or below ``max_tokens``. The sentence that would
    exceed the budget closes the current chunk and opens the next one, so a
    single sentence longer than ``max_tokens`` is kept whole in its own chunk.

    Returns:
        A list of non-empty chunk strings, sentences joined by single spaces.
    """
    collected = []
    for segment in text.split(special_splitter):
        pending = []          # sentences of the chunk being built
        running_total = 0     # token count of `pending`

        for sent in sent_tokenize(segment.strip()):
            n_tokens = len(word_tokenize(sent))

            if running_total + n_tokens > max_tokens:
                # Budget exceeded: flush what we have (possibly nothing, which
                # yields an empty string that is filtered out below) and start
                # a new chunk with this sentence.
                collected.append(" ".join(pending))
                pending = [sent]
                running_total = n_tokens
                continue

            pending.append(sent)
            running_total += n_tokens

        if pending:
            collected.append(" ".join(pending))
        # Drop empty strings produced by flushing an empty chunk.
        collected = list(filter(None, collected))

    return collected

# Teach psycopg2 how to adapt numpy arrays passed as query parameters.
def adapt_np_array(array):
    """Adapt a numpy array for psycopg2 by wrapping its nested-list form in ``AsIs``.

    NOTE(review): ``AsIs`` inserts the object's string form into the query
    without quoting, so an ndarray parameter would appear as a bare ``[...]``
    — confirm this is valid wherever an ndarray is actually passed.
    """
    as_list = np.array(array).tolist()
    return AsIs(as_list)


register_adapter(np.ndarray, adapt_np_array)

# Database configuration
def load_db_config(env=None):
    """Build the psycopg2 connection keyword arguments.

    Every setting can be overridden through an environment variable so that
    credentials do not have to live in the source file; when a variable is
    absent the historical hard-coded value is used, so behaviour is unchanged
    for existing setups.

    Args:
        env: Mapping to read overrides from. Defaults to ``os.environ``.

    Returns:
        dict with the keys ``dbname``, ``user``, ``password``, ``host`` and
        ``port`` (``port`` is always an ``int``).

    Raises:
        ValueError: if ``WB_S2_DB_PORT`` is set to a non-integer value.
    """
    env = os.environ if env is None else env
    return {
        'dbname': env.get('WB_S2_DB_NAME', 'wb_s2_embeddings'),
        'user': env.get('WB_S2_DB_USER', 's2'),
        'password': env.get('WB_S2_DB_PASSWORD', 'wb@s2'),
        'host': env.get('WB_S2_DB_HOST', 'localhost'),
        'port': int(env.get('WB_S2_DB_PORT', 5432)),
    }


db_config = load_db_config()


def compute_embeddings(sentences, model, project):
    """Embed each sentence as the mean of its per-word embeddings.

    Every sentence is split on whitespace, the resulting words are passed to
    ``model.encode`` in one call, and the word vectors are averaged. When the
    encoder returns an empty array (e.g. for an empty sentence) a zero vector
    of ``model.get_sentence_embedding_dimension()`` entries is used instead.

    Args:
        sentences: Sequence of text chunks to embed.
        model: Encoder exposing ``encode`` and
            ``get_sentence_embedding_dimension``.
        project: Mapping with a ``'title'`` key; only used in progress logs.

    Returns:
        A list with one embedding per input sentence, in input order.
    """
    results = []
    for position, text in enumerate(sentences, start=1):
        token_vectors = model.encode(text.split())

        if token_vectors.size == 0:
            pooled = np.zeros(model.get_sentence_embedding_dimension())
        else:
            pooled = np.mean(token_vectors, axis=0)
        results.append(pooled)

        # Report progress after every 100th sentence.
        if position % 100 == 0:
            log(f"Doing -> Processed row {position} of {len(sentences)} for project {project['title']}")

    return results


# Function to compute TF-IDF weighted embeddings
def compute_tfidf_weighted_embeddings(sentences, model, project):
    """Embed each sentence as the TF-IDF-weighted average of its word embeddings.

    A ``TfidfVectorizer`` is fitted on ``sentences``; for every sentence the
    vocabulary words with a non-zero TF-IDF entry are encoded with
    ``model.encode`` and averaged using the TF-IDF values as weights. If the
    weights sum to zero the plain mean is used, and if there are no word
    embeddings at all a zero vector of
    ``model.get_sentence_embedding_dimension()`` entries is used.

    Args:
        sentences: Sequence of text chunks to embed.
        model: Encoder exposing ``encode`` and
            ``get_sentence_embedding_dimension``.
        project: Mapping with a ``'title'`` key; only used in progress logs.

    Returns:
        A list with one embedding per input sentence, in input order. An empty
        input yields an empty list (fitting a vectorizer on no documents would
        otherwise raise).
    """
    if len(sentences) == 0:
        return []

    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    # Build the index -> word table once; it is identical for every row and
    # rebuilding it per row costs a full pass over the vocabulary each time.
    feature_names = vectorizer.get_feature_names_out()
    total_rows = tfidf_matrix.shape[0]

    sentence_embeddings = []
    for row in tfidf_matrix:
        row = row.tocoo()  # COO exposes the non-zero columns and values directly
        words = [feature_names[idx] for idx in row.col]
        word_weights = row.data
        word_embeddings = model.encode(words)

        if np.sum(word_weights) > 0:
            weighted_embedding = np.average(word_embeddings, axis=0, weights=word_weights)
        elif word_embeddings.size > 0:
            weighted_embedding = np.mean(word_embeddings, axis=0)
        else:
            weighted_embedding = np.zeros(model.get_sentence_embedding_dimension())

        sentence_embeddings.append(weighted_embedding)

        # Print progress every 100 sentences
        if len(sentence_embeddings) % 100 == 0:
            log(f"Doing -> Processed row {len(sentence_embeddings)} of {total_rows} for project {project['title']}")

    return sentence_embeddings

def project_exists(project_id, conn):
    """Return True if the ``embeddings`` table has at least one row for ``project_id``.

    Args:
        project_id: Value compared against the ``project_id`` column.
        conn: Open database connection; its cursor must support the context
            manager protocol (psycopg2 cursors do).

    Returns:
        bool: whether a matching row exists.
    """
    # Use the cursor as a context manager so it is closed as soon as the
    # lookup is done instead of lingering until garbage collection.
    with conn.cursor() as c:
        c.execute("SELECT 1 FROM embeddings WHERE project_id = %s LIMIT 1", (project_id,))
        return c.fetchone() is not None



# Function to process a project
def process_project(thread_id, project):
    """Chunk, embed and store one project in the ``embeddings`` table.

    The project's title, abstract and full text are concatenated (separated by
    ``special_splitter`` so they never share a chunk), cleaned, split into
    chunks and embedded with the module-level ``model``. One row per chunk is
    inserted. Projects that already have rows in the table are skipped.

    Args:
        thread_id: Integer used as the high part of the generated row ids
            (``thread_id * 1000000 + n``). The visible caller passes the
            project's index in the project list, not an OS thread id.
        project: Mutable mapping with at least ``'keywords'``, ``'title'``,
            ``'abstract'`` and ``'ids'``; ``retrieve_full_text`` is expected
            to provide ``'full_text'``. The mapping is modified in place.

    Returns:
        None.
    """
    retrieve_full_text(project)
    project["keywords"] = project["keywords"].replace(";", ". ").replace(",", ". ")
    # NOTE(review): "Fullt text" looks like a typo for "Full text"; it is left
    # untouched because the label is part of the text that gets embedded/stored.
    # Appending the keywords as a further section was tried and gave bad results.
    project["full_text"] = "Title: "    + project["title"] + special_splitter +\
                           "Abstract: " + project["abstract"] + special_splitter +\
                           "Fullt text: " + project["full_text"]
    project["full_text"] = clean_text(project["full_text"])

    project_id = ",".join(project['ids'])

    # psycopg2's `with connection:` only commits/rolls back the transaction;
    # it does not close the connection. Close it explicitly so that processing
    # many projects does not leak one server connection per project.
    conn = psycopg2.connect(**db_config)
    try:
        with conn:
            if project_exists(project_id, conn):
                log(f"Skipping -> Project {project['title']} already exists in the table.")
                return
            chunks = chunk_splitter(project["full_text"])
            log(f"Starting -> {len(chunks)} chunks for project {project['title']}.")
            # compute_tfidf_weighted_embeddings(chunks, model, project) is the
            # alternative embedding strategy.
            sentence_embeddings = compute_embeddings(chunks, model, project)
            with conn.cursor() as c:
                for local_counter, (chunk, embedding) in enumerate(zip(chunks, sentence_embeddings), start=1):
                    # NOTE(review): ids are only unique while a project has fewer
                    # than 1,000,000 chunks and thread_id is unique per project.
                    unique_id = thread_id * 1000000 + local_counter
                    chunk = chunk.replace('\x00', ' ')  # Replace NUL characters with a space
                    c.execute("INSERT INTO embeddings (id, project_id, chunk, embedding) VALUES (%s, %s, %s, %s::VECTOR)", (unique_id, project_id, chunk, embedding.tolist()))
            conn.commit()
            log(f"Done -> Project {project['title']}")
    finally:
        conn.close()


# Load the SentenceTransformer model used (as the global `model`) by
# process_project, and record its embedding dimension.
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
embedding_dim = model.get_sentence_embedding_dimension()

# Remove a local 'embeddings.db' file if one is present.
# NOTE(review): looks like a leftover from a file-based store; everything
# below talks to PostgreSQL through psycopg2 — confirm it is still needed.
if os.path.exists('embeddings.db'):
    os.remove('embeddings.db')

# Name of the folder for text files. Only the name is assigned here; nothing
# in the visible code creates the folder or reads this variable.
text_folder = "text_files"

# Make sure the id sequence and the `embeddings` table exist. The VECTOR
# column is sized with EMBEDDING_SIZE (not `embedding_dim` computed above).
# NOTE(review): per the psycopg2 docs, `with connection:` ends the transaction
# but does not close the connection, so `conn` remains open after this block.
with psycopg2.connect(**db_config) as conn:
    c = conn.cursor()
    # Uncomment the two DROP statements to rebuild the table from scratch.
    #c.execute("DROP TABLE IF EXISTS embeddings;")
    #c.execute("DROP SEQUENCE IF EXISTS embeddings_id_seq;")
    c.execute('CREATE SEQUENCE IF NOT EXISTS embeddings_id_seq;')
    c.execute(f'CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY DEFAULT nextval(\'embeddings_id_seq\'), project_id TEXT, chunk TEXT, embedding VECTOR({EMBEDDING_SIZE}));')
    
    conn.commit()


# Load the projects
with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)


# Counter and lock; neither is referenced by the code visible in this section.
counter = 1
counter_lock = Lock()

# Process every project with a pool of 2 worker threads. Each call receives
# the project's index in `projects` as its first argument (`thread_id`).
# Iterating over the map results makes any exception raised in a worker
# surface here instead of being silently dropped.
with ThreadPoolExecutor(max_workers=2) as executor:
    for i, _ in enumerate(executor.map(process_project, range(len(projects[:])), projects[:])):
        pass



08:12:05 - Skipping -> Project Disclosable Restructuring Paper - Agriculture Cluster Development Project - P145037 already exists in the table.08:12:05 - Skipping -> Project Myanmar - National Food and Agriculture System Project already exists in the table.

08:12:05 - Skipping -> Project Montenegro - Institutional Development and Agriculture Strengthening Project (MIDAS) already exists in the table.
08:12:05 - Skipping -> Project Kenya - Climate Smart Agriculture Project : Environmental Assessment (Vol. 7) : Environmental and Social Impact Assessment for Sertonje Borehole Sub-project, Mugurin Sub Location, Simotwe Location, Kisanana Ward, Mogotio Sub County, Baringo County already exists in the table.
08:12:05 - Skipping -> Project Kenya - National Climate Smart Agriculture Project : Environmental Assessment (Vol. 2) : Pest Management Plan for Livestock Vaccination Campaign for Management of East Coast Fever, Bomet County already exists in the table.
08:12:05 - Skipping -> Project Ken