In [6]:
import re
import sqlite3
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import os, json
import requests
from nltk.tokenize import word_tokenize

#cross project import
from get_full_text import clean_text,retrieve_full_text 


MAX_TOKENS = 250  # default upper bound on tokens per chunk

def chunk_splitter(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into chunks made of whole sentences.

    The text is cut into sentences with a regular expression (sentence-ending
    punctuation followed by a capitalised word, or semicolons). Consecutive
    sentences are then packed greedily into chunks whose token count, as
    measured by ``word_tokenize``, does not exceed ``max_tokens``.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_tokens: Maximum number of tokens per chunk.

    Returns:
        A list of chunk strings; the sentences of a chunk are joined with a
        single space. A single sentence longer than ``max_tokens`` is never
        cut: it becomes a chunk of its own that exceeds the limit.
    """
    if not text:
        return []

    raw_sentences = re.split(r'(?<=[^A-Z].[.?;!]) +(?=[A-Z])|(?<=;;) *|;', text)

    chunks = []
    current_chunk = []
    token_count = 0

    for sentence in raw_sentences:
        # Splitting on ';' and after ';;' can produce empty pieces; they carry
        # no content and would only add stray spaces to the joined chunk.
        if not sentence.strip():
            continue

        sentence_tokens = len(word_tokenize(sentence))

        # Flush only when there is something to flush: an oversized first
        # sentence must not produce an empty leading chunk.
        if current_chunk and token_count + sentence_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            token_count = 0

        current_chunk.append(sentence)
        token_count += sentence_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to compute TF-IDF weighted embeddings
def compute_tfidf_weighted_embeddings(sentences, model):
    """Embed each sentence as the TF-IDF weighted average of its word embeddings.

    A ``TfidfVectorizer`` is fitted on ``sentences`` themselves. Every term of
    the resulting vocabulary is encoded once with ``model``; each sentence
    embedding is the average of the embeddings of its terms, weighted by the
    terms' TF-IDF scores in that sentence.

    Args:
        sentences: Iterable of strings (one per chunk/sentence).
        model: Object providing ``encode(list_of_str)`` (one vector per
            string) and ``get_sentence_embedding_dimension()``.

    Returns:
        A list with one 1-D numpy array per input sentence, in input order.
        A sentence without any vocabulary term gets a zero vector. An empty
        input returns an empty list.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when the
            sentences contain no tokens at all (empty vocabulary).
    """
    sentences = list(sentences)
    if not sentences:
        return []

    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(sentences).tocsr()
    total_rows = tfidf_matrix.shape[0]

    # Invert the term -> column mapping so a column index gives its term.
    terms = [None] * tfidf_matrix.shape[1]
    for term, column in vectorizer.vocabulary_.items():
        terms[column] = term

    # Encode every vocabulary term exactly once instead of re-encoding the
    # same words for each sentence they appear in.
    term_embeddings = np.asarray(model.encode(terms))

    sentence_embeddings = []
    for row_number in range(total_rows):
        # CSR layout: the non-zero entries of a row are a contiguous slice.
        start = tfidf_matrix.indptr[row_number]
        end = tfidf_matrix.indptr[row_number + 1]
        columns = tfidf_matrix.indices[start:end]
        weights = tfidf_matrix.data[start:end]

        # Keep strictly positive weights only (guards against stored zeros).
        positive = weights > 0
        columns, weights = columns[positive], weights[positive]

        if columns.size > 0:
            weighted_embedding = np.average(term_embeddings[columns], axis=0, weights=weights)
        else:
            weighted_embedding = np.zeros(model.get_sentence_embedding_dimension())

        sentence_embeddings.append(weighted_embedding)
        # Print progress every 500 sentences
        if len(sentence_embeddings) % 500 == 0:
            print(f"Processed row {len(sentence_embeddings)} of {total_rows}")

    return sentence_embeddings



# Function to process a project
def process_project(project, annoy_index):
    """Chunk one project's text, embed the chunks and store them.

    The project's title, abstract and keywords are prepended to its full text,
    the text is cleaned and split into chunks, and one embedding per chunk is
    computed. Each (chunk, embedding) pair is inserted into the ``embeddings``
    table of ``embeddings.db`` and added to ``annoy_index`` under the same id.

    Relies on the module-level ``model``, ``counter`` and ``counter_lock``.

    Args:
        project: Dict read with the keys ``title``, ``abstract``, ``keywords``,
            ``full_text`` and ``ids``. It is modified in place (``keywords``
            and ``full_text`` are overwritten).
            NOTE(review): ``full_text`` is presumably filled in by
            ``retrieve_full_text`` - confirm in ``get_full_text``.
        annoy_index: Index object receiving ``add_item(id, embedding)`` calls.
    """
    global counter

    retrieve_full_text(project)
    #prepend the title, abstract and keywords to the full text
    project["keywords"] = project["keywords"].replace(";", ". ").replace(",", ". ")
    project["full_text"] = project["title"] + ". " + project["abstract"] + ". " + project["keywords"] + ". " + project["full_text"]
    project["full_text"] = clean_text(project["full_text"])

    chunks = chunk_splitter(project["full_text"])
    sentence_embeddings = compute_tfidf_weighted_embeddings(chunks, model)
    rows = list(zip(chunks, sentence_embeddings))
    project_id = ",".join(project['ids'])

    # Reserve a contiguous range of ids in one critical section instead of
    # taking the lock once per chunk.
    with counter_lock:
        first_id = counter
        counter += len(rows)

    # sqlite3's connection context manager only commits/rolls back; it does
    # not close the connection, so close it explicitly.
    conn = sqlite3.connect('embeddings.db')
    try:
        with conn:  # commit on success, roll back if anything below raises
            c = conn.cursor()
            for idx, (chunk, embedding) in enumerate(rows, start=first_id):
                c.execute("INSERT INTO embeddings (id, project_id, chunk, embedding) VALUES (?, ?, ?, ?)", (idx, project_id, chunk, embedding.tobytes()))
                annoy_index.add_item(idx, embedding)  # Add the embeddings directly to the Annoy index
    finally:
        conn.close()

    print("Processed project", project["title"])

# Initialize a SentenceTransformer model
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
embedding_dim = model.get_sentence_embedding_dimension()

# Start from scratch: delete the database if it exists
if os.path.exists('embeddings.db'):
    os.remove('embeddings.db')

# Create a folder to store text files
text_folder = "text_files"
os.makedirs(text_folder, exist_ok=True)

# Create the SQLite database.
# The connection context manager only commits; close the connection explicitly
# so the file handle is released before the worker threads open their own.
conn = sqlite3.connect('embeddings.db')
try:
    with conn:
        conn.execute('CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY, project_id TEXT, chunk TEXT, embedding BLOB)')
finally:
    conn.close()

# Initialize the Annoy index
annoy_index = AnnoyIndex(embedding_dim, 'angular')

# Load the projects
with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)


# Initialize counter and lock (shared by the worker threads to hand out ids)
# NOTE(review): ids start at 1, so item 0 is never added to the Annoy index
# even though Annoy allocates max(id)+1 items - confirm this is intended.
counter = 1
counter_lock = Lock()

# Only the first 40 projects are processed
projects_to_process = projects[:40]

# Process texts and save embeddings into the database using 4 threads
with ThreadPoolExecutor(max_workers=4) as executor:
    # Draining the iterator waits for every task and re-raises the first
    # exception raised inside a worker.
    for _ in executor.map(process_project, projects_to_process, [annoy_index] * len(projects_to_process)):
        pass

# Build the Annoy index with 10 trees
annoy_index.build(10)

# Save the Annoy index
annoy_index.save('embeddings.ann')

In [5]:
import sqlite3
from prettytable import PrettyTable

def pretty_print_first_row(db_path, table_name):
    """Print the first row of a SQLite table as a two-column table.

    The row is shown transposed: one line per column, with the column name on
    the left and its value on the right.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Name of the table to read. It is interpolated directly
            into the SQL text (identifiers cannot be bound as parameters), so
            it must come from a trusted source.

    Returns:
        None. If the table has no rows, a short message is printed instead of
        the table.
    """
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get the column names (second field of each table_info record)
        cursor.execute(f"PRAGMA table_info({table_name})")
        column_names = [info[1] for info in cursor.fetchall()]

        # Get the first row
        cursor.execute(f"SELECT * FROM {table_name} LIMIT 1")
        first_row = cursor.fetchone()

        cursor.close()
    finally:
        # Release the connection even if one of the queries fails
        conn.close()

    # fetchone() returns None for an empty table; zip() would fail on it
    if first_row is None:
        print(f"Table {table_name!r} has no rows.")
        return

    # Create a PrettyTable object
    table = PrettyTable()
    table.field_names = ['Column', 'Value']
    table.align = 'l'  # Set left alignment for all fields

    # Transpose the row: one (column name, value) line per column
    for column_name, value in zip(column_names, first_row):
        table.add_row([column_name, value])

    # Print the table
    print(table)


# Example usage: display the first row of the "embeddings" table
db_path, table_name = "embeddings.db", "embeddings"
pretty_print_first_row(db_path=db_path, table_name=table_name)



+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------