In [11]:
import json
import os

import numpy as np
import psycopg2

#importing local modules
from tokenize_sentences2db import openai_embeddings,log


# Function to perform the search
def search(query, model, db_config,number_of_results):
    """Return the stored chunks that are closest to ``query``.

    The query is embedded with ``openai_embeddings`` and compared against the
    ``embedding`` column of the ``embeddings_openai`` table using pgvector's
    ``<=>`` operator (cosine distance: smaller means more similar).

    Args:
        query: Text to search for.
        model: Embedding model name, passed through to ``openai_embeddings``.
        db_config: Keyword arguments forwarded to ``psycopg2.connect``.
        number_of_results: Maximum number of rows to return (SQL ``LIMIT``).

    Returns:
        A list of ``(project_id, chunk, distance)`` tuples ordered from the
        smallest distance (best match) to the largest.
    """
    # Encode the query
    log("Encoding the query...")
    # openai_embeddings appears to return one vector per input text, so the
    # first element is the vector for this single query -- TODO confirm
    # against tokenize_sentences2db.
    query_embedding = openai_embeddings(model,query)[0]
    log("Finding the most similar projects...")
    # Connect to the database. psycopg2's `with connection` block only ends
    # the transaction (commit/rollback); it does not close the connection, so
    # close it explicitly to avoid leaking one connection per call.
    conn = psycopg2.connect(**db_config)
    try:
        with conn, conn.cursor() as cur:
            # `<=>` is a *distance*: the most similar rows have the smallest
            # values, so the ordering must be ascending for LIMIT to keep the
            # best matches rather than the worst ones.
            cur.execute("""
                SELECT project_id, chunk, embedding <=> %s::VECTOR AS distance
                FROM embeddings_openai
                ORDER BY distance ASC
                LIMIT %s;
            """, (list(query_embedding), number_of_results))

            results = cur.fetchall()
    finally:
        conn.close()
    log("Done! Found {} results.".format(len(results)))
    return results


# Define the database configuration.
# Each value can be overridden through the standard libpq environment
# variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) so credentials do
# not have to be edited into the notebook. The literals below are only
# fallbacks that keep the previous behaviour when the variables are unset.
# NOTE(review): the fallback password is still stored in source control --
# remove it (and rotate the password) once the environment is configured.
db_config = {
    'dbname': os.environ.get('PGDATABASE', 'wb_s2_embeddings'),
    'user': os.environ.get('PGUSER', 's2'),
    'password': os.environ.get('PGPASSWORD', 'wb@s2'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': int(os.environ.get('PGPORT', 5432)),
}



In [12]:
# Convert this notebook (search.ipynb) to a Python script (search.py)
!jupyter nbconvert --to script search.ipynb

[NbConvertApp] Converting notebook search.ipynb to script
[NbConvertApp] Writing 2959 bytes to search.py


In [13]:
model="text-embedding-ada-002"
number_of_results = 5

# Database configuration: `db_config` is defined in the earlier cell that also
# defines `search`. It is intentionally not re-declared here so the connection
# settings (and credentials) live in a single place.

# Load the projects
with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)

# Index the projects once by their comma-joined ids, which is the form the
# lookup below compares against `project_id`. `setdefault` keeps the first
# project for a given key, matching a first-match linear scan.
projects_by_id = {}
for p in projects:
    projects_by_id.setdefault(",".join(p["ids"]), p)

# Define the query
query = "Satellite Remote sensing for agriculture"

# Perform the search
results = search(query, model, db_config,number_of_results)
# A bare `results[::-1]` expression builds a reversed copy and throws it away.
# Order the rows explicitly so the closest match (smallest distance) is
# printed first, regardless of the order in which `search` returned them.
results = sorted(results, key=lambda row: row[2])
# Print the results
for project_id, chunk, distance in results:
    project = projects_by_id.get(project_id)
    print(f"Project ID: {project_id}")
    if project is None:
        # The embeddings table can reference a project that is absent from
        # the JSON file; report it instead of failing on `None['title']`.
        print("Project metadata not found in digital_agriculture_projects.json")
    else:
        print(f"Project title: {project['title']}")
        print(f"Project abstract: {project['abstract']}")
    print(f"Relevant snippet: {chunk}")
    print(f"Distance: {distance}")
    print("\n")


23:35:38 - Encoding the query...
23:35:38 - Finding the most similar projects...


DataException: array must be 1-D


In [14]:
# Debugging: keep the complete return value of openai_embeddings (without the
# [0] indexing used inside search) so its structure can be inspected.
query_embedding = openai_embeddings(model,query)

In [18]:
# Length of the first element of the returned value, i.e. the dimensionality
# of the embedding vector for the single query.
len(query_embedding[0])

1536

In [16]:
# Display the query text that was embedded.
query

'Satellite Remote sensing for agriculture'

In [17]:
# Display the full return value; it is a nested list (a list holding the
# embedding list), so this output is very long.
query_embedding

[[-0.013060777448117733,
  -0.013284601271152496,
  -0.00648430734872818,
  0.000207263306947425,
  -0.010993698611855507,
  0.020604955032467842,
  -0.0013429428217932582,
  -0.0006406133179552853,
  -0.022066393867135048,
  -0.022737864404916763,
  0.00907144695520401,
  0.022250719368457794,
  -0.020657619461417198,
  -0.009828497655689716,
  -0.008755460381507874,
  0.020262636244297028,
  0.032309625297784805,
  -0.022053226828575134,
  0.020117809996008873,
  -0.009986490942537785,
  0.00853163655847311,
  -0.0002219723246525973,
  0.0026825941167771816,
  0.02082877978682518,
  0.009966742247343063,
  -0.0009800520492717624,
  0.023422501981258392,
  -0.043737802654504776,
  -0.006240734364837408,
  -0.005431018769741058,
  0.010065487585961819,
  -0.02875477634370327,
  -0.015693997964262962,
  -0.02138175629079342,
  -0.01735292747616768,
  -0.029360415413975716,
  0.020618122071027756,
  -0.01647079922258854,
  -0.005072242114692926,
  -0.02558174356818199,
  0.01431155763566