In [54]:
from dotenv import load_dotenv
import os,json 
from openai import AzureOpenAI
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
import numpy as np

load_dotenv()

True

In [38]:
## Initializing the connection and creating a table


 
# Database credentials and location are read from environment variables;
# a missing variable raises KeyError.
DBUSER, DBPASS, DBHOST, DBNAME = (
    os.environ[key] for key in ("DBUSER", "DBPASS", "DBHOST", "DBNAME")
)
# SSL is switched off only for a localhost server and required for any other host
DBSSL = "disable" if DBHOST == "localhost" else "require"
 
 
## initiate a connection
 
def initiate_connection(port=None):
    """Open an autocommit psycopg2 connection to the configured database.

    Uses the module-level DBNAME, DBUSER, DBPASS, DBHOST and DBSSL settings.

    Args:
        port: TCP port of the PostgreSQL server. When omitted, the DBPORT
            environment variable is used if set, otherwise 38530 (the value
            this notebook previously hard-coded).

    Returns:
        The open psycopg2 connection with ``autocommit`` enabled.
    """
    if port is None:
        port = int(os.environ.get("DBPORT", 38530))
    conn = psycopg2.connect(
        database=DBNAME,
        user=DBUSER,
        password=DBPASS,
        host=DBHOST,
        sslmode=DBSSL,
        port=port,
    )
    # Every statement is committed immediately; no explicit commit is required.
    conn.autocommit = True
    return conn
 
 
def create_table(conn, table_name, embedding_dim=1536):
    """(Re)create the table that stores text chunks and their embeddings.

    Enables the ``vector`` extension if needed, drops any existing table
    called ``table_name``, creates it again with the columns
    ``id`` / ``text`` / ``embedding`` and finally registers the vector type
    on ``conn``.

    Args:
        conn: Open psycopg2 connection.
        table_name: Name of the table. It is interpolated into the SQL text
            as-is, so it must be a trusted identifier, never user input.
        embedding_dim: Dimension of the ``embedding`` VECTOR column
            (default 1536, the value previously hard-coded here).
    """
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
        cur.execute(f"DROP TABLE IF EXISTS {table_name}")
        cur.execute(
            f"CREATE TABLE {table_name} "
            f"(id bigserial PRIMARY KEY, text TEXT, embedding VECTOR({int(embedding_dim)}));"
        )
    # Registered only after the extension exists in the database.
    register_vector(conn)
 
 
 
## initialize a connection
conn = initiate_connection()
## create the table first: create_table() runs CREATE EXTENSION, and the
## vector type can only be registered once the extension exists
create_table(conn,table_name="bio_data")
## Register the vector type with psycopg2
register_vector(conn)

### Preprocessing the text

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
 
doc_path = "data/messi.pdf"

# read the PDF; the loader's result is kept in `pages`
loader = PyPDFLoader(doc_path)
pages = loader.load()

# chunking parameters handed to the splitter
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# break the loaded pages into smaller overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(pages)

In [6]:
# show every chunk produced by the splitter, one per print call
for document in chunks:
    print(document)

page_content='Lionel\xa0Messi:\xa0A\xa0Biography\xa0\n\xa0\n\xa0\nLionel\xa0Messi\xa0is\xa0a\xa0professional\xa0footballer\xa0who\xa0plays\xa0for\xa0the\xa0Spanish\xa0football\xa0team,\xa0FC\xa0\nBarcelona.\xa0He\xa0wears\xa0the\xa0number\xa010\xa0shirt\xa0for\xa0his\xa0club\xa0and\xa0for\xa0his\xa0national\xa0team,\xa0Argentina.\xa0\nAlso,\xa0he\xa0is\xa0widely\xa0regarded\xa0as\xa0the\xa0best\xa0player\xa0of\xa0all\xa0time,\xa0along\xa0with\xa0Cristiano\xa0Ronaldo,\xa0and\xa0\nthe\xa0now\xadretired\xa0footballers,\xa0Pele\xa0and\xa0Diego\xa0Maradona.\xa0Read\xa0on\xa0to\xa0find\xa0out\xa0more\xa0about\xa0this\xa0\nspectacular\xa0footballer.\xa0\n\xa0\nChildhood' metadata={'source': 'data/messi.pdf', 'page': 0}
page_content='spectacular\xa0footballer.\xa0\n\xa0\nChildhood\xa0\nLionel\xa0Messi\xa0was\xa0born\xa0on\xa024\xa0June\xa01987\xa0in\xa0the\xa0small\xa0town\xa0of\xa0Rosario,\xa0Argentina.\xa0His\xa0\nmother\xa0is\xa0called\xa0Celia\xa0Cuccitini,\xa0and\xa0his\xa0father\xa0is\xa

### Create OpenAI Embeddings





In [28]:
# Azure OpenAI client; key, API version and endpoint are looked up in the
# environment (a missing variable yields None rather than raising here).
client = AzureOpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    api_version=os.environ.get("OPENAI_API_VERSION"),
    azure_endpoint=os.environ.get("OPENAI_API_ENDPOINT"),
)

def get_embeddings(text, client, model='text-embedding-ada-002'):
    """Create an embedding for ``text``.

    Args:
        text: Input passed as ``input`` to the embeddings endpoint.
        client: Client object exposing ``embeddings.create(model=..., input=...)``.
        model: Embedding model / deployment name
            (default ``'text-embedding-ada-002'``).

    Returns:
        The ``embedding`` of the first item in the response's ``data``.
    """
    response = client.embeddings.create(
        model=model,
        input=text,
    )
    # Read the field straight from the response object instead of
    # serialising it to JSON and parsing that JSON back.
    return response.data[0].embedding


##example: embed the first chunk and print the resulting vector
first_chunk_text = chunks[0].page_content
emb = get_embeddings(first_chunk_text, client)
print(emb)


[-0.00980521272867918, 0.023951852694153786, 0.007665333338081837, -0.016675028949975967, -0.0018623737851157784, 0.021275462582707405, -0.024112191051244736, 0.015750009566545486, -0.01081656850874424, -0.03246203809976578, 0.010051884688436985, 0.019869431853294373, -0.005198612809181213, -0.007541996892541647, -0.0017344126245006919, 0.011562750674784184, 0.023729847744107246, 0.003878917545080185, 0.012142430059611797, 1.480273022025358e-05, -0.005130778066813946, 0.009780545718967915, 0.008615020662546158, -0.0018839575350284576, -0.016675028949975967, -0.008701355196535587, 0.02550588734447956, -0.010717899538576603, 0.03268404304981232, -0.006666311528533697, 0.013258621096611023, -0.0024543865583837032, -0.002953897463157773, -0.018315397202968597, -0.01610768400132656, -0.0035058262292295694, -0.00119790097232908, -0.022854162380099297, 0.017279375344514847, -0.002258590655401349, 0.030932672321796417, 0.025555221363902092, -0.0231008343398571, -0.00018577487207949162, -0.0069

### Insert Embeddings and Text into the Database

In [39]:

register_vector(conn)
cur = conn.cursor()

## build one (text, embedding) tuple per chunk
data = []
for chunk in chunks:
    chunk_text = chunk.page_content
    data.append((chunk_text, get_embeddings(chunk_text, client)))

## send all rows through execute_values as a batch insert
insert_sql = "INSERT INTO bio_data (text,embedding) VALUES %s"
execute_values(cur, insert_sql, data)

# Commit once every embedding has been inserted
conn.commit()


**Let's see the total number of rows in the database**

In [49]:
cur = conn.cursor()

## how many rows ended up in the table?
cur.execute("SELECT COUNT(*) as cnt FROM bio_data")
(num_records,) = cur.fetchone()
print("Total number of records inserted: ",num_records)

# sanity check: fetch a single row and show it
cur.execute("SELECT * FROM bio_data LIMIT 1;")
records = cur.fetchall()
print("First record in table: ", records)

Total number of records inserted:  11
First record in table:  [(1, 'Lionel\xa0Messi:\xa0A\xa0Biography\xa0\n\xa0\n\xa0\nLionel\xa0Messi\xa0is\xa0a\xa0professional\xa0footballer\xa0who\xa0plays\xa0for\xa0the\xa0Spanish\xa0football\xa0team,\xa0FC\xa0\nBarcelona.\xa0He\xa0wears\xa0the\xa0number\xa010\xa0shirt\xa0for\xa0his\xa0club\xa0and\xa0for\xa0his\xa0national\xa0team,\xa0Argentina.\xa0\nAlso,\xa0he\xa0is\xa0widely\xa0regarded\xa0as\xa0the\xa0best\xa0player\xa0of\xa0all\xa0time,\xa0along\xa0with\xa0Cristiano\xa0Ronaldo,\xa0and\xa0\nthe\xa0now\xadretired\xa0footballers,\xa0Pele\xa0and\xa0Diego\xa0Maradona.\xa0Read\xa0on\xa0to\xa0find\xa0out\xa0more\xa0about\xa0this\xa0\nspectacular\xa0footballer.\xa0\n\xa0\nChildhood', array([-0.00980521,  0.02395185,  0.00766533, ..., -0.03130268,
        0.00775167, -0.0247412 ], dtype=float32))]


### Indexing For Faster Data Retrieval


* Here we have only a few embedding vectors, so the vector search is very fast. But what if we have a large corpus of data? For large datasets, we create indexes to speed up the search for similar embeddings.

* Pgvector supports the ivfflat index type, which speeds up approximate nearest neighbor (ANN) searches.

* We always want to build this index after inserting the data, because the index needs to discover clusters in the data to be effective, and it does this only when the index is first built.

* The ivfflat index has one tunable parameter: the number of lists. We can use the logic below to choose it. We also use vector_cosine_ops when creating the index, because we use cosine distance later when fetching the data.

In [51]:
import math

# `lists` is interpolated into the CREATE INDEX statement, so it has to be a
# whole number: use rows / 1000 (at least 10) for tables of up to one million
# rows and sqrt(rows) for larger ones, always truncated to an integer.
if num_records > 1000000:
    num_lists = int(math.sqrt(num_records))
else:
    num_lists = max(10, num_records // 1000)

print('Appropriate number of lists for our case: ',num_lists)

Appropriate number of lists for our case:  10


In [52]:
# Build the ivfflat index with the cosine operator class, the same distance
# measure the later `<=>` queries rely on.
create_index_sql = f'CREATE INDEX ON bio_data USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});'
cur.execute(create_index_sql)
conn.commit()

## Retrieval Pipeline


**Symbols used for similarity**

```
<-> - L2 distance
<#> - (negative) inner product
<=> - cosine distance
<+> - L1 distance (added in 0.7.0)
```

In [70]:


def get_top_similar_docs(query_embedding, conn, n=2):
    """Return the ``n`` rows of ``bio_data`` closest to ``query_embedding``.

    Rows are ordered with the ``<=>`` (cosine distance) operator.

    Args:
        query_embedding: Embedding of the query; converted with ``np.array``.
        conn: Open psycopg2 connection.
        n: Maximum number of rows to return (default 2).

    Returns:
        The result of ``fetchall()``: a list of one-element rows holding the
        ``text`` column.
    """
    embedding_array = np.array(query_embedding)
    # Register the vector type on this connection so the array can be adapted
    register_vector(conn)
    # The cursor is closed on exit; LIMIT is bound as a query parameter
    # instead of being formatted into the SQL text.
    with conn.cursor() as cur:
        cur.execute(
            "SELECT text FROM bio_data ORDER BY embedding <=> %s LIMIT %s",
            (embedding_array, n),
        )
        return cur.fetchall()



query = "With which players do messi played attacking role in Barcelona?"
query_embedding = get_embeddings(query, client)
top_text = get_top_similar_docs(query_embedding, conn)
# each row is a one-element tuple holding the chunk text
for (doc_text,) in top_text:
    print(doc_text)

was mute (can’t speak)? 
 
Early Career 
 
Near the start of the 2003­04 season, Messi started his first senior squad game. 
Unfortunately, he didn’t score any goals in that match. He quickly became Barcelona’s first 
choice winger, forming an astounding attacking trio with Ronaldinho and Samuel Eto’o. In 
the return leg against Chelsea in the Round of 16 in the Champions League, Messi suffered 
a hamstring injury. He worked hard to recover, but on the day of the Champions League
Cristiano Ronaldo. Barcelona finished the season without any trophies. 
 
Ahead of the new season, one concern was about Messi’s frequent muscular 
injuries. To combat this, the club implemented new regimens, and assigned Messi a 
personal physiotherapist. As a result, Messi stayed uninjured for 4 years. His performances 
in 2008 led him to be runner­up for the Ballon d’Or, behind Cristiano Ronaldo again.


In [79]:
# System message used as the 'system' role entry of the chat request built in
# get_open_ai_response. It tells the model to answer only from the supplied
# content and to reply `I dont know` when the answer is not present.
system_prompt = """You are AI assistant who answers the question from the extracted content from articles.
                    You answer should be concise and clear.
                    You should only answer from given content.
                    If answer is not present or the question is out of context you should give answer as `I dont know`
                    You should not make up any answers.
                    """


def get_open_ai_response(query,content,model='gpt-4-32k'):
    """Ask the chat model to answer ``query`` using only ``content``.

    Uses the module-level ``client`` and ``system_prompt``.

    Args:
        query: The question, placed after ``Question:`` in the user message.
        content: Retrieved text, placed after ``Content:`` in the user message.
        model: Chat model / deployment name (default ``'gpt-4-32k'``).

    Returns:
        The message content of the first choice in the completion.
    """
    user_prompt = f"""Question: {query} 
                      Content: {content}
                      Answer:"""

    messages = [
        {'role':'system','content':system_prompt},
        {'role':'user','content':user_prompt}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    # Read the field straight from the response object instead of
    # serialising it to JSON and parsing that JSON back.
    return response.choices[0].message.content



def answer(query):
    """Embed ``query``, retrieve similar chunks and return the model's answer.

    The text of each retrieved row is joined with single spaces and passed to
    ``get_open_ai_response`` together with the query.
    """
    rows = get_top_similar_docs(get_embeddings(query, client), conn)
    context = ' '.join(row[0] for row in rows)
    return get_open_ai_response(query, context)



In [80]:
# run the same query/answer/separator printout for each sample question
for query in (
    "Do messi eat pizza",
    "Childhood club of messi?",
    "write a small note on childhood of messi?",
):
    print('Query: ',query)
    print('Answer: ' ,answer(query))
    print('-------'*2)

Query:  Do messi eat pizza
Answer:  I don't know
--------------
Query:  Childhood club of messi?
Answer:  I don't know
--------------
Query:  write a small note on childhood of messi?
Answer:  From a young age, Messi established himself as a prodigy in his youth team, becoming one of the best players. However, he often experienced homesickness when his mother and siblings moved back to Argentina, leaving only him and his father in Spain. Interestingly, Messi was so quiet that his teammates at first thought he was mute.
--------------
