In [2]:
import pandas as pd
from rich import print, pretty
from rich.console import Console
from icecream import ic
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances_argmin as distances_argmin
from embed_sources import create_sentence_embedding
from sentence_transformers import SentenceTransformer
import google.ai.generativelanguage as glm 
import google.generativeai as genai
import openai
from dotenv import load_dotenv
load_dotenv()

from IPython.display import Markdown

# Shared rich Console, used by later cells for styled output.
console = Console()
# Enable rich's pretty-printing of expression results in the interactive session.
pretty.install()

In [None]:
# Read both API keys from the environment; nothing is hard-coded in the notebook.
# A missing variable yields None here rather than raising.
API_KEY = os.environ.get("GEMINI_API_KEY")
openai.api_key = os.environ.get("OPENAI_API_KEY")
genai.configure(api_key=API_KEY)


In [3]:
# Load the question/answer table (columns: input_text, output_text, category).
filepath = Path.cwd().joinpath("embeddings", "so_database_app.csv")
# The CSV was saved together with its index, which read_csv brings back as a
# stray "Unnamed: 0" data column. Drop it when present and keep the default
# RangeIndex, so row labels stay equal to row positions for the lookups that
# index this frame with an argmax position.
so_df = pd.read_csv(filepath).drop(columns=["Unnamed: 0"], errors="ignore")
so_df.head()

Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python


In [None]:
# Report the (rows, columns) shape of the table through the rich console.
console.print(
    f"Shape of the data is [bold green underline]{so_df.shape}",
    style="bold yellow",
)

In [4]:
# Load the pickled question embeddings from the local "embeddings" folder.
# NOTE(security): pickle.load can execute arbitrary code -- only use it on
# files you produced yourself or otherwise trust.
filepath = Path.cwd() / "embeddings" / "question_embeddings_app.pkl"
with filepath.open("rb") as file:
    questions_embeddings = pickle.load(file)
print(questions_embeddings.shape)

# Store each row's embedding next to its text; the DataFrame then acts as a
# small in-memory stand-in for a vector database.
so_df["embeddings"] = questions_embeddings.tolist()

In [35]:
# Embed the user's question and look up the closest stored question.
query = ['How to concat dataframes pandas']
# Alternative, longer query kept for experimentation:
# query = ['''Creating new column from filtering others<p>I need to assign to a new column the value 1 or 0 depending on what 
# other columns have.
# I have around 30 columns with binary values (1 or 0), but also other variables with numeric, continuous, values 
# (e.g. 200). I would like to avoid the write a logical condition with many OR, so I was wondering if there is an 
# easy and fast way to do it.''']

model = SentenceTransformer("all-mpnet-base-v2")

# create_sentence_embedding returns a pair; only the embedding is used here.
query_embedding, _ = create_sentence_embedding(query, model, bert = False)
print(query_embedding.shape)

# Convert once to plain nested lists and share them between both searches below.
query_vectors = query_embedding.tolist()
document_vectors = list(so_df.embeddings.values)

# Cosine similarity between the query and every stored embedding.
cos_similarity = cosine_similarity(query_vectors, document_vectors)
print(cos_similarity.shape)

# Position of the highest similarity value. np.argmax works on the flattened
# matrix, which is the row position as long as there is a single query.
index_doc_cosine = np.argmax(cos_similarity)
print(index_doc_cosine)

# Cross-check: position of the nearest stored embedding under
# pairwise_distances_argmin's default metric (Euclidean distance).
index_doc_distances = distances_argmin(query_vectors, document_vectors)[0]
print(index_doc_distances)

In [36]:
# Show the stored question text at the row selected by the cosine search.
print(so_df.loc[index_doc_cosine, "input_text"])

In [37]:
# Show the stored answer text for the same matched row.
print(so_df.loc[index_doc_cosine, "output_text"])

In [19]:
# Print the name of every model from genai.list_models() whose supported
# generation methods include 'generateContent'.
for m in genai.list_models():
    if 'generateContent' not in m.supported_generation_methods:
        continue
    print(m.name)

In [47]:
# Build the prompt: the best-matching stored question/answer pair is the
# context, followed by the instructions and the user's query.
context = f"""
        Question: {so_df.input_text[index_doc_cosine]}\n
        Answer: {so_df.output_text[index_doc_cosine]}
"""
# `query` is defined as a list of strings; interpolating it directly would put
# the list repr (e.g. "['...']") into the prompt, so use its text instead.
query_text = query if isinstance(query, str) else " ".join(query)
# The stray double quote that followed the query in the template is removed.
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query_text}.
             If the context doesn't provide \
             any relevant information, answer with 
             [I couldn't find a good match in the \
             document database for your query]
             """
Markdown(prompt)

Here is the context: 
        Question: matching very long and complex version numbers with regular expressions in python<p>I have lots of lines like this.</p>
<pre><code>some text some more text v3.1.0-beta.4 more &amp; more text
some text some v2 build.3 some more text more &amp; more text
some text some v21.1.23456.551436a4 alpha.4 some more text v16.1.2 more &amp; more text
</code></pre>
<p>version numbers are:</p>
<pre><code>v3.1.0-beta.4
v2 build 3
v21.1.23456.551436a4 alpha.4 and v16.1.2
</code></pre>
<p>The problem is I can have have words beta or build or neither in my version numbers. variable length of version numbers.</p>
<p>Here's what I have now and this can only match numbers.</p>
<pre><code>\d+(?:\.\d+)+
</code></pre>
<p>How do I search for the existence of one or more version numbers in a give line like shown in above lines like above?</p>

        Answer: <p>You may try this regex:</p>
<pre><code>\bv[\d+]+[\w.]*(?:[-\s]+(?:alpha|beta|build)[\w.]*)?
</code></pre>
<p><a href="https://regex101.com/r/sualem/3/" rel="nofollow noreferrer">RegEx Demo</a></p>
<p><strong>Pattern Details:</strong></p>
<ul>
<li><code>\bv</code>: Match <code>v</code> after a word boundary</li>
<li><code>[\d+]+</code>: Match 1+ digit or dot characters</li>
<li><code>[\w.]*</code>: Match 0 or more word or dot characters</li>
<li><code>(?:[-\s]+(?:alpha|beta|build)[\w.]*)?</code>: starting with whitespace or hyphen, optionally match <code>alpha|beta|build</code> part followed by 0 or more word or dot characters</li>
</ul>

             Using the relevant information from the context,
             provide an answer to the query: ['How to concat dataframes pandas']."
             If the context doesn't provide              any relevant information, answer with 
             [I couldn't find a good match in the              document database for your query]
             

In [48]:
# Send the prompt to Gemini.
# Note: this rebinds `model`, which the query cell used for the SentenceTransformer.
model = genai.GenerativeModel(model_name='models/gemini-pro')
answer = model.generate_content(contents=prompt)
# Print the response object itself, not only its text.
print(answer)

In [49]:
# Print only the `text` attribute of the response object.
print(answer.text)

## Scale with approximate nearest neighbor search

When dealing with a large dataset, computing the similarity between the query and each original embedded document in the database might be too expensive. Instead of doing that, you can use approximate nearest neighbor algorithms that find the most similar documents in a more efficient way.

These algorithms usually work by creating an index for your data, and using that index to find the most similar documents for your queries. In this notebook, we will use ScaNN to demonstrate the benefits of efficient vector similarity search. First, you have to create an index for your embedded dataset.

In [5]:
# create an index of the embedding database using ScaNN
# index = create_index(
#     embedded_dataset = questions_embeddings,
#     num_leaves = 25,
#     num_leaves_to_search = 10,
#     training_sample_size = 2000
# )
 