In [1]:
import pandas as pd
from rich import print, pretty
from rich.console import Console
from icecream import ic
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances_argmin as distances_argmin
from embed_sources import create_sentence_embedding
from sentence_transformers import SentenceTransformer
import google.ai.generativelanguage as glm 
import google.generativeai as genai
import openai
from dotenv import load_dotenv
load_dotenv()

from IPython.display import Markdown

console = Console()
pretty.install()

In [2]:
openai.api_key = os.getenv('OPENAI_API_KEY')
API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=API_KEY)
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


In [3]:
filepath = Path.cwd().joinpath("embeddings", "so_database_app.csv")
so_df = pd.read_csv(filepath)
so_df.head()

Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python


In [None]:
console.print(f"Shape of the data is [bold green underline]{so_df.shape}", style="bold yellow")

In [4]:
filepath = Path.cwd().joinpath("embeddings", "hmd_so_embeddings2000.pkl")
with open(filepath, "rb") as file:
    questions_embeddings = pickle.load(file)
print(questions_embeddings.shape)

# added the embedding to the dataframe
# This is acting like our vector DB
so_df["embeddings"] = questions_embeddings.tolist() 

In [6]:
# Now lets ask the LLM a question
query = ['How to concat dataframes pandas']
# query = ['''Creating new column from filtering others<p>I need to assign to a new column the value 1 or 0 depending on what 
# other columns have.
# I have around 30 columns with binary values (1 or 0), but also other variables with numeric, continuous, values 
# (e.g. 200). I would like to avoid the write a logical condition with many OR, so I was wondering if there is an 
# easy and fast way to do it.''']

model = SentenceTransformer("all-mpnet-base-v2")

# get the embedding of the query
query_embedding, _ = create_sentence_embedding(query, model, bert = False)
print(query_embedding.shape)
cos_similarity = cosine_similarity(query_embedding.tolist(),
                            list(so_df.embeddings.values))

print(cos_similarity.shape)

# find the embedding with the highest similarity value
index_doc_cosine = np.argmax(cos_similarity)
print(index_doc_cosine)

# get the distances
index_doc_distances = distances_argmin(
                    query_embedding.tolist(),
                    list(so_df.embeddings.values))[0]
print(index_doc_distances)

In [8]:
# get the input text from the database using the above index
Markdown(so_df.input_text[index_doc_cosine])

Concatenate 2 dataframes and repeat values from small one with pandas<p>I have these two dataframes:</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Field1</th>
<th>Field2</th>
</tr>
</thead>
<tbody>
<tr>
<td>0.5</td>
<td>0.7</td>
</tr>
<tr>
<td>2</td>
<td>1</td>
</tr>
<tr>
<td>3</td>
<td>0.1</td>
</tr>
<tr>
<td>4</td>
<td>0.4</td>
</tr>
</tbody>
</table>
</div>
<p>and</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Date</th>
<th>Time</th>
</tr>
</thead>
<tbody>
<tr>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>2022-08-01</td>
<td>2</td>
</tr>
</tbody>
</table>
</div>
<p>and a I need to obtain the following:</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Field1</th>
<th>Field2</th>
<th>Date</th>
<th>Time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0.5</td>
<td>0.7</td>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>2</td>
<td>1</td>
<td>2022-08-01</td>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>0.1</td>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>4</td>
<td>0.4</td>
<td>2022-08-01</td>
<td>2</td>
</tr>
</tbody>
</table>
</div>
<p>Thanks in advance</p>

In [9]:
Markdown(so_df.output_text[index_doc_cosine])

<p>You can elongate your second dataframe to match dimentions, and then concatenate it with first dataframe.</p>
<pre class="lang-py prettyprint-override"><code>import pandas as pd

df1 = pd.DataFrame({'Field1': [0.5, 2, 3, 4], 'Field2': [0.7, 1, 0.1, 0.4]})
print(df1)
#    Field1  Field2
# 0     0.5     0.7
# 1     2.0     1.0
# 2     3.0     0.1
# 3     4.0     0.4

df2 = pd.DataFrame({'Date': ['2022-08-01', '2022-08-01'], 'Time': [1, 2]})
print(df2)
#          Date  Time
# 0  2022-08-01     1
# 1  2022-08-01     2

n = int(df1.size / df2.size)
df3 = pd.concat([df2] * n, axis=0).reset_index(drop=True)
print(df3)
#          Date  Time
# 0  2022-08-01     1
# 1  2022-08-01     2
# 2  2022-08-01     1
# 3  2022-08-01     2

df4 = pd.concat([df1, df3], axis=1)
print(df4)
#    Field1  Field2        Date  Time
# 0     0.5     0.7  2022-08-01     1
# 1     2.0     1.0  2022-08-01     2
# 2     3.0     0.1  2022-08-01     1
# 3     4.0     0.4  2022-08-01     2
</code></pre>
<p>or shorter:</p>
<pre class="lang-py prettyprint-override"><code>df4 = pd.concat([
    df1,
    pd.concat(
        [df2] * int(df1.size / df2.size),
        axis=0
    ).reset_index(drop=True)
], axis=1)
</code></pre>

In [10]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

In [11]:
context = f"""
        Question: {so_df.input_text[index_doc_cosine]}\n
        Answer: {so_df.output_text[index_doc_cosine]}
"""
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query}."
             If the context doesn't provide \
             any relevant information, answer with 
             [I couldn't find a good match in the \
             document database for your query]
             """
Markdown(prompt)

Here is the context: 
        Question: Concatenate 2 dataframes and repeat values from small one with pandas<p>I have these two dataframes:</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Field1</th>
<th>Field2</th>
</tr>
</thead>
<tbody>
<tr>
<td>0.5</td>
<td>0.7</td>
</tr>
<tr>
<td>2</td>
<td>1</td>
</tr>
<tr>
<td>3</td>
<td>0.1</td>
</tr>
<tr>
<td>4</td>
<td>0.4</td>
</tr>
</tbody>
</table>
</div>
<p>and</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Date</th>
<th>Time</th>
</tr>
</thead>
<tbody>
<tr>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>2022-08-01</td>
<td>2</td>
</tr>
</tbody>
</table>
</div>
<p>and a I need to obtain the following:</p>
<div class="s-table-container">
<table class="s-table">
<thead>
<tr>
<th>Field1</th>
<th>Field2</th>
<th>Date</th>
<th>Time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0.5</td>
<td>0.7</td>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>2</td>
<td>1</td>
<td>2022-08-01</td>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>0.1</td>
<td>2022-08-01</td>
<td>1</td>
</tr>
<tr>
<td>4</td>
<td>0.4</td>
<td>2022-08-01</td>
<td>2</td>
</tr>
</tbody>
</table>
</div>
<p>Thanks in advance</p>

        Answer: <p>You can elongate your second dataframe to match dimentions, and then concatenate it with first dataframe.</p>
<pre class="lang-py prettyprint-override"><code>import pandas as pd

df1 = pd.DataFrame({'Field1': [0.5, 2, 3, 4], 'Field2': [0.7, 1, 0.1, 0.4]})
print(df1)
#    Field1  Field2
# 0     0.5     0.7
# 1     2.0     1.0
# 2     3.0     0.1
# 3     4.0     0.4

df2 = pd.DataFrame({'Date': ['2022-08-01', '2022-08-01'], 'Time': [1, 2]})
print(df2)
#          Date  Time
# 0  2022-08-01     1
# 1  2022-08-01     2

n = int(df1.size / df2.size)
df3 = pd.concat([df2] * n, axis=0).reset_index(drop=True)
print(df3)
#          Date  Time
# 0  2022-08-01     1
# 1  2022-08-01     2
# 2  2022-08-01     1
# 3  2022-08-01     2

df4 = pd.concat([df1, df3], axis=1)
print(df4)
#    Field1  Field2        Date  Time
# 0     0.5     0.7  2022-08-01     1
# 1     2.0     1.0  2022-08-01     2
# 2     3.0     0.1  2022-08-01     1
# 3     4.0     0.4  2022-08-01     2
</code></pre>
<p>or shorter:</p>
<pre class="lang-py prettyprint-override"><code>df4 = pd.concat([
    df1,
    pd.concat(
        [df2] * int(df1.size / df2.size),
        axis=0
    ).reset_index(drop=True)
], axis=1)
</code></pre>

             Using the relevant information from the context,
             provide an answer to the query: ['How to concat dataframes pandas']."
             If the context doesn't provide              any relevant information, answer with 
             [I couldn't find a good match in the              document database for your query]
             

In [12]:
model = genai.GenerativeModel('models/gemini-pro')
answer = model.generate_content(prompt)
print(answer)

In [13]:
Markdown(answer.text)

To concatenate dataframes in pandas, you can use the `concat()` function. This function takes a list of dataframes as input, and concatenates them along a specified axis. For example, to concatenate two dataframes `df1` and `df2` vertically, you would use the following code:

```
df3 = pd.concat([df1, df2])
```

This would create a new dataframe `df3` that contains all of the rows from `df1` and `df2`. You can also specify the axis to concatenate along using the `axis` argument. For example, to concatenate `df1` and `df2` horizontally, you would use the following code:

```
df3 = pd.concat([df1, df2], axis=1)
```

This would create a new dataframe `df3` that contains all of the columns from `df1` and `df2`.

## Scale with approximate nearest neighbor search

When dealing with a large dataset, computing the similarity between the query and each original embedded document in the database might be too expensive. Instead of doing that, you can use approximate nearest neighbor algorithms that find the most similar documents in a more efficient way.

These algorithms usually work by creating an index for your data, and using that index to find the most similar documents for your queries. In this notebook, we will use ScaNN to demonstrate the benefits of efficient vector similarity search. First, you have to create an index for your embedded dataset.

In [5]:
# cerate index of the embedding database using scann
# index = create_index(
#     embedded_dataset = questions_embeddings,
#     num_leaves = 25,
#     num_leaves_to_search = 10,
#     training_sample_size = 2000
# )
 