# 0. Installing necessary packages

In [15]:
#!pip install chromadb google-cloud-bigquery pandas langchain google-cloud-aiplatform gradio

Collecting langchain
  Downloading langchain-0.0.353-py3-none-any.whl.metadata (13 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.25-cp39-cp39-macosx_10_9_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.9.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (7.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Using cached async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.6.3-py3-none-any.whl.metadata (25 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain)
  Downloading langchain_community-0.0.7-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain-core<0.2,>=0.1.4 (from langchain)
  Downloading langchain_core-0.1.4-py3-none-any.whl.metadata (4.0 kB)
Collecting langsmith

In [6]:
from google.cloud import bigquery
import pandas as pd
import requests
from google.oauth2 import service_account
import vertexai
from vertexai.language_models import TextGenerationModel


### We will be looking for data in public BigQuery datasets

We're creating an explicit list of tables to search.

In [7]:

# Project that the BigQuery client is bound to.
# NOTE(review): presumably chosen so that dataset references built from this
# client default to the public-data project — confirm.
PUBLIC_DATA_PROJECT = 'bigquery-public-data'
bqclient = bigquery.Client(project=PUBLIC_DATA_PROJECT)

# Heading 1
## Heading 2
### Heading 3

**Bold text**
*Italic text*
~~Strikethrough text~~

- Bullet point 1
- Bullet point 2
- Bullet point 3

1. Numbered item 1
2. Numbered item 2
3. Numbered item 3

[Link](https://www.example.com)
![Image](image.jpg)

> Blockquote

`Inline code`


In [8]:
#project_ref = bqclient.project('bigquery-public-data')

# Tables to index, grouped by dataset. Dataset order and the table order
# within each dataset define the order of the entries in table_list.
_TABLES_BY_DATASET = {
    'iowa_liquor_sales': ['sales'],
    'london_bicycles': ['cycle_hire', 'cycle_stations'],
    'ml_datasets': [
        'census_adult_income',
        'credit_card_default',
        'holidays_and_events_for_forecasting',
        'iris',
        'penguins',
        'ulb_fraud_detection',
    ],
    'new_york_citibike': ['citibike_stations', 'citibike_trips'],
    'fdic_banks': ['institutions', 'locations'],
    'fda_food': ['food_enforcement', 'food_events'],
    'fcc_political_ads': [
        'broadcast_tv_radio_station',
        'content_info',
        'file_history',
        'file_record',
    ],
}

# One {'dataset': ..., 'table': ...} dict per table; later cells add more keys
# to these dicts in place.
table_list = [
    {'dataset': dataset_name, 'table': table_name}
    for dataset_name, table_names in _TABLES_BY_DATASET.items()
    for table_name in table_names
]



### A flag controls whether missing table descriptions are generated with Text Bison
If it is `True`, missing table descriptions are generated.

In [9]:


# Passed to assemble_description(): when True, a table whose description is
# missing gets one generated by the text model; when False, such tables keep
# the "No description available" placeholder.
GENERATE_TABLE_DESCRIPTIONS: bool = True


### Function that combines the dataset, table, and field descriptions
If `GENERATE_TABLE_DESCRIPTIONS` is true, missing table descriptions are generated with Text Bison from the other descriptions.

In [10]:
def _format_table_description(dataset_id, dataset_description, table_id, table_description, schema):
    """Render dataset, table and column metadata as a single text blob.

    Args:
        dataset_id: Dataset identifier, inserted verbatim.
        dataset_description: Dataset description (may be None; rendered as-is).
        table_id: Table identifier; underscores are replaced by spaces.
        table_description: Table description to embed in the text.
        schema: Iterable of fields exposing ``name``, ``field_type`` and
            ``description`` attributes.

    Returns:
        The assembled description string.
    """
    header = (
        f"Dataset name: '{dataset_id}', Dataset description:'{dataset_description}', "
        f"Table name:\"{str(table_id).replace('_',' ')}\", Table description:'{table_description}'"
    )
    columns = ",".join(
        f"column-name: {field.name}, column-type:\"{field.field_type}\" ,column-description: \"{field.description}\""
        for field in schema
    )
    # NOTE(review): there is no separator between the table description and
    # "Schema attributes:"; the text is kept unchanged on purpose so documents
    # built earlier and later stay identical.
    return header + "Schema attributes: " + columns


def assemble_description(dataset_ref, table_ref, GENERATE_TABLE_DESCRIPTIONS):
    """Build the short and the full textual description of one table.

    Dataset and table metadata are fetched through the module-level
    ``bqclient``. When the table has no usable description (None, empty or
    whitespace only) and ``GENERATE_TABLE_DESCRIPTIONS`` is truthy, a
    description is requested from the "text-bison@002" model, using the
    assembled metadata text as context.

    Args:
        dataset_ref: Dataset reference; its ``dataset_id`` is used in the text.
        table_ref: Table reference; its ``table_id`` is used in the text.
        GENERATE_TABLE_DESCRIPTIONS: Whether to generate a description for
            tables that lack one.

    Returns:
        A ``(table_description, description)`` tuple: the short table
        description (existing, generated, or "No description available") and
        the full text combining dataset, table and column metadata.
    """
    table = bqclient.get_table(table_ref)
    dataset = bqclient.get_dataset(dataset_ref)

    def render(table_description):
        # Single place where the full text is assembled, used for both the
        # original and the model-generated table description.
        return _format_table_description(
            dataset_ref.dataset_id,
            dataset.description,
            table_ref.table_id,
            table_description,
            table.schema,
        )

    description = render(table.description)

    # An empty or blank description is as useless as a missing one.
    has_description = bool(table.description and table.description.strip())
    returned_table_description = table.description if has_description else "No description available"

    if GENERATE_TABLE_DESCRIPTIONS and not has_description:
        parameters = {
            "temperature": 0.9,  # Temperature controls the degree of randomness in token selection.
            "max_output_tokens": 1000,  # Token limit determines the maximum amount of text output.
            "top_p": 0.8,  # Tokens are selected from most probable to least until the sum of their probabilities equals the top_p value.
            "top_k": 40,  # A top_k of 1 means the selected token is the most probable among all tokens.
        }
        model = TextGenerationModel.from_pretrained("text-bison@002")
        response = model.predict(
            "Please give brief description of a table that describes contents and purpose of the table for table users. Please do not describe or list attributes of a table only table general description. This is table schema:" + description,
            **parameters,
        )
        print(f"Response from Model: {response.text}")

        # The model output can carry surrounding whitespace; a blank answer is
        # ignored so the placeholder is kept instead of an empty description.
        generated = (response.text or "").strip()
        if generated:
            description = render(generated)
            returned_table_description = generated

    return returned_table_description, description

### Assemble table descriptions
Call the function above for each table.


In [13]:

# Enrich every table_list entry in place with its short table description
# ('table_description') and the full text to embed ('description').
for entry in table_list:
    # DatasetReference is the non-deprecated equivalent of bqclient.dataset(...):
    # same project (the client's), same dataset id.
    dataset_ref = bigquery.DatasetReference(bqclient.project, entry['dataset'])
    table_ref = dataset_ref.table(entry['table'])
    # assemble_description fetches the dataset and table metadata itself, so
    # no extra get_dataset/get_table round trips are made here.
    entry['table_description'], entry['description'] = assemble_description(
        dataset_ref, table_ref, GENERATE_TABLE_DESCRIPTIONS
    )

Response from Model:  The table "cycle hire" in the dataset "london_bicycles" provides information about bicycle trips in London. Each row in the table represents a single bike trip. The table includes the following columns:

- rental_id: Unique identifier for each bike trip.
- duration: Duration of the bike trip in seconds.
- duration_ms: Duration of the bike trip in milliseconds.
- bike_id: Unique identifier for each bike.
- bike_model: Model of the bike used for the trip.
- end_date: Date and time when the bike trip ended.
- end_station_id: Unique identifier for the station where the bike trip ended.
- end_station_name: Name of the station where the bike trip ended.
- start_date: Date and time when the bike trip started.
- start_station_id: Unique identifier for the station where the bike trip started.
- start_station_name: Name of the station where the bike trip started.
- end_station_logical_terminal: Logical terminal for the station where the bike trip ended.
- start_station_logi

### Create embeddings from descriptions

In [14]:
import chromadb
from chromadb.utils import embedding_functions
import langchain
from langchain.embeddings import VertexAIEmbeddings
from vertexai.language_models import TextEmbeddingModel

ModuleNotFoundError: No module named 'langchain'

In [None]:

# Chroma client created with default settings.
# NOTE(review): the default client is presumably in-memory only, so the index
# has to be rebuilt in every kernel session — confirm.
chromaclient = chromadb.Client()
# Fetch the "my_tables" collection, creating it when it does not exist yet.
collection = chromaclient.get_or_create_collection("my_tables")


## Load embeddings into vector database

In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma
from langchain.embeddings import VertexAIEmbeddings


In [None]:

#VAIembeddings=VertexAIEmbeddings(model_name='textembedding-gecko@002')
#model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")


### We're not calling the embedding functions ourselves
We point ChromaDB at Vertex AI embeddings, so they are computed every time a new document is loaded and every time the database is queried.

In [None]:

# Embedding model handed to the vector store.
# For non-English text use model_name='textembedding-gecko-multilingual@001'.
vertex_embeddings = VertexAIEmbeddings(
    model_name='textembedding-gecko@001',
    task_type="SEMANTIC_SIMILARITY",
)

# LangChain vector store over the 'my_tables' collection of the Chroma client
# created earlier.
db = Chroma(
    client=chromaclient,
    collection_name='my_tables',
    embedding_function=vertex_embeddings,
)

In [None]:
from langchain.docstore.document import Document


# Store one document per table: the full description is the embedded text,
# while dataset, table and short description travel along as metadata.
# NOTE(review): ids use only the table name, so same-named tables from two
# datasets would share an id — confirm names stay unique in table_list.
for entry in table_list:
    table_metadata = {
        'dataset': entry['dataset'],
        'table': entry['table'],
        'table_description': entry['table_description'],
    }
    table_doc = Document(
        page_content=entry['description'],
        metadata=table_metadata,
        id=entry['table'],
    )
    db.add_documents(documents=[table_doc], ids=[entry['table']])

## Query the vector store

# Ad-hoc query against the vector store: up to 5 (document, score) hits.
# NOTE(review): `threshold` and `return_relevance_scores` do not look like
# documented arguments of this method (LangChain documents `score_threshold`);
# confirm whether they have any effect.
db.similarity_search_with_relevance_scores(query="yummy", k=5, threshold=0.5, return_relevance_scores=True)

In [None]:
# Keep the hits for "bicycles" so the next cell can print them.
# NOTE(review): `threshold` and `return_relevance_scores` do not look like
# documented arguments of this method (LangChain documents `score_threshold`);
# confirm whether they have any effect.
output=db.similarity_search_with_relevance_scores(query="bicycles", k=5, threshold=0.5, return_relevance_scores=True)

In [None]:
# Print each hit of the previous query on its own line.
for scored_document in output:
    print(scored_document)

(Document(page_content='Dataset name: \'fdic_banks\', Dataset description:\'None\', Table name:"locations", Table description:\' The table describes the various physical locations of banks insured by the FDIC. Each row represents a branch or main office of an FDIC-insured institution. The table includes information such as the FDIC certificate number, institution name, branch name, branch address, city, state, zip code, county, county FIPS code, state abbreviation, state name, institution class, Core Based Statistical Area (CBSA) code, CBSA name, CBSA division flag, CBSA division code, CBSA division name, CBSA metro flag, CBSA metro code, CBSA metro name, CBSA micro flag, Combined Statistical Area (CSA) flag, CSA code, CSA name, date established, FDIC UNINUM, last updated date, service type, and branch FDIC UNINUM.\'Schema attributes: column-name: fdic_certificate_number, column-type:"STRING" ,column-description: "A unique number assigned by the FDIC used to identify institutions and f

In [None]:
import gradio as gr



def search_items(query, table_fields=None):
    """Search the vector store and build updates for the result text fields.

    Args:
        query: Search phrase typed by the user. An empty or missing phrase is
            replaced by the default phrase "banks are bad".
        table_fields: Unused; optional so the function can be wired to a
            single Gradio input.

    Returns:
        A list with exactly one ``gr.Text`` update per component in
        ``result_tables_list``: for every hit a "dataset.table" value followed
        by its table description (both visible), then hidden, empty updates
        for the slots that have no hit.
    """
    if not query:
        query = "banks are bad"

    # NOTE(review): `threshold` and `return_relevance_scores` do not look like
    # documented arguments of this method (LangChain documents
    # `score_threshold`); kept unchanged — confirm whether they have any effect.
    results = db.similarity_search_with_relevance_scores(query=query, k=5, threshold=0.5, return_relevance_scores=True)

    # Each hit is a (document, score) pair; two text values are shown per hit.
    values = []
    for hit in results:
        metadata = hit[0].metadata
        values.append(metadata['dataset'] + "." + metadata['table'])
        values.append(metadata['table_description'])

    # Gradio needs one return value per output component: cut any surplus and
    # hide the slots that are left over so earlier results do not linger.
    slot_count = len(result_tables_list)
    update_show = [gr.Text(visible=True, value=value) for value in values[:slot_count]]
    update_show.extend(
        gr.Text(visible=False, value="") for _ in range(slot_count - len(update_show))
    )
    return update_show

# Output components filled by search_items, in display order:
# [table_0, description_0, table_1, description_1, ...].
result_tables_list = []


with gr.Blocks() as demo:
    search_phrase = gr.Textbox(label="Search phrase", placeholder="Banks are good")

    with gr.Column():
        # Five result rows, hidden until a search supplies values for them.
        for i in range(5):
            with gr.Row():
                table_field = gr.Text(show_label=False, visible=False)
                description_field = gr.Textbox(show_label=False, visible=False)
                result_tables_list.append(table_field)
                result_tables_list.append(description_field)

    # Search again whenever the text in the search box changes ...
    search_phrase.change(search_items, search_phrase, result_tables_list)

    # ... and also on an explicit button press. (A stray argument-less
    # greet_btn.click() call that registered nothing was removed.)
    greet_btn = gr.Button("Search")
    greet_btn.click(search_items, search_phrase, result_tables_list)


demo.launch()



Running on local URL:  http://127.0.0.1:7903

To create a public link, set `share=True` in `launch()`.




