In [6]:
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    SQLDatabase,
    download_loader
)
from llama_index.vector_stores import WeaviateVectorStore

import weaviate
import openai

In [13]:
import os

from dotenv import find_dotenv, load_dotenv

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# The original lookup used the misspelled name "OPENA_AI_KEY", which leaves
# openai.api_key set to None unless the .env file repeats the same typo.
# Read the standard OPENAI_API_KEY first and fall back to the legacy
# spelling so an existing .env file written with the old name keeps working.
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENA_AI_KEY")

In [None]:

# Connect through Weaviate's embedded options (default settings) rather than
# pointing the client at a URL.
client = weaviate.Client(embedded_options=weaviate.embedded.EmbeddedOptions())

## Create Schema

In [None]:
# Schema with a single "Podcast" class holding one text property.
podcast_schema = {
    "classes": [
        {
            "class": "Podcast",
            "description": "Weaviate podcast",
            "vectorizer": "text2vec-openai",
            "properties": [
                {
                    "name": "Content",
                    "dataType": ["text"],
                    "description": "Content from the podcasts.",
                },
            ],
        },
    ],
}

# Register the schema with the Weaviate instance and confirm on stdout.
client.schema.create(podcast_schema)
print("Podcast schema was created.")

## Load in Data

In [10]:
from llama_index import download_loader

# Fetch the YouTube transcript loader class by name.
YouTubeTranscriptReader = download_loader("YoutubeTranscriptReader")

# Videos whose transcripts are loaded as the podcast documents.
podcast_links = [
    "https://www.youtube.com/watch?v=xk28RMhRy1U",
    "https://www.youtube.com/watch?v=Du6IphCcCec",
    "https://www.youtube.com/watch?v=Q7f2JeuMN7E",
    "https://www.youtube.com/watch?v=nSCUk5pHXlo",
]

loader = YouTubeTranscriptReader()
podcasts = loader.load_data(ytlinks=podcast_links)
     


  for item in lines:


Collecting youtube_transcript_api~=0.5.0 (from -r /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/llama_index/download/llamahub_modules/requirements.txt (line 1))
  Downloading youtube_transcript_api-0.5.0-py3-none-any.whl (23 kB)
Installing collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-0.5.0


  "  youtube.com/watch?v=\{video_id\} "
  "  youtube.com/embed?v=\{video_id\} "
  "  youtu.be/{video_id\} (never includes www subdomain)"


## Build Weaviate Index

In [16]:
# Vector store on top of the existing Weaviate client.
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    class_prefix="Podcasts_index",
)

# Storage context that routes index data into that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the loaded podcast transcripts.
podcast_index = VectorStoreIndex.from_documents(
    podcasts,
    storage_context=storage_context,
)


{"level":"info","msg":"Created shard podcasts_index_node_gpp1dwwWJZH8 in 1.837564ms","time":"2024-01-28T01:01:17Z"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-01-28T01:01:17Z","took":103196}


## Create SQL Table


In [17]:
from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
    select,
    column,
)

# Metadata container that table definitions are registered on.
metadata_obj = MetaData()

# In-memory SQLite engine, created with the `future=True` flag.
engine = create_engine(
    "sqlite:///:memory:",
    future=True,
)

In [18]:
table_name = "podcast_stats"

# Column layout: the podcast title is the primary key, followed by two
# integer statistics.
podcast_stats_columns = (
    Column("podcast_title", String(16), primary_key=True),
    Column("views", Integer),
    Column("duration", Integer),
)

podcast_stats_table = Table(table_name, metadata_obj, *podcast_stats_columns)

# Emit CREATE TABLE for everything registered on the metadata object.
metadata_obj.create_all(engine)

In [19]:

# Sanity check: show the table names registered on the metadata object.
metadata_obj.tables.keys()


dict_keys(['podcast_stats'])

In [20]:
from sqlalchemy import insert

# Seed data: one row per release podcast, matching the podcast_stats columns.
rows = [
    {"podcast_title": "Weaviate 1.20", "views": 328, "duration": 65},
    {"podcast_title": "Weaviate 1.19", "views": 280, "duration": 27},
    {"podcast_title": "Weaviate 1.18", "views": 428, "duration": 65},
    {"podcast_title": "Weaviate 1.17", "views": 257, "duration": 43},
]

# Insert all rows in a single transaction instead of opening a connection and
# committing once per row. `engine.begin()` commits when the block exits
# normally and rolls back if an exception is raised, so a failure cannot leave
# the table partially populated. Passing the list of dicts to `execute` runs
# the INSERT once per parameter set.
with engine.begin() as connection:
    connection.execute(insert(podcast_stats_table), rows)
     


## Create SQL Table in LlamaIndex

In [21]:
# Wrap the SQLAlchemy engine, exposing only the podcast_stats table.
sql_database = SQLDatabase(
    engine,
    include_tables=["podcast_stats"],
)

In [22]:
from llama_index.indices.struct_store.sql_query import NLSQLTableQueryEngine

In [23]:
# Natural-language-to-SQL query engine limited to the podcast_stats table.
sql_query_engine = NLSQLTableQueryEngine(sql_database=sql_database, tables=["podcast_stats"])

## Build Query Engine

In [24]:

# Query engine over the podcast index, created with default arguments.
vector_query_engine = podcast_index.as_query_engine()

## Tell LlamaIndex about the Tools

In [25]:
from llama_index.tools.query_engine import QueryEngineTool

# Each tool pairs a query engine with a natural-language description of the
# kind of question it is meant to answer.
sql_tool_description = (
    "Useful for translating a natural language query into a SQL query over a table containing: "
    "podcast_stats, containing the views/duration of each podcast"
)
vector_tool_description = "Useful for answering semantic questions about Weaviate release podcasts"

sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=sql_tool_description,
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=vector_tool_description,
)

In [27]:

from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector

# Router over the SQL tool and the vector tool, using an LLM single selector
# built with default settings.
tool_selector = LLMSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=tool_selector,
    query_engine_tools=[sql_tool, vector_tool],
)

## Query

In [28]:

# Question about view counts, sent through the router query engine.
question = "Which release podcast had the most views?"
response = query_engine.query(question)
print(response)


The podcast episode titled "Weaviate 1.18" had the most views, with a total of 428 views.


In [31]:
# Question about podcast content, sent through the router query engine.
question = "Tell me about a new feature in Weaviate"
response = query_engine.query(question)
print(response)

A new feature in Weaviate is multi-tenancy. This feature allows users to separate and isolate their data from other users within the application. For example, if you have an app that allows you to index documents from your hard drive, you can use multi-tenancy to ensure that only you can search through your documents and not other users. This feature is particularly useful in the context of Vector search, as it helps to limit the vector space and improve search efficiency. Without multi-tenancy, the graph containing billions of vectors would need to be cut into smaller chunks for each user, resulting in potential disconnections or inefficient traversal. Multi-tenancy in Weaviate addresses this technical requirement and provides a solution for efficient and isolated data management.


{"level":"info","msg":"Created shard blogpost_aoHZA7ipG29I in 1.177625ms","time":"2024-01-28T01:28:38Z"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-01-28T01:28:38Z","took":96751}
