# NYC Wikipedia Embeddings Demo

In [None]:
import logging
import sys

# Route log records to stdout so they show up in the notebook output.
# basicConfig already attaches one stdout handler to the root logger, so no
# extra StreamHandler is added (a second one would print every record twice).
# force=True replaces handlers left over from a previous run of this cell,
# which keeps the cell idempotent when it is re-executed.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

This notebook demonstrates embedding-based querying with `GPTTreeIndex` and `GPTListIndex`.

### Setup + Data Prep

In [None]:
import logging
import sys

# Switch the root logger to DEBUG, writing to stdout.
# A plain basicConfig call does nothing once the root logger already has
# handlers, so the DEBUG level would silently never take effect; force=True
# drops the existing handlers and reconfigures from scratch.
# No extra StreamHandler is added afterwards: basicConfig already installs
# one, and a second would duplicate every log line.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

In [None]:
# fetch "New York City" page from Wikipedia
from pathlib import Path

import requests

# Ask the MediaWiki API for the plain-text extract of the article.
# The timeout keeps the cell from hanging forever on a stalled connection.
http_response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'New York City',
        'prop': 'extracts',
        # 'exintro': True,
        'explaintext': True,
    },
    timeout=30,
)
# Surface HTTP errors here instead of as a JSON decode error or KeyError below.
http_response.raise_for_status()
response = http_response.json()

# The API keys results by page id; only one title was requested, so take the
# first (only) entry.
page = next(iter(response['query']['pages'].values()))
nyc_text = page['extract']

# Create the output directory if needed; exist_ok makes re-running the cell safe.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

# Write with an explicit encoding: the article contains non-ASCII characters
# and the platform default encoding may not be able to represent them.
with open('data/nyc_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(nyc_text)

In [None]:
# Put the OpenAI credential into this process's environment.
# The value below is a placeholder: replace it with a real key before running
# the cells that follow.
import os

OPENAI_KEY_VALUE = "INSERT OPENAI KEY"
os.environ['OPENAI_API_KEY'] = OPENAI_KEY_VALUE

### GPTTreeIndex - Embedding-based Query

In [None]:
from llama_index import GPTTreeIndex, SimpleDirectoryReader
from IPython.display import Markdown

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
# Load documents from the `data` directory with SimpleDirectoryReader,
# then build a GPTTreeIndex from them.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTTreeIndex.from_documents(documents)

In [None]:
# Create a query engine on the index using the "embedding" retriever mode,
# then ask the first question.
# (Set logging to DEBUG for more detailed outputs.)
query_engine = index.as_query_engine(retriever_mode="embedding")
response = query_engine.query(
    "What is the name of the professional women's basketball team in New York City?"
)

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

In [None]:
# Ask the same query engine a second question.
response = query_engine.query("What battles took place in New York City in the American Revolution?")

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

In [None]:
# Third question for the same query engine.
# (Set logging to DEBUG for more detailed outputs.)
response = query_engine.query(
    "What are the airports in New York City?"
)

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

### GPTListIndex - Embedding-based Query

In [None]:
from llama_index import GPTListIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [None]:
# Load documents from the `data` directory and build a GPTListIndex from them.
# This rebinds the names `documents` and `index`.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTListIndex.from_documents(documents)

In [None]:
# Create a query engine on the list index using the "embedding" retriever mode,
# then ask the first question.
# (Set logging to DEBUG for more detailed outputs.)
query_engine = index.as_query_engine(
    retriever_mode="embedding",
)
response = query_engine.query(
    "What is the name of the professional women's basketball team in New York City?"
)

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

In [None]:
# Ask the list-index query engine a second question.
# The engine was already created with retriever_mode="embedding"; query()
# takes only the question, so the mode is not passed again here (doing so
# raises a TypeError for the unexpected keyword argument).
# set Logging to DEBUG for more detailed outputs
response = query_engine.query("What battles took place in New York City in the American Revolution?")

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

In [None]:
# Ask the list-index query engine a third question.
# The engine was already created with retriever_mode="embedding"; query()
# takes only the question, so the mode is not passed again here (doing so
# raises a TypeError for the unexpected keyword argument).
# set Logging to DEBUG for more detailed outputs
response = query_engine.query("What are the airports in New York City?")

In [None]:
# Render the answer in bold as Markdown.
bold_answer = Markdown(f"<b>{response}</b>")
display(bold_answer)

## Try out other embeddings! 
(courtesy of LangChain)

In [None]:
from llama_index import GPTListIndex, SimpleDirectoryReader, ServiceContext
from IPython.display import Markdown

In [None]:
# load in HF embedding model from langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding

# Instantiate LangChain's HuggingFaceEmbeddings with its default arguments and
# wrap it in LangchainEmbedding to get the `embed_model` used below.
hf_embeddings = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hf_embeddings)

In [None]:
# Build a service context that carries the HuggingFace embedding model.
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# NOTE(review): `index` is reused from an earlier cell rather than rebuilt with
# this service context — confirm the retriever actually embeds with `embed_model`.
# (Set logging to DEBUG for more detailed outputs.)
query_engine = index.as_query_engine(retriever_mode="embedding", service_context=service_context)
response = query_engine.query("What is the name of the professional women's basketball team in New York City?")

In [None]:
# Bare last expression: shows the response object as this cell's output.
response