In [1]:
import pinecone

  from tqdm.autonotebook import tqdm


In [9]:
from sleepmate.config import *
from pathlib import Path
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFium2Loader as PDFLoader
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Pinecone

In [3]:
dir = Path(SLEEPMATE_DATADIR)
embeddings = OpenAIEmbeddings()

In [4]:
loader_map = {
    ".pdf": PDFLoader,
    ".txt": TextLoader,
}

In [5]:
pages = []

for file in dir.iterdir():
    loader_cls = loader_map.get(file.suffix)
    if loader_cls is None:
        print(f"skipping `{file}'")
        continue
    print(f"loading `{file}'")
    # the split part is important, otherwise we get similarity search
    # results that are too long for the model context window
    pages.extend(loader_cls(str(file)).load_and_split())

loading `/Users/cck197/Downloads/virtual_greg/Copy of 190823 Sleep for athletes (GDMP).pdf'
loading `/Users/cck197/Downloads/virtual_greg/190823 Sleep-onset insomnia (GDMP).pdf'
loading `/Users/cck197/Downloads/virtual_greg/Circadian_pod_2_final.txt'
loading `/Users/cck197/Downloads/virtual_greg/Mike_Tipton.txt'
loading `/Users/cck197/Downloads/virtual_greg/Final_episode_final.txt'
skipping `/Users/cck197/Downloads/virtual_greg/.langchain.db'
loading `/Users/cck197/Downloads/virtual_greg/Principles of Resilient Nutrition_v5.pdf'
loading `/Users/cck197/Downloads/virtual_greg/Revised_circadian_episode.txt'
skipping `/Users/cck197/Downloads/virtual_greg/chroma_db'
loading `/Users/cck197/Downloads/virtual_greg/190818 Having trouble sleeping_ (GDMP).pdf'
loading `/Users/cck197/Downloads/virtual_greg/190823 Sleep-maintenance insomnia (GDMP).pdf'


In [6]:
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENVIRONMENT"),  # next to api key in console
)

In [7]:
index_name = "sleepmate"

In [11]:
# First, check if our index already exists. If it doesn't, we create it
if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(name=index_name, metric="cosine", dimension=1536)
# The OpenAI embedding model `text-embedding-ada-002 uses 1536 dimensions`
docsearch = Pinecone.from_documents(pages, embeddings, index_name=index_name)

In [12]:
docs = docsearch.similarity_search("melatonin")

In [13]:
docs

[Document(page_content="linked to your eyes\nAnd the key pathway\nlinking your eyes to your central clock is via a specialized type of cell in the eye called intrinsically\nphotosensitive\nretinal ganglion cells. These\ncells will strongly affect many of the so called non visual responses to light. It's the cells that are important to rapidly reducing melatonin production to shifting the time of your body's clock. They also influence brain regions that are involved in things like mood and cognition. But for the purpose of today,\nthere are a couple of things to consider. And one of them is that these cells respond differently to different wavelengths of light. As humans, we can see wavelengths between three hundred and eighty and seven hundred and eighty nanometers.\nLight in the lowest end of this range looks pilots.\nLight in the highest end of this range red. Other wavelengths outside of this range do affect us, for example, UV light, which is shorter than the lowest of those wavele

In [15]:
Pinecone??

[0;31mInit signature:[0m
[0mPinecone[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0;34m'Any'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding[0m[0;34m:[0m [0;34m'Union[Embeddings, Callable]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_key[0m[0;34m:[0m [0;34m'str'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnamespace[0m[0;34m:[0m [0;34m'Optional[str]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdistance_strategy[0m[0;34m:[0m [0;34m'Optional[DistanceStrategy]'[0m [0;34m=[0m [0;34m<[0m[0mDistanceStrategy[0m[0;34m.[0m[0mCOSINE[0m[0;34m:[0m [0;34m'COSINE'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mPinecone[0m[0;34m([0m[0mVectorStore[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""`Pinecone` vector store.[0m
[0;34m[0m
[0;34m    To use, you should have the ``pinecone-client

In [18]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [19]:
docsearch.similarity_search("melatonin")

[Document(page_content="linked to your eyes\nAnd the key pathway\nlinking your eyes to your central clock is via a specialized type of cell in the eye called intrinsically\nphotosensitive\nretinal ganglion cells. These\ncells will strongly affect many of the so called non visual responses to light. It's the cells that are important to rapidly reducing melatonin production to shifting the time of your body's clock. They also influence brain regions that are involved in things like mood and cognition. But for the purpose of today,\nthere are a couple of things to consider. And one of them is that these cells respond differently to different wavelengths of light. As humans, we can see wavelengths between three hundred and eighty and seven hundred and eighty nanometers.\nLight in the lowest end of this range looks pilots.\nLight in the highest end of this range red. Other wavelengths outside of this range do affect us, for example, UV light, which is shorter than the lowest of those wavele