In [20]:
from dotenv import load_dotenv
import os


In [21]:
from llama_index import SimpleDirectoryReader  # loads docs from directory
from llama_index.node_parser import SimpleNodeParser
from llama_index.llms import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index import (
    download_loader,
    ServiceContext,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores import PineconeVectorStore
from pinecone import Pinecone

In [22]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# The `Pinecone` client class is configured with the API key alone. The
# `environment` setting belonged to the older `pinecone.init(...)` API and is
# not used by this client, so PINECONE_ENVIRONMENT no longer has to be set.
# A missing PINECONE_API_KEY still fails immediately with a KeyError.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])


In [23]:
# Fetch the UnstructuredReader loader class from LlamaHub.
UnstructuredReader = download_loader('UnstructuredReader')

# Build the reader once and share it between extensions. The recorded output
# of this cell showed the NLTK resource check running twice, which appears to
# be once per UnstructuredReader() construction, so a single shared instance
# avoids repeating that setup.
unstructured_reader = UnstructuredReader()

# Route every .txt and .pdf file under ./data through the shared reader.
dir_reader = SimpleDirectoryReader(
    './data',
    file_extractor={
        ".txt": unstructured_reader,
        ".pdf": unstructured_reader,
    },
)
documents = dir_reader.load_data(show_progress=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshayranganath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/akshayranganath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshayranganath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/akshayranganath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Loading files: 100%|██████████| 46/46 [00:34<00:00,  1.34file/s]


In [24]:
# Configure how documents are split into chunks (nodes).
node_parser = SimpleNodeParser.from_defaults(
    chunk_size=512,
    chunk_overlap=20,
)
# Splitting explicitly is not necessary, since LlamaIndex does it on its own:
#nodes = node_parser.get_nodes_from_documents(documents=documents)

In [25]:
# Embedding model; texts are sent for embedding in batches of 100.
embed_model = OpenAIEmbedding(
    model='text-embedding-ada-002',
    embed_batch_size=100,
)

# Chat model, with temperature fixed at 0.
llm = OpenAI(
    model='gpt-3.5-turbo',
    temperature=0,
)

In [26]:
# Bundle the LLM, embedding model and node parser into one service context.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser, embed_model=embed_model, llm=llm
)

In [27]:
# Name of the Pinecone index to write to, and a client handle for it.
index_name = "llamaindex-helper"
pinecone_index = pinecone.Index(index_name)

In [28]:
# Wrap the Pinecone index handle in the LlamaIndex vector-store class.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
)

In [29]:
# Storage context that uses the Pinecone-backed vector store.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [30]:
# Build the index from the loaded documents. With show_progress=True the
# recorded run reported three stages: parsing nodes, generating embeddings,
# and upserting vectors.
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)
print('Ingestion completed')

Parsing nodes:   0%|          | 0/46 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1848 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/1848 [00:00<?, ?it/s]

Ingestion completed
