# Building a simple chatbot using LlamaIndex

## Import libraries, API and set filepath

In [1]:
# pip install llama-index==0.8.12 pypdf sentence-transformers ragas openai
import os
# SECURITY: never hard-code an API key in a notebook. Anything saved in this file
# is visible to everyone who can read it; if a real key was ever committed in
# this cell, treat it as leaked and revoke it.
# The key is taken from the OPENAI_API_KEY environment variable; only when that
# variable is missing or empty is the user prompted for it (input is not echoed).
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

from llama_index import Document, GPTVectorStoreIndex, ServiceContext
from llama_index.readers import BeautifulSoupWebReader, SimpleDirectoryReader
from llama_index.llms import OpenAI
from llama_index.evaluation import DatasetGenerator

import openai

In [2]:
# The input files are expected in a "data" folder inside the directory
# the notebook kernel is currently running in.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")

## Load the data

According to [LlamaIndex's documentation](https://gpt-index.readthedocs.io/en/latest/examples/data_connectors/simple_directory_reader.html), `SimpleDirectoryReader` is the most commonly used data connector and works out of the box. Simply pass in an input directory or a list of files, and it will select the best file reader for each file based on its extension.

In this use case, the data consists of CSV files of TripAdvisor bar reviews, which are not included in gpt-3.5-turbo's pretraining data (which only extends up to Sep 2021).

In [3]:
def filename_fn(filename):
    """Build the metadata dict for one file: its name under the 'file_name' key."""
    return {'file_name': filename}


# Load every non-hidden file found in data_dir, passing filename_fn as the
# reader's file_metadata callback.
pdfhtml_docs = SimpleDirectoryReader(
    input_dir=data_dir,
    exclude_hidden=True,
    file_metadata=filename_fn,
).load_data()

# Quick sanity check: show the id of each loaded document and the total count.
print([doc.doc_id for doc in pdfhtml_docs])
print(f"Loaded {len(pdfhtml_docs)} docs")

['0534b7ca-0740-4a2d-9145-deddbd0fa2ee', 'be551dc2-73f3-4356-bc5a-199fb78a1f5b']
Loaded 2 docs


## Build index

With all the data loaded, we can construct the index for the chatbot. There are 4 types of indexing: Summary Index, VectorStore Index, Tree Index and Keyword Table Index. Here we use the VectorStore Index, which is one of the most common types.

In [5]:
# Hand the key stored in the environment to the openai client library.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Documents to be indexed (the ones loaded from the data directory).
docs = pdfhtml_docs

# The service context carries the LLM configuration used when building the index.
# More on service contexts:
# https://gpt-index.readthedocs.io/en/latest/core_modules/supporting_modules/service_context.html
service_context = ServiceContext.from_defaults(
    # temperature controls the randomness of the output; 0 is the least random setting
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0),
)

# Build the vector store index over the documents.
index = GPTVectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
)

In [6]:
# Save the index to disk as a vector store so that it can be reloaded later
# instead of running the embedding model above again.
#
# The target folder is derived from data_dir rather than a hard-coded
# "./data/..." string, so it stays next to the input data even if the
# kernel's working directory changes after data_dir was computed.
index_persist_dir = os.path.join(data_dir, "index.vecstore")
index.storage_context.persist(persist_dir=index_persist_dir)
