# Basic Setup

In [28]:
import os, json
import sys, logging
from pprint import pprint

import openai
from llama_index import SimpleDirectoryReader, ServiceContext, KnowledgeGraphIndex, VectorStoreIndex, load_index_from_storage
from llama_index.graph_stores import SimpleGraphStore
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import OpenAI

# Ask the openai client for info-level logging.
openai.log = "info"

# Send INFO-and-above log records to stdout so they appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
)

In [2]:
# All paths are resolved relative to the parent of the current working directory.
MAIN_DIR = ".."
DATA_DIR = os.path.join(MAIN_DIR, "data", "document_store", "uc")
SAMPLE_DIR = os.path.join(MAIN_DIR, "data", "examples")

# Read the API keys from a JSON file instead of hard-coding them in the notebook.
# The encoding is explicit so the result does not depend on the platform default.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r", encoding="utf-8") as f:
    api_keys = json.load(f)

# Expose the key both through the environment and directly on the openai module.
os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]
openai.api_key = api_keys["OPENAI_API_KEY"]

# One query per line. Strip the trailing newline that readlines() would keep and
# drop blank lines so no empty query is sent downstream.
with open(os.path.join(MAIN_DIR, "data", "queries", "uc_all.txt"), "r", encoding="utf-8") as f:
    test_cases = [line.strip() for line in f if line.strip()]

# Vector Database

In [22]:
# Chat model used for answer synthesis: temperature=0 for low-variance output,
# completions capped at 512 tokens.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=512)

# Pass the LLM explicitly. Without `llm=llm` the service context falls back to the
# library's default model and the settings above are never used.
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=1000, chunk_overlap=200)
storage_context = StorageContext.from_defaults()

In [29]:
# Load the documents under SAMPLE_DIR.
sample_reader = SimpleDirectoryReader(SAMPLE_DIR)
documents = sample_reader.load_data()
print("Number of documents:", len(documents))

# Build the vector index using the shared service context.
vector_index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)


Number of documents: 1


In [42]:
# Retriever limited to the 3 top-scoring nodes per query.
retrieval_engine = vector_index.as_retriever(similarity_top_k=3)

# Retrieval only: returns the scored nodes.
relevant_docs = retrieval_engine.retrieve(
    str_or_query_bundle="What did the author do growing up?"
)
relevant_docs

[NodeWithScore(node=TextNode(id_='7468d9c5-2dce-4d59-9013-3b4ee65d1e1f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c35091c3-4831-48dc-8783-a9acd1f1e6eb', node_type=None, metadata={}, hash='4c702b4df575421e1d1af4b1fd50511b226e0c9863dbfffeccb8b689b8448f35'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c3f4395c-8832-437e-837c-f37cc1a86fc5', node_type=None, metadata={}, hash='2b076feb45ef9318654997e47fb3b2f2185bcc7e39e0997045fcf755cb2ca6dc')}, hash='004bbb820e2627b96db9a79d93bb1b8a665582ada07e261a6085d2d0b0119ad6', text='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings,

In [44]:
# Query engine over the vector index, using the 3 most similar nodes.
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,
)
response = query_engine.query(
    "What did the author do growing up?"
)
print(response)

The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on the IBM 1401 computer. They also mentioned working with microcomputers, specifically a TRS-80, where they wrote simple games and a word processor.


# Graph Database

In [37]:
# Storage context backed by a fresh SimpleGraphStore.
# This rebinds `storage_context`.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)

# NOTE: can take a while!
# At most 2 triplets are extracted per chunk.
index = KnowledgeGraphIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context,
    max_triplets_per_chunk=2,
)

(I, worked on, writing)
(I, worked on, programming)
(IBM 1401, used for, data processing)
(I, tried writing, programs)
(I, tried writing, on the IBM 1401)
(IBM 1401, happened to be, in the basement)
(I, got permission to use, IBM 1401)
(IBM 1401, was like, a mini Bond villain's lair)
(machines, sitting up on, raised floor)
(language, used, an early version of Fortran)
(programs, typed on, punch cards)
(programs, loaded into, memory)
(result, would ordinarily be, to print something)
(I, was puzzled by, 1401)
(I, couldn't figure out, what to do with it)
(form of input, to programs, stored on punched cards)
(I, didn't have, any data stored on punched cards)
(option, was to do, things that didn't rely on any input)
(I, didn't know enough math, to do anything interesting)
(I, can't remember, any programs I wrote)
(one of mine, didn't terminate)
(machine, without time-sharing, was a social as well as a technical error)
(data center manager's expression, made clear)
(Heathkit, sold as, kit)
(

In [39]:
# Query the index with include_text=False and tree_summarize response mode.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    include_text=False,
)
response = query_engine.query("Tell me more about Interleaf")



In [41]:
# Pretty-print the text of the most recent `response`.
# `pprint` is imported with the other dependencies in the first cell.
pprint(response.response)

('Interleaf was a company that developed software for creating documents. They '
 'also added a scripting language to their software. However, Interleaf faced '
 'challenges and eventually struggled due to the rapid advancement of '
 'technology, often referred to as "Moore\'s Law."')


In [45]:
# Vector index built with library defaults (no service_context is passed).
# VectorStoreIndex is already imported in the first cell, so it is not re-imported here.
# NOTE: this cell rebinds `index`, `query_engine` and `response`, replacing the
# objects created in the graph section.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)

The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on an IBM 1401 computer using an early version of Fortran. They later got a microcomputer and started programming on it, writing simple games and a word processor. They also mentioned their interest in philosophy and AI.


In [44]:
# Display the repr of whatever object is currently bound to `index`.
index

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x7ff6c8b3e440>