In [13]:
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from dotenv import load_dotenv

# Load the environment variables from the specified .env file
dotenv_path = '/home/zihan/Desktop/ProfessorGPT/.env'
load_dotenv(dotenv_path)

# Load the OpenAI API key from the environment variable
api_key = os.getenv('OPENAI_API_KEY')
assert api_key is not None, "OpenAI API key not found in environment variables."

required_exts = [".txt"]

documents = SimpleDirectoryReader("/home/zihan/Desktop/ProfessorGPT/input",
                                    required_exts=required_exts,
                                    recursive=True).load_data()
print(f"Loaded {len(documents)} docs")

Loaded 2893 docs


In [108]:
os.environ["OPENAI_API_KEY"] = api_key

## Build this from documents to create the index
# index = VectorStoreIndex.from_documents(documents, chunk_size=30000, chunk_overlap=0)
# index.storage_context.persist() # Save the index to disk
# index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x79c0c7669550>

In [3]:
# ## To reload you can run 
from llama_index.core import StorageContext, load_index_from_storage

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")
# load index
index = load_index_from_storage(storage_context)

## This section below can virtualize the chunk size

In [None]:
# from llama_index.core.node_parser import JSONNodeParser
# from llama_index.core import Document

# document = Document(id_='url', text=documents[0].text)
# parser = JSONNodeParser()

# nodes = parser.get_nodes_from_documents([document])

In [None]:
# nodes.text

In [107]:
from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(chunk_size=30000, chunk_overlap=300)
nodes = text_splitter.get_nodes_from_documents(documents=documents)
len(nodes)

2900

## Single result test

In [110]:
retriever = index.as_retriever(similarity_top_k=5)

In [112]:
results = retriever.retrieve("""xiaotui liu nc state""")
for result in results:
    print("#" * 100)
    print(result.text)


####################################################################################################
University: North Carolina State University
URL: https://sites.google.com/ncsu.edu/xiaorui
Description: Skip to main content
Skip to navigation
Xiaorui Liu	
HomeGroupAwardResearchTeachingService
Dr. Xiaorui Liu

Assistant Professor 

Department of Computer Science

Department of Electrical and Computer Engineering (by courtesy)

College of Engineering 

North Carolina State University

Office: Room 2296, Engineering Building II

Email: xliu96 at ncsu dot edu

Research  |  Group  |  Award  |  Teaching  |  Service
About Me

Xiaorui Liu has been an Assistant Professor of Computer Science at North Carolina State University since August 2022. He received his Ph.D. degree in Computer Science from Michigan State University in 2022 under the supervision of Dr. Jiliang Tang. Before that, he received his Master's and Bachelor's degrees from South China University of Technology. He was awarded the

In [79]:
llm = OpenAI(model="gpt-4o-mini")
query_engine = index.as_query_engine(llm=llm, similarity_top_k=3)

In [66]:
from llama_index.core import PromptTemplate


new_summary_tmpl_str = (
    "You are a professional PhD Applicant Advisor, you are answer the question based only on the following context:\n. you are providing all correclated information to stuednt for which professors to work with."
    "{context_str}\n"
    "Answer the following question:"
    "Query: {query_str}\n"
    "Answer: "
)
new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

In [67]:
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": new_summary_tmpl}
)

prompts_dict = query_engine.get_prompts()
print(prompts_dict['response_synthesizer:text_qa_template'].template)

You are a professional PhD Applicant Advisor, you are answer the question based only on the following context:
. you are providing all correclated information to stuednt for which professors to work with.{context_str}
Answer the following question:Query: {query_str}
Answer: 


In [85]:
## Single Test
result = query_engine.query("""who is professor Yuankai Huo?""")

print(result.response)

There is no information provided about a professor named Yuankai Huo in the context.
