<a href="https://colab.research.google.com/github/dollabillgates/Wolfram_Physics_Project_GPT_Langchain/blob/main/WolframLangchain_cli_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain
!pip install openai
!pip install unstructured
!pip install faiss-cpu

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
import os
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import ChatVectorDBChain

In [None]:
# Global configurations
# Keep an API key that is already exported (shell, Colab secrets, CI) instead
# of overwriting it with an empty placeholder. Prefer exporting the key over
# pasting it into the notebook; if you do paste one into the second argument
# for a throwaway run, never commit it.
os.environ.setdefault("OPENAI_API_KEY", "")

CHUNK_SIZE = 500     # chunk_size handed to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 100  # chunk_overlap handed to RecursiveCharacterTextSplitter
TOP_K_DOCS = 10      # value assigned to qa_chain.top_k_docs_for_context
TEMPERATURE = 0.7    # temperature handed to ChatOpenAI

# Fail fast on an inconsistent splitter configuration, before any file is
# read or any embedding request is made.
assert 0 <= CHUNK_OVERLAP < CHUNK_SIZE, "CHUNK_OVERLAP must be smaller than CHUNK_SIZE"

In [None]:
# Load data: every non-blank line of the text file becomes its own Document,
# so the chunks produced by the splitter below never span a line break.
raw_documents = []
# Explicit encoding so the result does not depend on the platform default.
with open("WPPALLDATA.txt", "r", encoding="utf-8") as f:
    for line in f:
        text = line.strip()
        if text:  # blank lines carry no content worth embedding
            raw_documents.append(Document(page_content=text))

# Split text: one call over the whole list; documents are still split one by
# one, in order, exactly as a manual per-document loop would do.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
documents = text_splitter.split_documents(raw_documents)
if not documents:
    # Stop here with a clear message instead of failing inside the index build.
    raise ValueError("WPPALLDATA.txt produced no text chunks; nothing to index")

# Load data to vectorstore (this embeds every chunk via OpenAIEmbeddings)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Save vectorstore so the chat cell can reload it without re-embedding
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

# Human-readable dump of the chunks. Opened with "w" (not "a") so a re-run
# replaces the previous dump instead of appending a duplicate copy, keeping it
# in step with vectorstore.pkl, which is also overwritten on every run.
with open("vectorstore.txt", "w", encoding="utf-8") as f:
    f.write(str(documents))

In [None]:
# Prompts and Model

# Prompt for the question-condensing step: it is filled with the running chat
# history and the newest user input, and ends by asking for a standalone question.
_template = """You are Stephen Wolfram. Given the following conversation and a follow up question, try to ignore the conversation and answer the question on its own.
Only if the question does not make sense on its own, use the conversation to rephrase the follow up question to be a standalone question.
Do not repeat statements you made previously in the conversation.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate(
    template=_template,
    input_variables=["chat_history", "question"],
)

# Prompt for the answering step: it is filled with the question and the
# extracted document parts ({context}) and asks for a Markdown answer.
template = """You are Stephen Wolfram. You are given the following extracted parts of a long document and a question. Provide a complete, complex, and detailed answer.
Give technical and precise definitions whenever possible, about all related concepts, even when not explicitly asked. Do not repeat yourself.
Do not repeat phrases, even if they're repeated in the extracted parts of the document provided.
Ignore extracted parts of the document which are repetitive. Include as much different unique information from the extracted parts of the document as you can in your answer.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
QA_PROMPT = PromptTemplate(
    template=template,
    input_variables=["question", "context"],
)


def get_chain(vectorstore):
    """Build the conversational question-answering chain for a vector store.

    Args:
        vectorstore: The store handed to ``ChatVectorDBChain.from_llm`` as the
            source of context documents.

    Returns:
        The chain produced by ``ChatVectorDBChain.from_llm``, configured with
        the module-level ``QA_PROMPT`` and ``CONDENSE_QUESTION_PROMPT`` and
        with source documents excluded from its output.
    """
    chain_options = {
        "qa_prompt": QA_PROMPT,
        "condense_question_prompt": CONDENSE_QUESTION_PROMPT,
        "return_source_documents": False,
    }
    chat_model = ChatOpenAI(temperature=TEMPERATURE, model_name="gpt-3.5-turbo")
    return ChatVectorDBChain.from_llm(chat_model, vectorstore, **chain_options)

In [None]:
# Run the code: interactive question/answer loop over the saved vector store.
if __name__ == "__main__":
    # SECURITY: unpickling can execute arbitrary code. Only load a
    # vectorstore.pkl that this notebook produced itself.
    with open("vectorstore.pkl", "rb") as f:
        vectorstore = pickle.load(f)
    qa_chain = get_chain(vectorstore)
    qa_chain.top_k_docs_for_context = TOP_K_DOCS
    # (question, answer) pairs passed back to the chain on every turn.
    chat_history = []
    print("Ask me questions about the Wolfram Physics Project!")
    print('(Type "exit" or "quit", or interrupt the cell, to stop.)')
    while True:
        print("Human:")
        try:
            question = input().strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            break
        if not question:
            # Nothing was typed; do not spend an API call on an empty query.
            continue
        if question.lower() in {"exit", "quit"}:
            break
        result = qa_chain({"question": question, "chat_history": chat_history})
        answer = result["answer"]
        chat_history.append((question, answer))
        print("AI:")
        print(answer)