Importing Required Packages

In [86]:
import os.path
import os

from langchain.document_loaders import UnstructuredPDFLoader, WebBaseLoader, AsyncChromiumLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough
import pickle


In [56]:
# OpenAI API key used by the chat model and the embedding model below.
# It is read from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (or committed with) the notebook.
# Go to https://platform.openai.com/account/api-keys to create a new API key.
# Falls back to an empty string when the variable is not set.
KEY = os.environ.get("OPENAI_API_KEY", "")

In [70]:
# Chat model that answers the questions; temperature=0 is passed explicitly.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
    openai_api_key=KEY,
)

In [72]:
# Generate the list of catalog URLs to scrape (this is a partial list of departments).
codeList = [
    "aasp", "aast", "abrm", "agnr", "agst", "amsc", "amst", "ansc", "anth", "aosc", "arab", "arch", "arec", "arhu", "arhx", "army", "arsc", "arth", "artt", "artx", "arux", "astr", "bchm", "bioe", "biom", "bmgt", "bsci", "bscv", "bsgc", "bsos", "bsst", "ccjs", "chbe", "chem", "chin", "chse", "cine", "cinx", "clas", "clax", "cmlt", "cmlx", "cmsc", "comm", "comx", "cpbe", "cpet", "cpgh", "cpjt", "cpms", "cppl", "cpsa", "cpsd", "cpsf", "cpsg", "cpsn", "cpsp", "cpss", "eall", "econ", "edci", "edcp", "edhd", "edhi", "edms", "edps", "edsp", "educ", "enae", "enbc", "ence", "eneb", "enee", "enes", "enfp", "engl", "engx", "enma", "enme", "ennu", "enre", "ensp", "enst", "epib", "fgsm", "fire", "fmsc", "fren", "gems", "geog", "geol", "germ", "gers", "grek", "gvpt", "hacs", "hdcc", "hebr", "heip", "hesi", "hesp", "hglo", "hhum", "hisp", "hist", "hisx", "hlsa", "hlsc", "hlth", "hnuh", "honr", "idea", "imdm", "immr", "inag", "inst", "isrl", "ital", "itax", "ivsp", "japn", "jour", "jwst", "knes", "kora", "lacs", "larc", "lasx", "latn", "lbsc", "lead", "lgbt", "lgbx", "ling", "math", "mees", "mieh", "mith", "mlaw", "mlsc", "mued", "muet", "musc", "musp", "navy", "neur", "nfsc", "peer", "pers", "phil", "phix", "phpe", "phpx", "phsc", "phys", "plcy", "plsc", "port", "psyc", "rdev", "rels", "russ", "slaa", "sllc", "sllx", "smlp", "socy", "span", "spax", "sphl", "stat", "surv", "tdps", "thet", "thex", "tlpl", "tltc", "umei", "univ", "ursp", "uslt", "weid", "wgss", "wmsx"
]
# "cmsc" is already in the list above, so it is not appended a second time:
# a duplicate code would make the same page be scraped and indexed twice.
baseURL = "https://academiccatalog.umd.edu/undergraduate/approved-courses/"
# dict.fromkeys drops any repeated department code while keeping the original order,
# so every URL appears exactly once.
urlList = [baseURL + dept + "/" for dept in dict.fromkeys(codeList)]
print(urlList)

['https://academiccatalog.umd.edu/undergraduate/approved-courses/aasp/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/aast/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/abrm/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/agnr/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/agst/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/amsc/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/amst/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/ansc/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/anth/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/aosc/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/arab/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/arch/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses/arec/', 'https://academiccatalog.umd.edu/undergraduate/approved-courses

In [88]:
# Path of the pickle file used to cache the split documents between runs.
PickleFilePath = "all_splits.p"

In [92]:
# Scraping and loading data.
# When a cached pickle of the split documents exists, load it and skip the
# scrape; otherwise download every catalog page in urlList.
if os.path.exists(PickleFilePath):
    print("Loading data from pickle file")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself, never one from an untrusted source.
    # The `with` block closes the file handle as soon as loading finishes.
    with open(PickleFilePath, "rb") as pickle_file:
        all_splits = pickle.load(pickle_file)
    print("loaded data")
else:
    loader = WebBaseLoader(web_paths=urlList)
    data = loader.load()
    # Print a short summary instead of dumping every scraped document.
    print(f"Scraped {len(data)} documents")

Loading data from pickle file
loaded data


In [93]:
# Only split when no cache exists yet; in that case the previous cell scraped
# `data`. When the cache exists, `all_splits` was already loaded from it.
if not os.path.exists(PickleFilePath):
    # Break the scraped pages into 500-character chunks with no overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    all_splits = text_splitter.split_documents(data)
    # Write the cache to the same path the loader checks, and close the file
    # handle deterministically so the pickle is fully flushed to disk.
    with open(PickleFilePath, "wb") as pickle_file:
        pickle.dump(all_splits, pickle_file)

Using a vector store to store all of our docs

In [94]:
# Build a Chroma vector store from the split documents, using OpenAI embeddings.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(openai_api_key=KEY),
)
# Retriever view of the vector store; it supplies the "context" in the RAG chain.
retriever = vectorstore.as_retriever()

tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x00000172E648EF90>


In [99]:
# Prompt template sent to OpenAI every time a question is asked,
# pulled from the LangChain hub.
rag_prompt = hub.pull("rlm/rag-prompt")
# Show the template's type, then the template itself, one per line.
print(type(rag_prompt), rag_prompt, sep="\n")

<class 'langchain.prompts.chat.ChatPromptTemplate'>
input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [96]:
# RAG pipeline: the retriever fills "context", the raw input passes through as
# "question", the prompt template is filled in, and the result goes to the model.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)

In [101]:
# The chain should be able to answer most general questions, but nothing too complex.
# Sample question below; the model's reply object is printed as-is.
Question = "What courses will teach me about opening a startup?"
awns = rag_chain.invoke(Question)
print(awns)

content='ENES - Engineering Science, SMLP470 Fundamentals of Entrepreneurial Ventures, and BUSI712 Entrepreneurship and New Ventures are courses that will teach you about opening a startup.'
