In [1]:
import os
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from  dotenv import load_dotenv

# load_dotenv()
# groq_api_key = os.getenv("GROQ_API_KEY")
# os.environ["GROQ_API_KEY"] = groq_api_key
# model = ChatGroq(
#     model="llama-3.1-70b-versatile"
# )
# Read variables from a local .env file (if one exists) so the key is
# available even when it has not been exported in the shell.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Assigning None into os.environ raises an opaque TypeError, so fail
    # early with a message that says what is actually missing.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Chat model instance; it is the `model` step of the chain built later.
model = ChatOpenAI(
    model="gpt-4o-mini"
)

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader, TextLoader

# bs4_strainer = bs4.SoupStrainer(class_=("clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item"))
# loader = WebBaseLoader(
#     web_paths=("https://okanagan.calendar.ubc.ca/faculties-schools-and-colleges/faculty-creative-and-critical-studies/bachelor-media-studies-program/academic-regulations",),
#     bs_kwargs={"parse_only": bs4_strainer},
# )

# data = {
#     "https://students.ok.ubc.ca/academic-success/advising-options/academic-advising/frequently-asked-questions/#lab" : "ok-wysiwyg-wrapper",
#     "https://okanagan.calendar.ubc.ca/faculties-schools-and-colleges/faculty-creative-and-critical-studies/bachelor-media-studies-program/academic-regulations" : "clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item",
#     "https://students.ok.ubc.ca/ws-tutorial/viewing-your-transfer-credit-or-high-school-and-ap-ib-credit/" : "tutorial-content",
#     "https://okanagan.calendar.ubc.ca/admissions/change-degree-program" : "clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item",
#     "https://you.ubc.ca/applying-ubc/applied/first-year-credit-ok/" : "expand row-fluid"
# }

# docs = []

# def scrape_web(url: str, class_name: str):
#     bs4_strainer = bs4.SoupStrainer(class_=class_name)
#     loader = WebBaseLoader(
#         web_path=[url],
#         bs_kwargs={"parse_only": bs4_strainer}
#     )
#     docs.extend(loader.load())

# for url, class_name in data.items():
#     scrape_web(url, class_name)

# for doc in docs:
#     print(len(doc.page_content))

# Plain-text source files to load (paths are relative to the working directory).
data = [
    "data/transfer_credits.txt",
    "data/academic_regulation.txt",
    "data/major_minor.txt",
    "data/course_withdrawal.txt",
    "data/graduation.txt",
    "data/degree_program_options.txt",
    "data/final_exams_cosc.txt",
    "data/final_exams_data.txt",
    "data/final_exams_math.txt",
    "data/final_exams_phil.txt",
    "data/final_exams_stat.txt",
]

# Start from an empty list every run so re-executing the cell does not
# accumulate duplicate documents.
docs = []

for file in data:
    loader = TextLoader(file)
    docs.extend(loader.load())

# Print each document's character count. A plain loop is used rather than a
# list comprehension so the cell does not also display a list of None values.
for doc in docs:
    print(len(doc.page_content))

In [None]:
# Preview the first 1000 characters of the first loaded document.
first_doc_text = docs[0].page_content
print(first_doc_text[:1000])

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunks of up to 1000 characters with a 200-character
# overlap; add_start_index asks the splitter to record each chunk's start
# position.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded document, then display how many chunks were produced.
all_splits = text_splitter.split_documents(docs)
len(all_splits)

In [None]:
# Character length of the first chunk.
first_chunk = all_splits[0]
len(first_chunk.page_content)

In [135]:
# all_splits[10].metadata

In [136]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import Pinecone, PineconeVectorStore
import pinecone

# Pull credentials from a local .env file (if present) into the environment.
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Validate both keys up front: writing None into os.environ raises an opaque
# TypeError, so report exactly which variable is missing instead.
for _env_name, _env_value in (
    ("PINECONE_API_KEY", pinecone_api_key),
    ("OPENAI_API_KEY", openai_api_key),
):
    if not _env_value:
        raise RuntimeError(
            f"{_env_name} is not set. Export it or add it to a .env file."
        )
    os.environ[_env_name] = _env_value

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)
index_name = "bolt-chatbot"

# Embed every chunk and write it to the Pinecone index named above.
# NOTE(review): this runs on every execution of the cell, so re-running
# re-embeds all chunks and presumably upserts them again under new ids --
# confirm, and consider loading the existing index instead once populated.
vectorstore = PineconeVectorStore.from_documents(
    documents=all_splits,
    embedding=embeddings,
    index_name=index_name
)


In [None]:
# Similarity-search retriever over the vector store, requesting k=7 results.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 7},
)

# Try the retriever on a sample question and display how many hits came back.
sample_query = "What is a closed course?"
retrieved_docs = retriever.invoke(sample_query)

len(retrieved_docs)

In [None]:
# Show the text of the first retrieved chunk.
first_hit = retrieved_docs[0]
print(first_hit.page_content)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate

# Output parser used as the final step of the chain.
parser = StrOutputParser()

# Prompt with two placeholders, {context} and {question}. The fallback
# sentence previously read "If you don't the answer"; the missing word is
# restored so the instruction given to the model is unambiguous.
template = """ 
Answer the question based on the context below. You are an academic advisor for UBCO students. Don't start by saying, based on provided context. 
If you don't know the answer of something, just say I can only help you with academic advising.
Students might give you courses, and ask details about their final exams. Provide them date, time and location for all their courses.

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Render the template with dummy values to eyeball the final prompt text.
prompt.format(context="here is some context", question="here is a question")



In [140]:
from operator import itemgetter

def format_docs(documents):
    """Join the text of retrieved documents into a single context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents concatenated, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Pipeline: build the prompt inputs from {"question": ...}, fill the prompt,
# call the model, and parse the reply with `parser`.
chain = (
    {
        # Retrieve chunks for the question and flatten them to plain text, so
        # the prompt receives readable context rather than the repr of a list
        # of Document objects.
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

In [None]:
# Read a question from the user and echo it before querying the chain.
question = input()
print(f"Question: {question}")

# Run the chain on the question and print its answer, followed by a blank line.
answer = chain.invoke({'question': question})
print(f"Answer: {answer}")
print()