In [1]:
import os
import pickle
import random
import re
import time

import nltk
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# nltk.download('averaged_perceptron_tagger')

# !pip install unstructured
# Other dependencies to install: https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
# !pip install python-magic-bin
# !pip install chromadb

## Get scripts of *The Office*

In [2]:
# Pull variables from the local .env file into the environment, then read the
# OpenAI key from it. The key is never hardcoded in the notebook.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

# `not` rejects both a missing variable (None) and an empty `OPENAI_API_KEY=`
# entry, so a blank key fails here instead of as an auth error further down.
if not openai_api_key:
    raise Exception('API key not found. Make sure to set it in the .env file.')

In [3]:
# url = 'https://subslikescript.com/series/The_Office-386676'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
# hrefs = []
# for link in soup.find_all('a'):
#     href = link.get('href')
#     hrefs.append(href)

In [4]:
# paths = [href for href in hrefs if re.match(r"/series/", href)]
# text_content_all = []
# for path in paths:
#     try:
#         script_url = f'https://subslikescript.com/{path}'
#         script_response = requests.get(script_url)
#         soup = BeautifulSoup(script_response.text, 'html.parser')
#         div_content = soup.find('div', class_='full-script')
#         text_content = div_content.text
#         text_content_all.append(text_content)
#         print(f'Finished {path}')
#         time.sleep(1)
#     except:
#         print(f'Failed {path}')

In [5]:
# import pickle
# with open('pkl_files/the_office_script.pkl', 'wb') as f:
#     pickle.dump(text_content_all, f)

In [6]:
# ep_names = [ re.findall(r'season-\d+/episode-\d+-.+', path)[0].replace('/', '_') for path in paths ]
# for ep_name, text_content in zip(ep_names, text_content_all):
#     with open(f'data/the_office_scripts/{ep_name}.txt', 'w', encoding='utf-8') as file:
#         # Write each item followed by a newline
#         for item in text_content:
#             file.write(str(item))
#     print(f'Wrote {ep_name}.txt')

## Get QA pairs from *quizbreaker.com*

In [7]:
# Download the trivia page and collect the raw text of every <li> and <p>
# element. The next cell narrows these down to real questions and answers.
url = 'https://www.quizbreaker.com/the-office-trivia'

# A timeout keeps the cell from hanging forever if the site never responds.
response = requests.get(url, timeout=30)

# Stop here on failure: continuing would leave `questions`/`answers` undefined
# (fresh kernel) or silently stale (re-run), and later cells would misbehave.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the URL. (HTTP status {response.status_code})"
    )

soup = BeautifulSoup(response.text, 'html.parser')
questions = [li.text for li in soup.find_all('li')]
answers = [p.text for p in soup.find_all('p')]

In [8]:
# Build (question, answer) pairs from the raw scrape.
#
# The raw `questions` / `answers` lists are deliberately left untouched and the
# filtered results get their own names, so re-running this cell always yields
# the same `office_qas` (an in-place pop would drop one more answer per run).

# Paragraphs that actually hold an answer start with "Answer:".
answer_paragraphs = [p_text for p_text in answers if re.match(r'\bAnswer:', p_text)]

# The answer at this position has no matching question on the page; skip it.
ORPHAN_ANSWER_INDEX = 55
paired_answers = [
    text for idx, text in enumerate(answer_paragraphs) if idx != ORPHAN_ANSWER_INDEX
]

# List items count as questions when they contain '?' or a fill-in blank ('_').
paired_questions = [item for item in questions if '?' in item or '_' in item]

# zip() stops at the shorter list, so a mismatch would otherwise go unnoticed.
if len(paired_questions) != len(paired_answers):
    print(
        f'Warning: {len(paired_questions)} questions vs {len(paired_answers)} answers; '
        'pairs may be misaligned.'
    )

office_qas = list(zip(paired_questions, paired_answers))

## Build a retrieval QA chain with LangChain

In [9]:
# Load the episode scripts as LangChain documents, using a pickle cache so the
# directory only has to be parsed once.
DOCUMENTS_CACHE = 'pkl_files/the_office_documents.pkl'

loader = DirectoryLoader('data/the_office_scripts/', glob='*.txt')

if os.path.exists(DOCUMENTS_CACHE):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that you produced yourself with this notebook.
    with open(DOCUMENTS_CACHE, 'rb') as f:
        documents = pickle.load(f)
else:
    # No cache yet (e.g. fresh checkout): parse the .txt files and save the
    # result for the next run.
    documents = loader.load()
    os.makedirs(os.path.dirname(DOCUMENTS_CACHE), exist_ok=True)
    with open(DOCUMENTS_CACHE, 'wb') as f:
        pickle.dump(documents, f)

In [10]:
# Cut the loaded documents into chunks (chunk_size=1000, no overlap between
# neighbouring chunks) so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

In [11]:
# Both OpenAI clients authenticate with the key read from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
llm = OpenAI(openai_api_key=openai_api_key)

# Embed every chunk and index the vectors in a FAISS store.
docsearch = FAISS.from_documents(texts, embeddings)

# Question-answering chain that retrieves chunks from the FAISS index and
# hands them to the LLM using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [14]:
# Spot-check the QA chain on a reproducible random sample of trivia pairs.
N_EVAL_QUESTIONS = 5   # how many trivia questions to ask the chain
EVAL_SEED = 42         # fixes which questions are drawn

random.seed(EVAL_SEED)
# Cap the sample size so a short scrape does not raise ValueError.
office_qas_random = random.sample(
    office_qas, min(N_EVAL_QUESTIONS, len(office_qas))
)

for q, a in office_qas_random:
    ai_answer = qa.run(q)
    print(f'Question: {q}')
    # Scraped answers carry an "Answer: " prefix; drop it for display.
    print(f'Correct Answer: {a.replace("Answer: ", "")}')
    print(f'AI Answer: {ai_answer}')
    print('-'*30)

Question: Mindy Kaling, who played Kelly Kapoor in the sitcom, also served as a writer, director, and _______ on the show.
Correct Answer: An executive producer.
AI Answer:  producer
------------------------------
Question: Who are the 3 main members of the Party Planning Committee?
Correct Answer: Pam, Angela, and Phylis.
AI Answer:  The three main members of the Party Planning Committee are Angela, Pam, and Karen.
------------------------------
Question: What’s the name of the episode where Michael Scott accidentally burns his foot on the George Foreman Grill?
Correct Answer: “The Injury”.
AI Answer:  The episode is called "The Injury" and is from Season 2 of The Office.
------------------------------
Question: What’s the name of the second episode of the first season?
Correct Answer: "Diversity Day".
AI Answer:  The name of the second episode of the first season of The Office is "Diversity Day".
------------------------------
Question: Showrunners didn’t inform the NBC executives th

In [13]:
# Run a query
# query = "Who are the members of the party planning committee?"
# qa.run(query)
# query = "What does Kevin like?"
# qa.run(query)
# query = "Who is Pam's husband?"
# qa.run(query)
# query = "Who was Pam engaged to before Jim?"
# qa.run(query)
# query = "What season did Michael leave The Office?"
# qa.run(query)
# query = "What is Erin Hannon's real name?"
# qa.run(query)
# query = "Ryan caused the fire at the office warming up what?"
# qa.run(query)
# query = "What's Pam's favorite flavor of yogurt?"
# qa.run(query)