In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

In [2]:
url = "https://www.philstar.com/business/technology/2023/08/07/2286889/musk-says-his-cage-fight-zuckerberg-will-stream-x"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req).read()

In [3]:
soup = BeautifulSoup(html, 'html')

# Get the title
title = soup.title
print(title)

<title>Musk says his cage fight with Zuckerberg will stream on X | Philstar.com</title>


In [4]:
# Get the paragraphs
paragraphs = soup.find_all("p")

In [5]:
p = str(paragraphs)
cleantext = BeautifulSoup(p, "html").get_text()
print(cleantext)

[WASHINGTON, United States — Elon Musk said Sunday that a "cage match" he and Meta CEO Mark Zuckerberg have seemingly agreed to as a fund-raiser will be carried live on X, formerly known as Twitter, which he owns., "Zuck v Musk fight will be live-streamed on X," Musk posted. "All proceeds will go to charity for veterans.", Zuckerberg soon hit back on Threads, the new app he launched last month in a direct challenge to what was then still called Twitter, saying he was ready., "Shouldn't we use a more reliable platform that can actually raise money for charity?" he added, in a dig at the wave of problems faced by Musk's platform since he took over last year., The two billionaire entrepreneurs, who in the past have occasionally jousted from afar, became direct competitors after Zuckerberg's Meta launched its Twitter-like Threads platform in early July, quickly drawing 120 million users, according to Quiver Quantitative., Musk then posted on X, "I'm up for a cage match if he is lol," refer

In [6]:
import os

os.environ["OPENAI_API_KEY"] = "sk-YOUR_OPENAI_API_KEY"

In [7]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [8]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.create_documents([cleantext])

In [9]:
import torch
torch.cuda.is_available()

False

In [10]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", 
                                                      model_kwargs={"device": "cpu"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [11]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db-news-musk-match'

In [12]:
## Here is the nmew embeddings being used
embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

Using embedded DuckDB with persistence: data will be stored in: db-news-musk-match


In [13]:
# persist the db to disk
vectordb.persist()
vectordb = None

In [14]:
# Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

Using embedded DuckDB with persistence: data will be stored in: db-news-musk-match


In [15]:
retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("What are the usage requirements?")
print("len(docs)", len(docs))

Chroma collection langchain contains fewer than 4 elements.


len(docs) 3


In [16]:
# Specifying top k
# See https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/vectorstore-retriever.html
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [17]:
# create the chain to answer questions 
# https://python.langchain.com/docs/modules/chains/document/stuff
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [18]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))

# Q&A Time!

In [19]:
# full example
query = "Who will Elon Musk fight in a cage match?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Elon Musk will fight Mark Zuckerberg in a cage match.


In [20]:
# full example
query = "Where will the match be broadcast?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The match will be broadcast on X (formerly known as Twitter), which is owned by Elon Musk.


In [21]:
# full example
query = "When will the match happen?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 No date has been set for the fight, so it is not known when it will happen.


In [22]:
# full example
query = "Who do you think will win?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 It is too hard to say who will win as no date has been set for the fight yet.


# Credits

Script based on Sam Witteveen work: https://www.youtube.com/watch?v=cFCGUjc33aU
Check out his YouTube Channel: https://www.youtube.com/@samwitteveenai