In [1]:
import json
import os
import pprint

from bs4 import BeautifulSoup
from langchain import hub
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from tqdm import tqdm
from typing_extensions import List, TypedDict

### Preprocessing Datasets

In [2]:
# Load the news metadata (a list with one dict per article) from the JSON export.
file_path = './news1.json'
# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding, so the non-ASCII characters in the summaries (e.g. em dashes)
# would decode differently, or fail, on machines whose default is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as f:
    news = json.load(f)

In [3]:
# Display the first record to check the schema of the JSON file.
news[:1]

[{'category': 'company',
  'datetime': 1731968631,
  'headline': 'Mag 7 vs. S&P 493: Is earnings growth beginning to slow?',
  'id': 131461305,
  'image': 'https://s.yimg.com/ny/api/res/1.2/eKS.TjTPOUaiMZHSvZav1w--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD02NzQ-/https://s.yimg.com/os/creatr-uploaded-images/2024-10/8ef63850-a5fb-11ef-bfbf-7c0ad8de5650',
  'related': 'AAPL',
  'source': 'Yahoo',
  'summary': 'Magnificent Seven stocks — which is comprised of Alphabet (GOOGL, GOOG), Amazon (AMZN), Apple (AAPL), Meta Platforms (META), Microsoft (MSFT), Nvidia (NVDA), and Tesla (TSLA) — are experiencing a deceleration in growth across their earnings. Nvidia is the next and final member of the group to report earnings this Wednesday, November 20. Yahoo Finance acnhor Julie Hyman joins Asking for a Trend to compare the Mag 7\'s earnings growth compared to that of the rest of the S&P 500 (^GSPC), referring back to Solidarity Capital CEO Jeff McClean\'s comments to Yahoo Finance: "I think there\'s goin

In [4]:
# Parse the saved HTML page of every article with BeautifulSoup.
# Pages are read from ./news1/<id>.html and stored in `contents` keyed by news id.
contents = {}
for new in tqdm(news):
    filename = str(new['id'])+'.html'
    filepath = os.path.join('./news1', filename)
    # Explicit encoding so decoding does not depend on the platform's locale default.
    # NOTE(review): assumes the pages were saved as UTF-8 -- confirm against the scraper.
    with open(filepath, 'r', encoding='utf-8') as f:
        html = f.read()
    # Parsing only needs the string, so it runs after the file handle is closed.
    content = BeautifulSoup(html, 'html.parser')

    contents[new['id']] = content

100%|██████████| 200/200 [00:09<00:00, 21.44it/s]


In [5]:
# Build an article's plain text from the <p> elements of its parsed page.
def parse_text_in_bs(content: BeautifulSoup):
    """Return the text of all non-blank ``<p>`` elements, one paragraph per line.

    Args:
        content: Parsed page; only ``find_all('p')`` and each result's ``.text``
            are used.

    Returns:
        The stripped paragraph texts joined with ``'\\n'``; an empty string when
        the page has no non-blank paragraphs.
    """
    paragraphs = []
    for ptext in content.find_all('p'):
        # Strip *before* testing for emptiness: checking the raw text first let
        # whitespace-only paragraphs through as empty strings, which showed up
        # as blank lines in the joined result.
        text = ptext.text.strip()
        if text:
            paragraphs.append(text)
    return '\n'.join(paragraphs)

# Map each news id to the plain text extracted from its parsed page.
pages = {}
for news_id, soup in tqdm(contents.items()):
    pages[news_id] = parse_text_in_bs(soup)

100%|██████████| 200/200 [00:00<00:00, 1494.36it/s]


### Build the Retrieval-Augmented-Generation (RAG) model

In [7]:
# Prerequisite: install Ollama before running this cell.
# Linux: curl -fsSL https://ollama.com/install.sh | sh
# Alternatively, run Ollama from its Docker image.
# See https://github.com/ollama/ollama to pick a model that fits on your local machine.
llm = OllamaLLM(model="llama3.2")

# Send a trivial prompt to verify that the LLM service is running.
llm.invoke("The first man on the moon was ...")

'...Neil Armstrong. He stepped onto the lunar surface on July 20, 1969, as part of the Apollo 11 mission. His famous words upon setting foot on the moon were: "That\'s one small step for man, one giant leap for mankind."'

In [8]:
# Load a pre-trained embedding model, used to encode text into embedding vectors.
# Model card: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
# (see the Hugging Face hub for other models).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# A vector store holds the embedding vectors and supports efficient similarity search.
# The dataset is small, so an in-memory store is sufficient.
# (InMemoryVectorStore is imported with the other dependencies at the top of the notebook.)
vector_store = InMemoryVectorStore(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Wrap every article in a Document: the parsed page text is the content and the
# article's metadata record is attached as-is.
docs = [
    Document(page_content=pages[article['id']], metadata=article)
    for article in news
]

In [28]:
# A loaded document may be too long to fit into the LLM's context window, so we split it into smaller chunks.
# Even models (LLM or embedding model) that could fit the full document in their context window
# still struggle to find information in very long inputs.

# Split the documents into chunks for embedding and vector storage;
# this lets us retrieve only the most relevant chunks for a given query.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
all_splits = text_splitter.split_documents(docs)
len(all_splits)

1294

In [13]:
# Save the chunks into the vector store for later similarity search.
# Encoding and storing all chunks may take several minutes.
# NOTE(review): re-running this cell presumably adds the same chunks again -- confirm,
# and re-create vector_store first if you need to re-index.
_ = vector_store.add_documents(all_splits)

In [27]:
# The prompt is the template for the augmentation step of the RAG pipeline;
# it has two input fields: "question" and "context".
prompt = hub.pull("rlm/rag-prompt")
prompt



ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [17]:
# Define the state that is passed between the application steps.
class State(TypedDict):
    """State of one run of the RAG application."""
    # the user's question
    question: str
    # documents returned by the retrieve step
    context: List[Document]
    # the answer produced by the generate step
    answer: str


# Define application steps

# retrieve and generate are the two standard steps of a RAG pipeline in LangChain.

# retrieve: look up the stored chunks most similar to the question.
def retrieve(state: State):
    """Run a similarity search for ``state["question"]`` on the vector store.

    Returns a partial state update of the form ``{"context": <retrieved documents>}``.
    """
    query = state["question"]
    matches = vector_store.similarity_search(query)
    return {"context": matches}


# generate: ask the LLM to answer the question from the retrieved documents.
# The documents' contents are concatenated, separated by blank lines, to form the
# prompt's context; the question is passed through unchanged.
def generate(state: State):
    """Fill the prompt with the question and retrieved context, then call the LLM.

    Returns a partial state update of the form ``{"answer": <LLM response>}``.
    """
    context_text = "\n\n".join([chunk.page_content for chunk in state["context"]])
    prompt_value = prompt.invoke({"question": state["question"], "context": context_text})
    return {"answer": llm.invoke(prompt_value)}

In [18]:
# Build and compile the application graph:
# retrieve and generate are added as a sequence, and an edge from START to
# "retrieve" is added before compiling.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

### Sample a related article and query based on it

In [33]:
# Show one sample article: its metadata record, then the text parsed from its page.
# (pprint is imported with the other dependencies at the top of the notebook.)
sample = news[10]
pprint.pprint(sample)
print('\n\n')
pprint.pprint(pages[sample['id']])

{'category': 'company',
 'datetime': 1731926825,
 'headline': 'Tata Electronics to acquire stake in Pegatron iPhone plant',
 'id': 131462772,
 'image': 'https://www.verdict.co.uk/wp-content/uploads/2024/11/12-shutterstock_2374133657.jpg',
 'related': 'AAPL',
 'source': 'Yahoo',
 'summary': 'The acquisition will result in a joint venture, in which Tata '
            'will own 60% and Pegatron will have 40% interest.',
 'url': 'https://finnhub.io/api/news?id=9afbb03728fdfe6c793146a49d8d665f560b083218fc23e503bed29a4b11d0ad'}



('The acquisition will result in a joint venture, in which Tata will own 60% '
 'and Pegatron will have 40% interest.\n'
 'Tata Electronics has agreed to purchase majority stake in the iPhone '
 'manufacturing facility in India from Taiwanese firm Pegatron, reports '
 'Reuters.\n'
 'The move will result in a joint venture, in which Tata will own 60% and '
 'manage daily operations. Pegatron will have a 40% stake, and provide '
 'technical support, two sources said.

In [34]:
# Ask a question related to the sampled article and print the generated answer.
response = graph.invoke({"question": "How's going with Tata Electronics"})
pprint.pprint(response["answer"])

("I don't know the current status or performance of Tata Electronics "
 'specifically beyond the information provided about their iPhone '
 'manufacturing plans and acquisition of the Chennai Pegatron plant. The '
 'company is expanding its iPhone manufacturing capabilities and has been '
 'increasing its presence in the Indian market. Tata operates an existing '
 "iPhone assembly plant in Karnataka that was acquired from Taiwan's Wistron "
 'in 2023.')
