In [644]:
pip install langchain-community



In [645]:
import os
import streamlit as st
import pickle
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [646]:
pip install unstructured



In [647]:
# Article(s) to download and parse; add more URLs to this list to widen the corpus.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html"
]

# Build the loader, fetch the pages, and show how many documents came back.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()
len(data)

1

In [648]:
# Chunking settings: chunk_size=1000 with chunk_overlap=200 between neighbouring chunks.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# `data` already holds document objects, so split_documents() can be applied
# to it directly; split_text() would require extracting the raw strings first.
docs = text_splitter.split_documents(data)

In [649]:
# Display how many chunks the splitter produced from the loaded article.
len(docs)

9

In [650]:
pip install faiss-cpu transformers




In [651]:
!!pip install --upgrade sentence-transformers



In [652]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode the article chunks and the question.
# NOTE(review): the name `model` is rebound to a ChatGroq instance in later
# cells, so the encode() cells below only work if they run before those cells.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)



In [653]:
# Pull the raw text out of every chunk so it can be handed to the encoder.
texts = [chunk.page_content for chunk in docs]

In [654]:
# Encode every chunk text; `var` holds the resulting chunk embeddings
# (one per entry in `texts`, as the length check in the next cell shows).
var = model.encode(texts)

In [655]:
# Sanity check: the number of embeddings should equal the number of chunks.
len(var)

9

In [656]:
# Free-text question to compare against the chunk embeddings.
# NOTE(review): the loaded article URL mentions Tesla / Wall Street, not
# Tata Motors — confirm this question is the intended one.
question = "What is the latest news about Tata Motors?"

# Encode the question with the same `model` that encoded the chunk texts.
question_embedding = model.encode(question)

In [670]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Score the question against every chunk embedding; cosine_similarity expects
# 2-D inputs, hence the single-element list around the question vector.
similarities = cosine_similarity([question_embedding], var)

# How many of the best-scoring chunks to keep.
top_k = 5

# argsort is ascending, so flip it to get best-first order and keep the head.
ascending_order = np.argsort(similarities[0])
similar_indices = np.flip(ascending_order)[:top_k]

# Map the winning positions back to their chunk documents.
similar_docs = [docs[position] for position in similar_indices]


In [658]:
pip install --upgrade transformers



In [659]:
pip install langchain_groq



In [660]:
import os
import getpass

# Ask for the Groq key interactively only when the environment does not
# already provide a non-empty one; the key is never written into the notebook.
if not os.getenv("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")


from langchain_groq import ChatGroq

# NOTE(review): this rebinds `model`, which earlier in the notebook referred
# to the SentenceTransformer encoder.
model = ChatGroq(model="llama3-70b-8192")

In [661]:
pip install langchain langchain-groq faiss-cpu



In [662]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document


# Embedding wrapper around the MiniLM sentence-transformers checkpoint.
hf_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)

# Build a FAISS vector store over the article chunks using those embeddings.
vectorIndex = FAISS.from_documents(documents=docs, embedding=embeddings)

In [663]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_groq import ChatGroq

# Chat model served by Groq that will write the answers.
model = ChatGroq(model="llama3-70b-8192")

# Expose the FAISS index through the retriever interface the chain expects.
retriever = vectorIndex.as_retriever()

# Wire the LLM and the retriever into a question-answering chain whose result
# carries both an "answer" and its "sources".
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=model,
    retriever=retriever,
)

In [678]:
# User question that is appended to the instruction prompt and sent to the chain.
query = "What can you tell me about Amazon's performance?"

In [683]:
# Instruction text that is prepended to every user question.
prompt = """You are an advanced research assistant that answers questions based on the content of the provided articles. If a financial question is asked, such as about a company's market value, stock price, or market trends, prioritize retrieving the most relevant financial data from the article and provide the exact figures.

Always ensure that you retrieve data that directly answers the user's question. For example, if someone asks, "How much is Tesla's market value?", ensure that the specific figure is mentioned in your answer, along with the source (the link to the article).

Your responses should always contain the necessary numbers and data extracted from the provided article. Do not say "I don't know" if the data is present in the article — instead, provide the exact figure along with any necessary context.

Answer in a clear and straightforward manner based on the article's information.

Answer in as few words as possible. Make the answer to the point unless the question isn't direct.
"""

# Final text sent to the chain: the instructions, a newline, then the question.
# NOTE(review): this whole string is passed as the chain's "question", so
# presumably the retriever searches with the instruction text as well as the
# user's question — confirm whether retrieval quality suffers from that.
formatted_query = f"{prompt}\nQuestion: {query}"


In [682]:
# Run the retrieval QA chain. `invoke` replaces the deprecated direct call
# `chain({...})`; return_only_outputs=True keeps only the chain's outputs
# ("answer" and "sources") in the result and leaves the input out.
result = chain.invoke({"question": formatted_query}, return_only_outputs=True)

# Show the original user question (not the full formatted prompt) next to the
# generated answer and the sources it was drawn from.
print("Question:", query)
print("Answer:", result["answer"])
print("Sources:", result["sources"])

Question: What can you tell me about Amazon's performance?
Answer: Amazon's stock is climbing 3.5%.

Sources: https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html
