In [39]:
import os
from langchain_cohere.chat_models import ChatCohere
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_cohere.embeddings import CohereEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo.mongo_client import MongoClient
from bs4 import BeautifulSoup, Comment
import requests
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Chat model used for both page summarization and question answering
summarization_model = ChatCohere(
    cohere_api_key=os.getenv("COHERE_API_KEY"),
    model="command-r-plus",
    temperature=0.5,
)

In [4]:
# Smoke test: send a trivial prompt to check that the chat model responds
summarization_model.invoke(input = "Hello World!")

AIMessage(content='Hello! How can I help you today?', additional_kwargs={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '2e470590-e2e4-43bc-bee4-8eba3bccb56b', 'token_count': {'input_tokens': 69, 'output_tokens': 9}}, response_metadata={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '2e470590-e2e4-43bc-bee4-8eba3bccb56b', 'token_count': {'input_tokens': 69, 'output_tokens': 9}}, id='run-fa7b38b6-da16-445c-a39c-c1e85b8d05d6-0')

In [5]:
# Minimal example of a Document: page text plus a metadata dictionary
document_sample = Document(
    page_content="Page Content of Document 1",
    metadata={'id': 1, 'author': 'Aritta'},
)

In [6]:
# Display the sample document built in the previous cell
document_sample

Document(page_content='Page Content of Document 1', metadata={'id': 1, 'author': 'Aritta'})

In [7]:
# Embedding model handed to the vector store
embeddings_model = CohereEmbeddings(
    cohere_api_key=os.getenv("COHERE_API_KEY"),
    model="embed-english-v3.0",
)

In [8]:
# Embed a sample query and show the length of the returned embedding
len(embeddings_model.embed_query(text = "Hello, how are you?"))

1024

In [9]:
# Create the MongoDB client; the connection string is read from the
# ATLAS_CONNECTION_STRING environment variable
mongo_client = MongoClient(host = os.getenv("ATLAS_CONNECTION_STRING"))

In [10]:
# Select the "webpages" database and its "content" collection.
# content_collection is the collection the vector store is built on.
webpages_database = mongo_client["webpages"]
content_collection = webpages_database["content"]

In [11]:
# Vector store backed by the content collection.
# NOTE(review): "content_index" is presumably a vector search index created
# separately in Atlas — nothing in this notebook creates it; confirm it exists.
vectorstore = MongoDBAtlasVectorSearch(
    collection=content_collection,
    embedding=embeddings_model,
    index_name="content_index",
)

In [15]:
# Print every document currently stored in the content collection
for stored_document in content_collection.find():
    print(stored_document)

In [16]:
# Insert a throwaway document — presumably a write/connectivity check.
# NOTE(review): this lands in the same collection the vector store uses;
# consider deleting it once the check has passed.
content_collection.insert_one({'hello':'world'})

InsertOneResult(ObjectId('667d9018df3ba1692f03b34a'), acknowledged=True)

In [52]:
# Print the collection contents again, e.g. to see the document inserted earlier
for stored_document in content_collection.find():
    print(stored_document)

{'_id': ObjectId('667d9018df3ba1692f03b34a'), 'hello': 'world'}


In [57]:
def tag_visible(element):
    """Tell whether a parsed HTML element should count as visible page content.

    Args:
        element: A BeautifulSoup node; its ``parent.name`` is inspected.

    Returns:
        False when the element's direct parent is one of ``style``, ``script``,
        ``head``, ``title``, ``meta`` or ``[document]``, or when the element
        itself is a ``Comment``; True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_hidden = element.parent.name in hidden_parents
    # Short-circuit keeps the original order: parent check first, comment check second
    return not (parent_is_hidden or isinstance(element, Comment))

def extract_useful_content(url, timeout=10):
    """Download a web page and return its visible text as one normalized string.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. Text is taken from ``<p>``, ``<div>``, ``<article>`` and
    ``<section>`` elements that pass ``tag_visible``. Only the outermost such
    element is used: ``get_text()`` on a container already includes the text of
    everything nested inside it, so also emitting the nested elements would
    repeat the same text once per nesting level.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space (possibly an empty string), or ``None`` if the request failed or
        any other error occurred while processing the page.
    """
    try:
        # Without a timeout, requests can wait indefinitely on an unresponsive server
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        content_parts = []
        kept_element_ids = set()
        # find_all walks the tree in document order, so a container is always
        # seen before the elements nested inside it
        for element in soup.find_all(['p', 'div', 'article', 'section']):
            if not tag_visible(element):
                continue
            # Skip elements whose text was already captured via a kept ancestor
            if any(id(ancestor) in kept_element_ids for ancestor in element.parents):
                continue
            text = element.get_text(separator=' ').strip()
            if not text:
                continue
            kept_element_ids.add(id(element))
            content_parts.append(text)

        # Collapse newlines, tabs and repeated spaces into single spaces
        return ' '.join(' '.join(content_parts).split())

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses (raise_for_status)
        print(f"An error occurred: {e}")
        return None
    except Exception as e:
        # Best-effort: report the problem and signal failure to the caller with None
        print(f"An unexpected error occurred: {e}")
        return None

def summarize_content(content):
    """Ask the chat model for a summary of the given text.

    Args:
        content: Text to summarize; it is substituted into the prompt template.

    Returns:
        The ``content`` attribute of the model's response.
    """
    template_text = "Summarize the following content:\n\n{content}\n\nSummary:"
    summary_prompt = PromptTemplate.from_template(template=template_text)
    rendered_prompt = summary_prompt.format(content=content)
    return summarization_model.invoke(rendered_prompt).content

def store_content_in_db(url, content, summary):
    """Store a page's text in the vector store together with its URL and summary.

    Args:
        url: Address the content was extracted from; saved in the metadata.
        content: Page text, used as the document's ``page_content``. When it is
            empty or ``None`` nothing is stored.
        summary: Summary of the page; saved in the metadata.

    Returns:
        Whatever ``vectorstore.add_documents`` returns for the stored document
        (per the LangChain vector store API, the inserted ids), or an empty
        list when there was no content to store.
    """
    if not content:
        # Nothing to embed, so skip the vector store call entirely
        return []

    # NOTE(review): the metadata id is a fixed 1 for every page, so it does not
    # tell pages apart (the url does). TODO confirm whether a unique id is wanted.
    page_document = Document(
        page_content=content,
        metadata={'id': 1, 'url': url, 'summary': summary},
    )
    return vectorstore.add_documents([page_document])

def summarize_webpage(url):
    """Extract a page's text, summarize it, and store both in the vector store.

    Args:
        url: Address of the page to process.

    Returns:
        The summary text, or ``None`` when no content could be extracted
        (extraction failed or produced an empty string).
    """
    page_text = extract_useful_content(url)
    if page_text:
        page_summary = summarize_content(page_text)
        store_content_in_db(url, page_text, page_summary)
        return page_summary
    return None

def get_answer(query):
    """Answer a question using documents retrieved from the vector store.

    The documents returned by the similarity search are concatenated into a
    context string, each preceded by a ``Source N:`` header numbered from 1,
    and passed with the question through prompt -> chat model -> string parser.

    Args:
        query: The question to answer; also used as the similarity-search query.

    Returns:
        The chain's output after ``StrOutputParser``.
    """
    answer_prompt = PromptTemplate.from_template("Answer the question based on the provided documents: {context}\n\nQuestion: {query}")
    output_parser = StrOutputParser()
    matched_docs = vectorstore.similarity_search(query=query)
    context = ''.join(
        f'\n\nSource {position}:\n' + doc.page_content
        for position, doc in enumerate(matched_docs, start=1)
    )
    answer_chain = answer_prompt | summarization_model | output_parser
    return answer_chain.invoke({'context': context, 'query': query})

In [60]:
# Page to process: extract its text, summarize it, and store it in the vector store
url = "https://blog.google/intl/en-in/products/platforms/google-wallet-comes-to-india-with-localized-experiences/"
print("Summarizing webpage...")
# summarize_webpage returns None when no content could be extracted
summary = summarize_webpage(url)
print("Summary:", summary)

Summarizing webpage...
Summary: Google Wallet is now available to Android users in India, offering a secure and convenient way to access essential items such as boarding passes, loyalty cards, event tickets, and more. With partnerships with over 20 Indian brands, Google Wallet provides a wide range of digital wallet services. Users can add movie tickets, access boarding passes, redeem gift cards, ride public transport, and even use their Android phone as a corporate badge. Google Wallet prioritizes security and privacy, giving users control over their information. This launch further enhances the Android experience in India, making it more connected and seamless.


In [69]:
# Answer a question from the documents retrieved from the vector store
query = "What is Google Wallet?"
print("Answering question...")
answer = get_answer(query)
print("Answer:", answer)

Answering question...
Answer: Google Wallet is a new feature available to Android users in India that provides fast, secure access to everyday essentials such as boarding passes, loyalty cards, event tickets, public transport tickets, gift cards, and more. It offers a single, organized destination for users to access their essential items easily. Google has partnered with over 20 of India's top brands, including PVR, INOX, Air India, Air India Express, MakeMyTrip, Ixigo, and Flipkart, to offer a wide range of services through Google Wallet.


In [None]:
# LangSmith Website: https://smith.langchain.com/o/01459434-cc55-50e2-ac3c-57013defb5cc/
# LangChain Documentation Website: https://python.langchain.com/v0.2/docs/introduction/
# Cohere API Key Website: https://dashboard.cohere.com/api-keys
# MongoDB Atlas Website: https://cloud.mongodb.com/v2/667d7625c0053f5aaebdc4b9#/overview
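# AWS overview of LangChain: https://aws.amazon.com/what-is/langchain/
# LangChain blog post on Azure Container Apps dynamic sessions: https://blog.langchain.dev/integrating-langchain-with-azure-container-apps-dynamic-sessions/
# Sample article summarized above (Google Wallet launch in India): https://blog.google/intl/en-in/products/platforms/google-wallet-comes-to-india-with-localized-experiences/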