In [14]:
!pip install newspaper3k lxml_html_clean langchain-core langchain-mistralai --quiet

In [15]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

from langchain_community.document_loaders import WebBaseLoader
from typing import List
from langchain_core.messages import HumanMessage
from langchain_mistralai import ChatMistralAI

from dotenv import load_dotenv
import os

load_dotenv()
url1 = "https://techcrunch.com/category/artificial-intelligence/"
url2 = "https://www.berliner-kurier.de/topics/berlin"

In [16]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def extract_headline_urls(url):
    """Fetch the given URL and extract all headline link URLs found on the page."""
    try:
        response = requests.get(url, timeout=5)
        # Raise an HTTPError if the response was unsuccessful (4xx or 5xx status)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Handle any request-related errors (includes timeouts, HTTP errors, etc.)
        print(f"Error fetching {url}: {e}")
        return []

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    headline_links = set()  # use a set to avoid duplicate URLs

    # 1. Find anchors within heading tags (e.g., <h1>, <h2>, <h3>)
    for heading in soup.find_all(['h1', 'h2', 'h3']):  # common headline tags
        a_tag = heading.find('a', href=True)
        if a_tag:
            href = a_tag['href']
            # Convert relative URLs to absolute URLs
            full_url = urljoin(url, href)
            headline_links.add(full_url)

    # 2. Find any <a> tags that have class or id attributes indicating headlines
    for a_tag in soup.find_all('a', href=True):
        # Combine class names and id into one string for checking
        attributes = " ".join(a_tag.get('class', [])) + " " + (a_tag.get('id') or "")
        attributes = attributes.lower()
        if any(keyword in attributes for keyword in ['headline', 'title']):
            href = a_tag['href']
            full_url = urljoin(url, href)
            headline_links.add(full_url)

    # Convert the set to a list for consistency (set avoids duplicates)
    headline_urls = list(headline_links)
    logging.info(f"Urls: {headline_urls}")
    return headline_urls

In [28]:
def collect_top_headline_content(site_url: str, limit: int = 10) -> List[str]:
    """
    Fetch headlines from a given site URL, limit the number of results,
    load each page with WebBaseLoader, and return only the main news content
    (excluding headers, footers, scripts, navigation, and unnecessary sections).
    """
    extracted_urls = extract_headline_urls(site_url)
    limited_urls = extracted_urls[:limit]

    # Load the pages using WebBaseLoader
    loader = WebBaseLoader(limited_urls)
    data_items = loader.load()

    collected_data = []
    for item in data_items:
        title = item.metadata.get('title', '')
        page_text = item.page_content or ''

        # Use BeautifulSoup to clean unnecessary sections
        soup = BeautifulSoup(page_text, "html.parser")

        # Remove non-content sections
        for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
            tag.decompose()  # Remove these elements from the tree

        # Extract cleaned text and truncate to 1000 characters
        main_content = soup.get_text(separator=" ", strip=True)[:3000]

        # Combine title with cleaned content
        combined_text = f"{title} {main_content}".strip()
        collected_data.append(combined_text)

    logging.info(f"Collected {len(collected_data)} cleaned headline contents from {site_url}")
    return collected_data

In [29]:
data = collect_top_headline_content(url1)

INFO:root:Urls: ['https://techcrunch.com/2025/03/07/deepseek-everything-you-need-to-know-about-the-ai-chatbot-app/', 'https://techcrunch.com/2025/03/07/last-day-to-apply-to-be-a-techcrunch-sessions-ai-speaker/', 'https://techcrunch.com/2025/03/06/mistrals-new-ocr-api-turns-any-pdf-document-into-an-ai-ready-markdown-file/', 'https://techcrunch.com/2025/03/07/metas-next-llama-models-may-have-upgraded-voice-features/', 'https://techcrunch.com/2025/03/07/google-debuts-a-new-gemini-based-text-embedding-model/', 'https://techcrunch.com/2025/03/08/judge-allows-authors-ai-copyright-lawsuit-against-meta-to-move-forward/', 'https://techcrunch.com/storyline/sxsw-2025-live-coverage-ai-takes-center-stage/', 'https://techcrunch.com/2025/03/07/signal-president-meredith-whittaker-calls-out-agentic-ai-as-having-profound-security-and-privacy-issues/', 'https://techcrunch.com/2025/03/07/us-lawmakers-have-already-introduce-hundreds-of-ai-bills-in-2025/', 'https://techcrunch.com/2025/03/06/chatgpt-doubled-

In [30]:
print(data[0])

DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch


























DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch













































































 
























 TechCrunch Desktop Logo




 TechCrunch Mobile Logo




LatestStartupsVentureAppleSecurityAIAppsStartup Battlefield

EventsPodcastsNewsletters







		Sign In	







SearchSubmit







Site Search Toggle




Mega Menu Toggle





Topics



		Latest	



		AI	



		Amazon	



		Apps	



		Biotech & Health	



		Climate	



		Cloud Computing	



		Commerce	



		Crypto	



		Enterprise	



		EVs	



		Fintech	



		Fundraising	



		Gadgets	



		Gaming	



		Google	



		Government & Policy	




		Hardware	



		Instagram	



		Layoffs	



		Media & Entertainment	



		Meta	



		Microsoft	



		Privacy	



		Robotics	



		Security	

In [31]:

def llm_daily_response():
    logging.info("Data: " + str(data))
    logging.info("Summarising news articles")
    chat = ChatMistralAI(api_key=MISTRAL_API_KEY,
                         model="mistral-large-latest",
                         temperature=0.8,
                         max_tokens=131000,
                         )
    system_prompt = f"""
        You are a personal ai bot that helps people summarise news articles every day,
        My name is Yash and you will provide me with a summary of the news articles below:
        {data}

        Make sure it is going to be in an audio format so make it sound like a human and do not format it textually.
        Always greet me with "Hello Yash" and say "Goodbye have a good day" at the end.
        """
    messages = [HumanMessage(content=system_prompt)]
    response = chat.invoke(messages)
    logging.info("Response from LLM: " + response.content)
    return response.content

In [32]:
response = llm_daily_response()

INFO:root:Data: ['DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDeepSeek: Everything you need to know about the AI chatbot app | TechCrunch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n TechCrunch Desktop Logo\n\n\n\n\n TechCrunch Mobile Logo\n\n\n\n\nLatestStartupsVentureAppleSecurityAIAppsStartup Battlefield\n\nEventsPodcastsNewsletters\n\n\n\n\n\n\n\n\t\tSign In\t\n\n\n\n\n\n\n\nSearchSubmit\n\n\n\n\n\n\n\nSite Search Toggle\n\n\n\n\nMega Menu Toggle\n\n\n\n\n\nTopics\n\n\n\n\t\tLatest\t\n\n\n\n\t\tAI\t\n\n\n\n\t\tAmazon\t\n\n\n\n\t\tApps\t\n\n\n\n\t\tBiotech & Health\t\n\n\n\n\t\tClimate\t\n\n\n\n\t\tCloud Computing\t\n\n\n\n\t\tCommerce\t\n\n\n\n\t\tCrypto\t\n\n\n\n\

In [33]:
print(response)

Hello Yash,

Here are the summaries of the news articles you provided:

1. **DeepSeek: Everything you need to know about the AI chatbot app**
   DeepSeek, a Chinese AI lab, has gained widespread attention after its chatbot app topped the Apple App Store and Google Play charts. The lab, backed by High-Flyer Capital Management, focuses on AI tools for research. Despite facing U.S. export bans on hardware, DeepSeek has managed to train models using less powerful chips. The success of DeepSeek has raised questions about the U.S.'s lead in the AI race and the demand for AI chips.

2. **Last day to apply to be a Sessions: AI speaker**
   TechCrunch Sessions: AI, taking place on June 5 in Zellerbach Hall at UC Berkeley, is seeking AI leaders to lead breakout sessions and discussions. The deadline to apply is tonight at 11:59 p.m. PT. Selected speakers will enjoy various perks, including access to exclusive events and the opportunity to engage with industry pioneers.

3. **Mistral adds a new A

In [6]:
!pip install langchain-aws --quiet

In [23]:
from langchain_aws import ChatBedrock

chat_bedrock_model = ChatBedrock(
    model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
    model_kwargs=dict(temperature=1),
    # other params...
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [24]:
def llm_daily_response():
    logging.info("Data: " + str(data))
    logging.info("Summarising news articles")

    system_prompt = f"""
        You are a personal ai bot that helps people summarise news articles every day,
        My name is Yash and you will provide me with a summary of the news articles below:
        {data}

        Make sure it is going to be in an audio format so make it sound like a human and do not format it textually.
        Always greet me with "Hello Yash" and say "Goodbye have a good day" at the end.
        """
    messages = [HumanMessage(content=system_prompt)]
    response = chat_bedrock_model.invoke(messages)
    logging.info("Response from LLM: " + response.content)
    return response.content

In [25]:
response = llm_daily_response()

INFO:root:Data: ['DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch DeepSeek: Everything you need to know about the AI chatbot app | TechCrunch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDeepSeek: Everything you need to know about the AI chatbot app | TechCrunch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n TechCrunch Desktop Logo\n\n\n\n\n TechCrunch Mobile Logo\n\n\n\n\nLatestStartupsVentureAppleSecurityAIAppsStartup Battlefield\n\nEventsPodcastsNewsletters\n\n\n\n\n\n\n\n\t\tSign In\t\n\n\n\n\n\n\n\nSearchSubmit\n\n\n\n\n\n\n\nSite Search Toggle\n\n\n\n\nMega Menu Toggle\n\n\n\n\n\nTopics\n\n\n\n\t\tLatest\t\n\n\n\n\t\tAI\t\n\n\n\n\t\tAmazon\t\n\n\n\n\t\tApps\t\n\n\n\n\t\tBiotech & Health\t\n\n\n\n\t\tClimate\t\n\n\n\n\t\tCloud Computing\t\n\n\n\n\t\tCommerce\t\n\n\n\n\t\tCrypto\t\n\n\n\n\

In [27]:
print(response)

Hello Yash! Here's a summary of today's tech news articles:

DeepSeek has launched a new AI chatbot app that's generating buzz in the tech world. Meanwhile, TechCrunch is reminding speakers that today is the last day to apply for their Sessions: AI event.

In AI development news, Mistral has introduced a new API that can convert PDF documents into AI-ready Markdown files. This could be a game-changer for processing and analyzing document data.

Meta is reportedly working on upgrading the voice features for their next Llama models, potentially enhancing their AI's conversational abilities.

Google has debuted a new text embedding model based on their Gemini AI, which could improve various natural language processing tasks.

In legal news, a judge has allowed an AI copyright lawsuit against Meta to move forward. This case, brought by authors, could have significant implications for AI training on copyrighted material.

Lastly, AI is taking center stage at SXSW 2025, with TechCrunch provi