In [110]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import xml.etree.ElementTree as ET

# Canonical BBC News article URL: https://www.bbc.co.uk/news/articles/<id>, where
# <id> consists of lowercase letters and digits only. The pattern is anchored at
# both ends, so URLs with a query string, trailing slash or extra path segments
# do not match.
# NOTE: a later cell rebinds ARTICLE_PATTERN for Al Jazeera, and extract_links()
# reads this global when it is called, so re-run this cell before crawling BBC again.
ARTICLE_PATTERN = re.compile(r"^https://www\.bbc\.co\.uk/news/articles/[a-z0-9]+$")

def fetch_page(url):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    The request uses a 10 second timeout and treats non-success HTTP statuses
    as errors. Any exception raised while requesting or parsing is printed and
    converted into a ``None`` return so callers can simply skip the page.
    """
    soup = None
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    except Exception as err:
        print(f"Failed to fetch {url}: {err}")
    return soup

def extract_links(soup, base_url, pattern=None):
    """Extract and return valid article links only.

    Args:
        soup: Parsed page; only ``soup.find_all('a', href=True)`` is used.
        base_url (str): URL the page came from, used to resolve relative hrefs.
        pattern: Optional compiled regex that an absolute URL must ``match`` to
            be kept. When omitted, the module-level ``ARTICLE_PATTERN`` is used
            (looked up at call time). Passing it explicitly protects against
            another cell having rebound ``ARTICLE_PATTERN`` for a different site.

    Returns:
        set[str]: De-duplicated absolute URLs with any ``#fragment`` removed.
    """
    if pattern is None:
        pattern = ARTICLE_PATTERN
    links = set()
    for a in soup.find_all('a', href=True):
        # Drop the fragment so "#comments" variants collapse onto one URL.
        href = a['href'].split('#')[0]
        if not href:
            continue  # the href was only a fragment (same-page anchor)
        abs_url = urljoin(base_url, href)
        if pattern.match(abs_url):
            links.add(abs_url)
    return links

def crawl(start_url, max_urls=200):
    """Breadth-first crawl from ``start_url`` that collects article URLs.

    Each dequeued page is fetched with ``fetch_page``; the article links found
    on it by ``extract_links`` are recorded and queued so that they are crawled
    in turn. Pages that fail to fetch are skipped.

    Args:
        start_url (str): Page to start from. It is not itself added to the
            result unless some crawled page links to it as an article.
        max_urls (int): Stop once this many article URLs have been collected.

    Returns:
        set[str]: The article URLs discovered (at most ``max_urls``).
    """
    from collections import deque  # local import keeps this cell self-contained

    seen = set()
    # deque.popleft() is O(1); list.pop(0) shifts the whole queue on every pop.
    queue = deque([start_url])
    while queue and len(seen) < max_urls:
        url = queue.popleft()
        soup = fetch_page(url)
        if not soup:
            continue
        for link in extract_links(soup, url):
            if link in seen:
                continue
            seen.add(link)
            queue.append(link)  # crawl into other articles
            if len(seen) >= max_urls:
                break
    return seen

def build_sitemap(urls, output='sitemap_articles.xml'):
    """Write ``urls`` to ``output`` as a sitemaps.org ``<urlset>`` document.

    One ``<url><loc>...</loc></url>`` entry is emitted per URL, in sorted order,
    and the file is written as UTF-8 with an XML declaration. A one-line
    confirmation is printed afterwards.
    """
    root = ET.Element('urlset', {'xmlns': "http://www.sitemaps.org/schemas/sitemap/0.9"})
    for address in sorted(urls):
        entry = ET.SubElement(root, 'url')
        ET.SubElement(entry, 'loc').text = address
    ET.ElementTree(root).write(output, xml_declaration=True, encoding='utf-8', method='xml')
    print(f"Article sitemap written to {output} with {len(urls)} article URLs.")

def main():
    """Crawl BBC News from its landing page and save the articles found as a sitemap."""
    entry_point = 'https://www.bbc.co.uk/news'
    print(f"Starting article crawl at {entry_point}")
    found = crawl(entry_point)
    print(f"Found {len(found)} unique article URLs.")
    build_sitemap(found)

# Executing this cell runs the full crawl and writes the sitemap (the guard is
# true when the code runs as the top-level program, as the recorded output shows).
if __name__ == '__main__':
    main()

Starting article crawl at https://www.bbc.co.uk/news
Found 200 unique article URLs.
Article sitemap written to sitemap_articles.xml with 200 article URLs.


In [114]:
# Repeat the crawl outside main() so the resulting URL set is kept in the
# notebook-level variable `article_urls` (main() only holds it locally).
start = 'https://www.bbc.co.uk/news'
print(f"Starting article crawl at {start}")
article_urls = crawl(start)

Starting article crawl at https://www.bbc.co.uk/news


In [115]:
# Display the set of article URLs returned by crawl().
article_urls

{'https://www.bbc.co.uk/news/articles/c0492rwnevgo',
 'https://www.bbc.co.uk/news/articles/c05l9y56773o',
 'https://www.bbc.co.uk/news/articles/c071elp1rv1o',
 'https://www.bbc.co.uk/news/articles/c071jr159p0o',
 'https://www.bbc.co.uk/news/articles/c0eq3dxe28go',
 'https://www.bbc.co.uk/news/articles/c0eqpz23l9jo',
 'https://www.bbc.co.uk/news/articles/c0q8gkexkv2o',
 'https://www.bbc.co.uk/news/articles/c0r1jv0rn0ko',
 'https://www.bbc.co.uk/news/articles/c0xj5w3nx7yo',
 'https://www.bbc.co.uk/news/articles/c15ng4g5g0eo',
 'https://www.bbc.co.uk/news/articles/c1k8jxdyj0go',
 'https://www.bbc.co.uk/news/articles/c1mgdendnv7o',
 'https://www.bbc.co.uk/news/articles/c1mgkd93r4yo',
 'https://www.bbc.co.uk/news/articles/c1mr177ze45o',
 'https://www.bbc.co.uk/news/articles/c1wepgxzqxqo',
 'https://www.bbc.co.uk/news/articles/c20nqzxn4xqo',
 'https://www.bbc.co.uk/news/articles/c20q4ql7pe2o',
 'https://www.bbc.co.uk/news/articles/c20r6vm0xl3o',
 'https://www.bbc.co.uk/news/articles/c20rz63g

In [None]:
import requests
import xml.etree.ElementTree as ET
from io import StringIO

import xml.etree.ElementTree as ET


def get_article_urls_from_local_sitemap(file_path, pattern_prefix='https://www.bbc.co.uk/news/articles/'):
    """Parses a local sitemap XML file and returns all matching article URLs.

    Args:
        file_path: Path of the sitemap XML file to read.
        pattern_prefix (str): Only ``<loc>`` values starting with this string
            are returned.

    Returns:
        list[str]: Matching URLs in document order, with surrounding whitespace
        removed. If the file cannot be read or parsed, the reason is printed
        and an empty list is returned.
    """
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    try:
        root = ET.parse(file_path).getroot()
    except Exception as e:
        # Best-effort loader: report the problem and let the caller see "no URLs".
        print(f"Failed to parse local sitemap: {e}")
        return []

    urls = []
    for loc in root.findall('.//ns:loc', namespaces=namespace):
        # An empty <loc/> has text None. Skip it rather than letting a single
        # bad entry discard every other URL in the file.
        text = (loc.text or '').strip()
        if text and text.startswith(pattern_prefix):
            urls.append(text)
    return urls


In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_article_content_and_images(url, headers):
    """
    Fetches the title, content, main image, and date of a single BBC article.

    Each field is located by trying a list of CSS selectors in order and using
    the first one that yields a usable value.

    NOTE: a later cell in this notebook defines an Al Jazeera scraper under the
    same name; whichever of the two cells ran most recently is the one called.

    Args:
        url (str): Article URL
        headers (dict): Request headers

    Returns:
        tuple: (title, content_string, main_image_url, publication_date).
            Fields that cannot be found fall back to "Title not found",
            "Content could not be extracted", "" and "Date not found".
            If the request or the parsing raises, no exception propagates; the
            tuple ("Error extracting title", "Error fetching content: <error>",
            "", "Date not found") is returned instead.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # EXTRACT TITLE
        title = ""
        title_selectors = [
            'h1[id="main-heading"]',
            'h1.ssrcss-1s9pby4-Heading',
            'h1 span[role="text"]',
            'h1',
        ]

        for selector in title_selectors:
            title_element = soup.select_one(selector)
            if title_element:
                title = title_element.get_text(strip=True)
                break

        if not title:
            title = "Title not found"

        # EXTRACT DATE
        publication_date = ""
        date_selectors = [
            'time[data-testid="timestamp"]',  # BBC timestamp element
            'time[datetime]',  # Generic time with datetime attribute
            '[data-component="metadata-block"] time',  # Time in metadata block
            '.ssrcss-1pvwv4b-MetadataSnippet time',  # BBC metadata time
        ]

        for selector in date_selectors:
            date_element = soup.select_one(selector)
            if date_element:
                # Prefer the machine-readable datetime attribute over the visible text
                publication_date = (date_element.get('datetime') or
                                    date_element.get_text(strip=True))
                break

        if not publication_date:
            publication_date = "Date not found"

        # EXTRACT CONTENT
        content_selectors = [
            '[data-component="text-block"] p',
            '.story-body__inner p',
            '[data-component="text-block"]',
            'article p',
            '.gel-body-copy p',
            '.ssrcss-1q0x1qg-Paragraph p'
        ]

        content_paragraphs = []
        for selector in content_selectors:
            for element in soup.select(selector):
                text = element.get_text(strip=True)
                if text and len(text) > 20:  # ignore very short fragments
                    content_paragraphs.append(text)
            # Stop at the first selector that produced usable text. A selector
            # whose matches were all filtered out must not block the fallbacks.
            if content_paragraphs:
                break

        # EXTRACT MAIN IMAGE
        main_image_url = ""
        image_selectors = [
            'article img.ssrcss-11yxrdo-Image',
            '[data-component="image-block"] img',
            'figure img',
            'img[src*="ichef.bbci.co.uk"]'
        ]

        for selector in image_selectors:
            img = soup.select_one(selector)
            if img:
                img_url = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
                if img_url:
                    # Turn protocol-relative and site-relative URLs into absolute ones
                    if img_url.startswith('//'):
                        img_url = 'https:' + img_url
                    elif img_url.startswith('/'):
                        img_url = 'https://www.bbc.co.uk' + img_url

                    # Only accept images served from the ichef.bbci.co.uk host
                    if 'ichef.bbci.co.uk' in img_url:
                        main_image_url = img_url
                        break

        content = '\n\n'.join(content_paragraphs) if content_paragraphs else "Content could not be extracted"

        return title, content, main_image_url, publication_date

    except Exception as e:
        # Best-effort scraper: report the failure through the returned fields.
        return "Error extracting title", f"Error fetching content: {str(e)}", "", "Date not found"

def scrape_articles_from_urls(article_urls, max_articles=10):
    """
    Scrapes articles from a collection of URLs (from sitemap or crawl)

    Args:
        article_urls (iterable): Article URLs to scrape. Any iterable works,
            including the set returned by the crawler; for unordered inputs
            the choice of which URLs fall within ``max_articles`` is arbitrary.
        max_articles (int): Maximum number of articles to process

    Returns:
        list: List of dictionaries containing article information, with keys
            'title', 'url', 'content', 'main_image', 'publication_date' and
            'source'.
    """

    # Headers to mimic a real browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Materialise first: sets and generators cannot be sliced directly.
    selected = list(article_urls)[:max_articles]
    total = len(selected)

    articles = []

    print(f"Processing {total} articles...")

    for i, url in enumerate(selected):
        print(f"Processing article {i+1}/{total}: {url[:70]}...")

        try:
            # Extract article data including date
            title, content, main_image_url, publication_date = get_article_content_and_images(url, headers)

            articles.append({
                'title': title,
                'url': url,
                'content': content,
                'main_image': main_image_url,
                'publication_date': publication_date,
                'source': 'BBC'
            })

            # Pause between requests to avoid hammering the site
            time.sleep(1)

        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue

    return articles



# Example usage function for sitemap URLs
def process_sitemap_urls(article_urls, max_articles=10):
    """
    Entry point for scraping a batch of sitemap URLs.

    Prints how many URLs were supplied, then hands them to
    scrape_articles_from_urls and returns its result unchanged.

    Args:
        article_urls (list): List of article URLs from sitemap
        max_articles (int): Maximum number of articles to process

    Returns:
        list: Whatever scrape_articles_from_urls returns for these arguments.
    """
    url_count = len(article_urls)
    print(f"Starting to process {url_count} article URLs from sitemap...")
    return scrape_articles_from_urls(article_urls, max_articles)

# Main execution
if __name__ == "__main__":
    print("BBC Article Scraper for Sitemap URLs", "=" * 40, sep="\n")

    # Sample input - swap in the article URLs loaded from your own sitemap
    example_urls = [
        "https://www.bbc.co.uk/news/articles/cq6912mqp1go",
        # Add more URLs from your sitemap here
    ]

    print(
        "To use this script:",
        "1. Replace 'example_urls' with your actual list of article URLs from the sitemap",
        "2. Call process_sitemap_urls(your_article_urls, max_articles=10)",
        "\nExample:",
        "articles = process_sitemap_urls(your_urls_list, max_articles=5)",
        sep="\n",
    )

    # Smoke test: scrape at most two of the sample URLs
    articles = process_sitemap_urls(example_urls, max_articles=2)

BBC Article Scraper for Sitemap URLs
To use this script:
1. Replace 'example_urls' with your actual list of article URLs from the sitemap
2. Call process_sitemap_urls(your_article_urls, max_articles=10)

Example:
articles = process_sitemap_urls(your_urls_list, max_articles=5)
Starting to process 1 article URLs from sitemap...
Processing 1 articles...
Processing article 1/1: https://www.bbc.co.uk/news/articles/cq6912mqp1go...
BBC Articles with Content, Main Image, and Date

1. 'Mariupol is diseased': Residents deny Russian claims occupied city returning to normal
URL: https://www.bbc.co.uk/news/articles/cq6912mqp1go
Published: 2025-06-29T21:00:30.616Z
--------------------------------------------------
CONTENT:
"What they're showing on Russian TV are fairy tales for fools. Most of Mariupol still lies in ruins," says John, a Ukrainian living in Russian-occupied Mariupol. We've changed his name as he fears reprisal from Russian authorities.

"They are repairing the facades of the buildings

In [None]:
# Headers to mimic a real browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
articles = []
# NOTE(review): machine-specific absolute path - consider a path relative to the project.
file_path = r'C:\Users\AbhinavChhabra\projects\unbiasedupdates\sitemap_articles.xml'
article_urls = get_article_urls_from_local_sitemap(file_path, pattern_prefix='https://www.bbc.co.uk/news/articles/')

# Scrape every URL from the sitemap, collecting one record per article.
for url in article_urls:

    try:
        # Extract article data including date
        title, content, main_image_url, publication_date = get_article_content_and_images(url, headers)

        articles.append({
            'title': title,
            'url': url,
            'content': content,
            'main_image': main_image_url,
            'publication_date': publication_date,
            'source': 'BBC'
        })

        # Pause between requests to avoid hammering the site
        time.sleep(1)

    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue
    

In [None]:
# Load the BBC article URLs back from the saved sitemap (default prefix filter)
# and report how many were found.
# NOTE(review): machine-specific absolute path - consider a path relative to the project.
file_path = r'C:\Users\AbhinavChhabra\projects\unbiasedupdates\sitemap_articles.xml'
article_urls = get_article_urls_from_local_sitemap(file_path)
print(f"Found {len(article_urls)} article URLs.")

Found 1000 article URLs.


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from langchain.prompts import (
    ChatMessagePromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from pprint import pprint

In [102]:
def _extract_text_between_last_tag_pair(xml_text, tag):
    """
    Extracts the content between the last pair of opening and closing tags.

    Args:
        xml_text (str): The full input string (e.g., XML or custom-tagged text).
        tag (str): The tag name (without angle brackets), e.g., 'final_answer'.

    Returns:
        str: The content between the last occurrence of the opening and closing tags.

    Raises:
        ValueError: If the opening or closing tag is not found.
    """
    open_tag = f"<{tag}>"
    close_tag = f"</{tag}>"

    end_idx = xml_text.rfind(close_tag)
    if end_idx == -1:
        raise ValueError(f"Closing tag '{close_tag}' not found.")

    start_idx = xml_text.rfind(open_tag, 0, end_idx)
    if start_idx == -1:
        raise ValueError(f"Opening tag '{open_tag}' not found.")

    start_idx += len(open_tag)
    return xml_text[start_idx:end_idx].strip()

In [77]:
import os

# Credentials are read from the environment so they never live in the notebook,
# its outputs or version control. A missing variable raises KeyError here rather
# than failing later inside a model call.
# SECURITY: any key that was previously written into this cell must be treated
# as leaked - revoke it with the provider and issue a new one.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
llm_g_2_5 = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro", google_api_key=GOOGLE_API_KEY
)

llm_g_2_5_f = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash", google_api_key=GOOGLE_API_KEY
)

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
llm_4o = ChatOpenAI(
    model="gpt-4o",
    api_key=OPENAI_API_KEY
)


In [33]:
def lg_runnable(
    llm: Any,
    system_message: str,
    schema: Optional[Any] = None,  # left as 'Any'; narrow the type if a concrete schema type is settled on
    strict: bool = False,
    use_schema: bool = False,
    human_message: Union[str, bool] = False,
    json_schema: bool = False,  # NOTE: not working as intended yet - ideally this would yield JSON output, but the model returns pydantic-class-style output instead.
):
    """
    Build a prompt-plus-model pipeline from message templates and output settings.

    A chat prompt is assembled from the system template and, optionally, a human
    template. The prompt is then piped either into the LLM followed by a string
    output parser, or into the LLM configured for structured output.

    Parameters:
    - llm (Any): The large language model to be configured.
    - system_message (str): Template string for the system message.
    - schema (Optional[Any]): Schema passed to ``llm.with_structured_output``
      when ``use_schema`` is set. None means no schema.
    - strict (bool): Forwarded as ``strict=`` to ``with_structured_output`` in
      json_schema mode. Only allowed together with ``json_schema``.
    - use_schema (bool): Whether to configure the LLM for structured output.
    - human_message (Union[str, bool]): Template string for the human message,
      or False for none. True is rejected.
    - json_schema (bool): With ``use_schema``, request ``method="json_schema"``.

    Returns:
    - runnable: ``prompt | llm | StrOutputParser()`` when no schema is used,
      otherwise ``prompt | structured_llm``.

    Raises:
    - ValueError: If ``human_message`` is True, if ``strict`` is set without
      ``json_schema``, or if ``use_schema`` is set without a ``schema``.
    """
    # The system template always leads the conversation.
    templates = [SystemMessagePromptTemplate.from_template(system_message)]

    # A string adds a human turn; True is an explicit error; anything else adds nothing.
    if isinstance(human_message, str):
        templates.append(HumanMessagePromptTemplate.from_template(human_message))
    elif human_message is True:
        raise ValueError("human_message must be a string template or False, not True")

    prompt = ChatPromptTemplate.from_messages(templates)

    # Validate flag combinations before touching the model.
    if strict and not json_schema:
        raise ValueError("Strict can only be set to True with json_schema enabled.")
    if use_schema and schema is None:
        raise ValueError("When use_schema is enabled, a schema needs to be provided.")

    if not use_schema:
        # Plain text path: the model reply is parsed into a string.
        parser = StrOutputParser()
        return prompt | llm | parser

    if json_schema:
        structured_llm = llm.with_structured_output(
            schema, method="json_schema", strict=strict
        )
    else:
        structured_llm = llm.with_structured_output(schema)
    return prompt | structured_llm

In [91]:
def gemini_runnable(llm, template:str):
    """Pipe a single-template chat prompt into ``llm`` and return the chain.

    No output parser is attached, so invoking the chain yields whatever the
    model itself returns.
    """
    return ChatPromptTemplate.from_messages([template]) | llm



In [88]:
# Prompt template for the "insights view" rewrite. `{content}` is the only
# placeholder. The reply is expected to carry <insights>, <title> and
# <thumbnail_snippet> tag pairs, so each example tag shown to the model must be
# properly closed for the reply to be parseable.
SUMMARY_GEN_SYS_TEMP="""These days the news articles are long but the meaningful information they contain is quite less compared to the length of the article.  
Along with this, sometimes I see the news articles, if read carefully, look biased towards certain agenda where the writer is making some claims without much evidence.  
Therefore, to save the reader's time and provide them with unbiased news, I am creating a website which shows a concise unbiased insights view of the entire story. Although this would be concise insights, it covers all crucial aspects of the story and provides a good picture to the user about the story. With this, the reader gets a complete understanding of the topic the news article is about in a time-saving manner.  
This insights view is NOT just another summary of the article shown by some other so-called news websites, it covers all the crucial aspects of the story in a neutral and evidence-based manner.  
I will give you the entire article from some news website and you will convert it to this insights view that we talked about and I will then show it on my website.  
Here is the actual news article: {content}

## Output format:
Before generating the insights view, think about how you would prepare this view such that it covers various aspects of the story, is time-saving for the reader, and doesn’t promote the self-agenda of the news company or writer. You can think of it as pure news and no fluff. Think about what leads to such a version of the story.

Once ready, put the version of the article you have generated in the insights XML tag e.g. <insights>Here goes your version</insights>. Putting the content into the XML tags will allow me to parse the content easily, so make sure it is always present.
Apart from this also produce the tiltle of the artcile and thumbnail snippet the thumbnail snippet is a line or two of text which is shown just beneath the thumbnail picture.The title will be in the xml tag <title>Here goes the title</title> and similarly thumbnail_snippet will be in tags <thumbnail_snippet></thumbnail_snippet>.
"""

In [57]:
# Display the raw prompt template text.
SUMMARY_GEN_SYS_TEMP

'These days the news articles are long but the meaningful information they contain is quite less compared to the length of the article.  \nAlong with this, sometimes I see the news articles, if read carefully, look biased towards certain agenda where the writer is making some claims without much evidence.  \nTherefore, to save the reader\'s time and provide them with unbiased news, I am creating a website which shows a concise unbiased insights view of the entire story. Although this would be concise insights, it covers all crucial aspects of the story and provides a good picture to the user about the story. With this, the reader gets a complete understanding of the topic the news article is about in a time-saving manner.  \nThis insights view is NOT just another summary of the article shown by some other so-called news websites, it covers all the crucial aspects of the story in a neutral and evidence-based manner.  \nI will give you the entire article from some news website and you wi

In [92]:
# Build two chains from the same prompt:
#   runnable  - llm_g_2_5 via lg_runnable (no schema, so output goes through the string parser)
#   grunnable - llm_g_2_5_f via gemini_runnable (no parser; the model's own reply object is returned)
runnable=lg_runnable(llm=llm_g_2_5, system_message=SUMMARY_GEN_SYS_TEMP)
grunnable=gemini_runnable(llm=llm_g_2_5_f, template=SUMMARY_GEN_SYS_TEMP)

In [103]:
# Fill the prompt's {content} slot with the text of the scraped article at
# index 3 and pretty-print the text of the model's reply.
llm_output=grunnable.invoke({'content':articles[3]['content']})
pprint(llm_output.content)

('<title>Attacks Target Iranian Scientists Amid Nuclear Tensions</title>\n'
 '<thumbnail_snippet>Bomb blasts in Tehran kill one Iranian nuclear scientist '
 'and injure another, sparking an investigation and drawing parallels to past '
 'incidents.</thumbnail_snippet>\n'
 '<insights>\n'
 'An Iranian nuclear scientist, Dr. Majid Shahriari, was killed, and another, '
 'Dr. Fereydoon Abbasi, was injured along with their wives in two separate '
 'bomb attacks in Tehran. Dr. Shahriari was a member of the nuclear '
 'engineering department at Shahid Beheshti University. Dr. Abbasi, who holds '
 'a PhD in nuclear physics and conducts research at the defense ministry, is '
 'described as a specialist in isotope separation and a member of the '
 'Revolutionary Guards.\n'
 '\n'
 'The attacks, which occurred on Monday morning, involved a bomb attached to '
 "the victims' cars by a motorbike rider. This method is similar to the "
 "January assassination of Massoud Ali-Mohammadi. Following Ali-Moha

In [None]:
# Pull only the <insights>...</insights> body out of the model's reply.
_extract_text_between_last_tag_pair(llm_output.content, tag='insights')

'An Iranian nuclear scientist, Dr. Majid Shahriari, was killed, and another, Dr. Fereydoon Abbasi, was injured along with their wives in two separate bomb attacks in Tehran. Dr. Shahriari was a member of the nuclear engineering department at Shahid Beheshti University. Dr. Abbasi, who holds a PhD in nuclear physics and conducts research at the defense ministry, is described as a specialist in isotope separation and a member of the Revolutionary Guards.\n\nThe attacks, which occurred on Monday morning, involved a bomb attached to the victims\' cars by a motorbike rider. This method is similar to the January assassination of Massoud Ali-Mohammadi. Following Ali-Mohammadi\'s death, Tehran accused the United States and Israel of involvement, an accusation the US dismissed as "absurd." It was later clarified that Ali-Mohammadi was a quantum physicist, not a nuclear scientist, and was not involved in Iran\'s nuclear program.\n\nWhile the perpetrators of the latest attacks remain unknown, som

In [101]:
import boto3
from botocore.exceptions import ClientError

# Single source of truth for the table name, so the lookup and the messages
# printed about it cannot disagree.
TABLE_NAME = 'news_articles'

# Step 1: Create a session using the 'root-access' profile
# NOTE(review): the profile name suggests root credentials - confirm, and prefer
# a least-privilege profile for routine table access.
session = boto3.Session(profile_name='root-access',region_name='us-east-1')

# Step 2: Connect to DynamoDB
dynamodb = session.client('dynamodb')

# Step 3: Try to describe the table
try:
    response = dynamodb.describe_table(TableName=TABLE_NAME)
    print("✅ Table found!")
    print("Table status:", response['Table']['TableStatus'])
    # ItemCount is a periodically refreshed estimate, not a live count.
    print("Item count:", response['Table']['ItemCount'])
    print("Primary key schema:", response['Table']['KeySchema'])

except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceNotFoundException':
        print(f"❌ Table '{TABLE_NAME}' not found.")
    else:
        print("❌ Error accessing DynamoDB:", e)


✅ Table found!
Table status: ACTIVE
Item count: 0
Primary key schema: [{'AttributeName': 'title', 'KeyType': 'HASH'}]


# Al Jazeera content

In [118]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import xml.etree.ElementTree as ET
import time

# Al Jazeera news article URL: https://www.aljazeera.com/news/<yyyy>/<m>/<d>/<slug>,
# where the slug is lowercase letters, digits and hyphens. Anchored at both ends,
# so query strings and trailing slashes do not match.
# NOTE: this rebinds the same global name that the BBC cell defines and that the
# BBC extract_links() reads at call time.
ARTICLE_PATTERN = re.compile(r"^https://www\.aljazeera\.com/news/\d{4}/\d{1,2}/\d{1,2}/[a-z0-9\-]+$")

def fetch_page(url):
    """Fetch ``url`` and return the page parsed as a BeautifulSoup tree.

    Uses a 10 second timeout and treats non-success HTTP statuses as errors.
    Any exception during the request or parsing is printed and results in
    ``None`` being returned.
    """
    parsed = None
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        parsed = BeautifulSoup(reply.text, 'html.parser')
    except Exception as err:
        print(f"Failed to fetch {url}: {err}")
    return parsed

def extract_article_links(soup, base_url, pattern=None):
    """Extract article links matching Al Jazeera's news article pattern.

    Args:
        soup: Parsed page; only ``soup.find_all('a', href=True)`` is used.
        base_url (str): URL the page came from, used to resolve relative hrefs.
        pattern: Optional compiled regex that an absolute URL must ``match`` to
            be kept. When omitted, the module-level ``ARTICLE_PATTERN`` is used
            (looked up at call time). Passing it explicitly protects against
            another cell having rebound ``ARTICLE_PATTERN`` for a different site.

    Returns:
        set[str]: De-duplicated absolute URLs with any ``#fragment`` removed.
    """
    if pattern is None:
        pattern = ARTICLE_PATTERN
    links = set()
    for tag in soup.find_all('a', href=True):
        # Drop the fragment so "#section" variants collapse onto one URL.
        href = tag['href'].split('#')[0]
        if not href:
            continue  # the href was only a fragment (same-page anchor)
        abs_url = urljoin(base_url, href)
        if pattern.match(abs_url):
            links.add(abs_url)
    return links

def crawl_aljazeera_news(start_url='https://www.aljazeera.com/news/', max_urls=1000):
    """Breadth-first crawl of Al Jazeera news that collects article URLs.

    Each dequeued page is fetched with ``fetch_page``; article links found on
    it by ``extract_article_links`` are recorded and queued so they are crawled
    in turn. The crawl pauses one second after every fetch attempt, including
    failed ones, so an outage does not turn into a burst of rapid retries
    across the whole queue.

    Args:
        start_url (str): Page to start from. It is not itself added to the
            result unless some crawled page links to it as an article.
        max_urls (int): Stop once this many article URLs have been collected.

    Returns:
        set[str]: The article URLs discovered (at most ``max_urls``).
    """
    from collections import deque  # local import keeps this cell self-contained

    seen = set()
    # deque.popleft() is O(1); list.pop(0) shifts the whole queue on every pop.
    queue = deque([start_url])
    while queue and len(seen) < max_urls:
        url = queue.popleft()
        soup = fetch_page(url)
        if soup:
            for link in extract_article_links(soup, url):
                if link in seen:
                    continue
                seen.add(link)
                queue.append(link)  # to discover more articles through internal linking
                if len(seen) >= max_urls:
                    break
        time.sleep(1)  # be polite to the server, whether or not the fetch succeeded
    return seen

def save_sitemap(urls, output_path='sitemap_aljazeera_articles.xml'):
    """Write ``urls`` to ``output_path`` as a sitemaps.org ``<urlset>`` file.

    Entries are emitted in sorted order, one ``<url><loc>...</loc></url>`` per
    URL, and the file is UTF-8 encoded with an XML declaration. A one-line
    confirmation is printed afterwards.
    """
    root = ET.Element('urlset', {'xmlns': "http://www.sitemaps.org/schemas/sitemap/0.9"})
    for address in sorted(urls):
        entry = ET.SubElement(root, 'url')
        ET.SubElement(entry, 'loc').text = address
    ET.ElementTree(root).write(output_path, encoding='utf-8', xml_declaration=True)
    print(f"Sitemap saved to {output_path} with {len(urls)} articles.")

# Main logic: crawl with the default start URL and limit, then save the result.
# Executing this cell performs the whole crawl (one request per second).
if __name__ == '__main__':
    print("Starting Al Jazeera news crawl...")
    article_urls = crawl_aljazeera_news()
    save_sitemap(article_urls, 'sitemap_aljazeera_articles.xml')


Starting Al Jazeera news crawl...
Sitemap saved to sitemap_aljazeera_articles.xml with 1000 articles.


In [124]:
# Larger crawl (up to 5000 URLs). In the recorded run below it hit DNS failures
# and was interrupted, so this assignment never completed in that run.
article_urls = crawl_aljazeera_news(max_urls=5000)
article_urls

Failed to fetch https://www.aljazeera.com/news/2025/5/29/shock-waves-of-fear-chinese-students-grapple-with-trump-visa-uncertainty: HTTPSConnectionPool(host='www.aljazeera.com', port=443): Max retries exceeded with url: /news/2025/5/29/shock-waves-of-fear-chinese-students-grapple-with-trump-visa-uncertainty (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001989477A010>: Failed to resolve 'www.aljazeera.com' ([Errno 11001] getaddrinfo failed)"))
Failed to fetch https://www.aljazeera.com/news/2025/4/15/harvard-university-defies-trumps-demands-faces-2-3bn-in-funding-cuts: HTTPSConnectionPool(host='www.aljazeera.com', port=443): Max retries exceeded with url: /news/2025/4/15/harvard-university-defies-trumps-demands-faces-2-3bn-in-funding-cuts (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000019894779FD0>: Failed to resolve 'www.aljazeera.com' ([Errno 11001] getaddrinfo failed)"))
Failed to fetch https://www.aljazeera.com/

KeyboardInterrupt: 

In [123]:
# Number of URLs currently held in article_urls.
len(article_urls)

200

In [119]:
# Load the Al Jazeera article URLs back from the saved sitemap, keeping only
# those under /news/, and report how many were found.
# NOTE(review): machine-specific absolute path - consider a path relative to the project.
file_path = r'C:\Users\AbhinavChhabra\projects\unbiasedupdates\sitemap_aljazeera_articles.xml'
article_urls = get_article_urls_from_local_sitemap(file_path,pattern_prefix="https://www.aljazeera.com/news/")
print(f"Found {len(article_urls)} article URLs.")

Found 1000 article URLs.


In [120]:
# Display the list of URLs loaded from the sitemap file.
article_urls

['https://www.aljazeera.com/news/2003/9/25/traces-of-enriched-uranium-found-in-iran',
 'https://www.aljazeera.com/news/2007/4/16/33-killed-in-us-university-shooting',
 'https://www.aljazeera.com/news/2009/10/8/afghan-blast-targets-indian-embassy',
 'https://www.aljazeera.com/news/2010/11/29/iranian-nuclear-scientist-killed',
 'https://www.aljazeera.com/news/2011/8/1/kashmir-and-the-politics-of-water',
 'https://www.aljazeera.com/news/2012/11/21/india-hangs-lone-surviving-mumbai-attacker',
 'https://www.aljazeera.com/news/2012/12/15/us-school-shooting-leaves-20-children-dead',
 'https://www.aljazeera.com/news/2012/4/3/profile-lashkar-e-taiba',
 'https://www.aljazeera.com/news/2013/4/25/nixon-library-throws-light-on-watergate',
 'https://www.aljazeera.com/news/2014/12/17/children-massacred-in-pakistan-school-attack',
 'https://www.aljazeera.com/news/2015/11/17/russia-says-plane-in-egypts-sinai-brought-down-by-bomb',
 'https://www.aljazeera.com/news/2015/12/3/us-police-probe-motive-of-cal

In [60]:
import requests
from bs4 import BeautifulSoup
import time

def get_article_content_and_images(url, headers):
    """
    Fetches the title, content, main image, and date of a single Al Jazeera article

    Each field is located by trying a list of CSS selectors in order and using
    the first one that yields a usable value.

    NOTE: an earlier cell in this notebook defines a BBC scraper under the same
    name; whichever of the two cells ran most recently is the one called.

    Args:
        url (str): Article URL
        headers (dict): Request headers

    Returns:
        tuple: (title, content_string, main_image_url, publication_date).
            Fields that cannot be found fall back to "Title not found",
            "Content could not be extracted", "" and "Date not found".
            If the request or the parsing raises, no exception propagates; the
            tuple ("Error extracting title", "Error fetching content: <error>",
            "", "Date not found") is returned instead.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # EXTRACT TITLE
        title = ""
        title_selectors = [
            'header.article-header h1',  # Al Jazeera main title
            'h1',  # Fallback
        ]

        for selector in title_selectors:
            title_element = soup.select_one(selector)
            if title_element:
                title = title_element.get_text(strip=True)
                break

        if not title:
            title = "Title not found"

        # EXTRACT DATE
        publication_date = ""
        date_selectors = [
            '.article-dates .date-simple span[aria-hidden="true"]',  # Al Jazeera date format
            '.date-simple span[aria-hidden="true"]',  # Alternative date selector
            'time[datetime]',  # Generic time with datetime attribute
            '.article-dates',  # Broader date container
        ]

        for selector in date_selectors:
            date_element = soup.select_one(selector)
            if date_element:
                # Prefer the machine-readable datetime attribute over the visible text
                publication_date = (date_element.get('datetime') or
                                    date_element.get_text(strip=True))
                break

        if not publication_date:
            publication_date = "Date not found"

        # EXTRACT CONTENT
        content_selectors = [
            '.wysiwyg.wysiwyg--all-content p',  # Al Jazeera main content paragraphs
            '.wysiwyg p',  # Alternative content selector
            'article p',  # Generic article paragraphs
            '.article-content p',  # Another possible content selector
        ]

        content_paragraphs = []
        for selector in content_selectors:
            for element in soup.select(selector):
                # Skip elements that are likely ads or navigation.
                # find_parent() matches tag names, so the class-based containers
                # are looked up with class_ rather than as CSS selectors.
                if (element.find_parent(['aside', 'nav']) or
                        element.find_parent(class_='more-on') or
                        element.find_parent(class_='article-related-list') or
                        'newsletter' in element.get('class', []) or
                        'advertisement' in element.get_text().lower()):
                    continue

                text = element.get_text(strip=True)
                if text and len(text) > 20:  # Filter out very short text
                    content_paragraphs.append(text)
            # Stop at the first selector that produced usable text. A selector
            # whose matches were all filtered out must not block the fallbacks.
            if content_paragraphs:
                break

        # EXTRACT MAIN IMAGE
        main_image_url = ""
        image_selectors = [
            'figure.article-featured-image img',  # Al Jazeera featured image
            '.article-featured-image img',  # Alternative featured image
            'figure img',  # Generic figure image
            '.responsive-image img',  # Responsive image container
        ]

        for selector in image_selectors:
            img = soup.select_one(selector)
            if img:
                img_url = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
                if img_url:
                    # Handle relative URLs
                    if img_url.startswith('//'):
                        img_url = 'https:' + img_url
                    elif img_url.startswith('/'):
                        img_url = 'https://www.aljazeera.com' + img_url

                    # Accept URLs containing 'wp-content' or any absolute https URL
                    if 'wp-content' in img_url or img_url.startswith('https://'):
                        main_image_url = img_url
                        break

        content = '\n\n'.join(content_paragraphs) if content_paragraphs else "Content could not be extracted"

        return title, content, main_image_url, publication_date

    except Exception as e:
        # Best-effort scraper: report the failure through the returned fields.
        return "Error extracting title", f"Error fetching content: {str(e)}", "", "Date not found"

def scrape_articles_from_urls(article_urls, max_articles=10):
    """
    Scrapes Al Jazeera articles from a list of URLs

    Every selected URL is handed to ``get_article_content_and_images`` together
    with a browser-like header set, and the four values it returns are stored
    in one dictionary per article.

    Args:
        article_urls (list): List of article URLs to scrape
        max_articles (int): Maximum number of articles to process. Zero or a
            negative value processes nothing.

    Returns:
        list: List of dictionaries containing article information, with the
        keys 'title', 'url', 'content', 'main_image', 'publication_date' and
        'source'. A URL whose processing raises is reported on stdout and
        left out of the result.
    """

    # Headers to mimic a real browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Clamp the limit: a negative value would otherwise slice from the END of
    # the list (urls[:-1] keeps all but the last) and print a negative total.
    limit = max(max_articles, 0)
    selected_urls = article_urls[:limit]
    total = len(selected_urls)

    articles = []

    print(f"Processing {total} Al Jazeera articles...")

    for i, url in enumerate(selected_urls):
        # Be respectful with request timing: pause between consecutive
        # requests only, so nothing is wasted after the final article and a
        # failed iteration does not skip the delay before the next request.
        if i > 0:
            time.sleep(1.5)

        print(f"Processing article {i+1}/{total}: {url[:70]}...")

        try:
            # Extract article data including date
            title, content, main_image_url, publication_date = get_article_content_and_images(url, headers)

            articles.append({
                'title': title,
                'url': url,
                'content': content,
                'main_image': main_image_url,
                'publication_date': publication_date,
                'source': 'Al Jazeera'
            })

        except Exception as e:
            # Best effort: report the failure and move on to the next URL
            print(f"Error processing {url}: {e}")
            continue

    return articles

def display_articles(articles):
    """Print a plain-text report of scraped Al Jazeera articles.

    For every article dictionary the report shows the title, URL, publication
    date, a content preview (cut to 500 characters and suffixed with "..."
    when the content is longer than that) and the main image URL, or a
    placeholder line when the 'main_image' value is empty.

    Args:
        articles (list): Dictionaries with the keys 'title', 'url',
            'publication_date', 'content' and 'main_image'.
    """
    heavy_rule = "=" * 70
    preview_limit = 500

    print("Al Jazeera Articles with Content, Main Image, and Date")
    print(heavy_rule)

    for position, entry in enumerate(articles, start=1):
        print(f"\n{position}. {entry['title']}")
        print(f"URL: {entry['url']}")
        print(f"Published: {entry['publication_date']}")
        print("-" * 50)

        print("CONTENT:")
        if len(entry['content']) > preview_limit:
            # Long bodies are shown as a preview only
            print(entry['content'][:preview_limit] + "...")
        else:
            print(entry['content'])
        print("-" * 30)

        print("MAIN IMAGE:")
        if not entry['main_image']:
            print("  No main image found")
        else:
            print(f"  {entry['main_image']}")
        print(heavy_rule)

def process_sitemap_urls(article_urls, max_articles=10):
    """
    Scrape a batch of sitemap URLs and print whatever was retrieved

    The URLs are forwarded to ``scrape_articles_from_urls``; a non-empty
    result is printed through ``display_articles``, otherwise a short notice
    is printed instead.

    Args:
        article_urls (list): List of article URLs from sitemap
        max_articles (int): Maximum number of articles to process

    Returns:
        The value returned by ``scrape_articles_from_urls`` (possibly empty).
    """
    print(f"Starting to process {len(article_urls)} Al Jazeera article URLs from sitemap...")

    scraped = scrape_articles_from_urls(article_urls, max_articles)

    # Guard clause: nothing to show when the scrape produced no articles
    if not scraped:
        print("No articles were successfully processed.")
        return scraped

    display_articles(scraped)
    return scraped

# Main execution
if __name__ == "__main__":
    print("Al Jazeera Article Scraper for Sitemap URLs")
    print("=" * 50)

    # Fallback sample, used only when no URL list was prepared beforehand
    fallback_urls = [
        "https://www.aljazeera.com/news/2014/9/8/how-successful-was-israels-iron-dome",
        # Add more URLs from your sitemap here
    ]

    # `article_urls` is not defined in this block; it has to be created by
    # code that ran earlier. Catch the NameError so that a fresh run falls
    # back to the sample list instead of crashing on the hidden dependency.
    try:
        example_urls = list(article_urls)[:10]
    except NameError:
        print("'article_urls' is not defined; using the built-in example URL instead.")
        example_urls = fallback_urls

    print("To use this script:")
    print("1. Replace 'example_urls' with your actual list of Al Jazeera article URLs from the sitemap")
    print("2. Call process_sitemap_urls(your_article_urls, max_articles=10)")
    print("\nExample:")
    print("articles = process_sitemap_urls(your_urls_list, max_articles=5)")

    # Scrape and display the selected URLs; the result stays available as `articles`
    articles = process_sitemap_urls(example_urls, max_articles=10)

Al Jazeera Article Scraper for Sitemap URLs
To use this script:
1. Replace 'example_urls' with your actual list of Al Jazeera article URLs from the sitemap
2. Call process_sitemap_urls(your_article_urls, max_articles=10)

Example:
articles = process_sitemap_urls(your_urls_list, max_articles=5)
Starting to process 10 Al Jazeera article URLs from sitemap...
Processing 10 Al Jazeera articles...
Processing article 1/10: https://www.aljazeera.com/news/2003/9/25/traces-of-enriched-uranium-fo...
Processing article 2/10: https://www.aljazeera.com/news/2007/4/16/33-killed-in-us-university-sh...
Processing article 3/10: https://www.aljazeera.com/news/2009/10/8/afghan-blast-targets-indian-e...
Processing article 4/10: https://www.aljazeera.com/news/2010/11/29/iranian-nuclear-scientist-ki...
Processing article 5/10: https://www.aljazeera.com/news/2011/10/18/hamas-and-israel-exchange-pr...
Processing article 6/10: https://www.aljazeera.com/news/2011/8/1/kashmir-and-the-politics-of-wa...
Processing 

[{'title': 'Traces of enriched uranium found in Iran',
  'url': 'https://www.aljazeera.com/news/2003/9/25/traces-of-enriched-uranium-found-in-iran',
  'content': 'Coming a month before a UN deadline for Tehran to prove it has no secret atomic weapons programme, the latest discovery is sure to fuel the worldwide debate over Iran’s suspected nuclear ambitions.\n\nBolstered by the finding, US President George Bush said on Thursday that Iran would face “universal condemnation” if it keeps pursuing a nuclear weapons programme.\n\nBush also told reporters that Iran’s alleged pursuit of a nuclear weapon will be on the agenda for his talks on Friday and Saturday with Russian President Vladimir Putin.\n\nDiplomats speaking on condition of anonymity told Reuters news agency that new traces of enriched uranium were found in environmental samples taken during inspections at the Kalaye Electric Company on the southern outskirts of Tehran.\n\nA spokeswoman for the International Atomic Energy Agency 

In [None]:
## Scrape the content
## Check the date: if it is within one week of now and the title does not already exist in the db, then add the link details to the db, along with the summarization

In [None]:
## If this is not a news article but some random text snippet, then return null
## Only add links to the db which are within one week of the current date
## Set the parsed attribute to True if the link content is parsed by bs4
## Generate its summary only if the summary is not present in the table; alternatively, keep a summary-generated attribute set to False if the summary has not been generated for this link
## Create a different lambda function for each news provider and connect it with the step functions