In [32]:
# Import necessary libraries (standard library first, then third-party)
import datetime
import re
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [26]:
# URL for BBC News homepage
news_url = "https://www.bbc.com/news"
# Fetch and parse the page.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed as if it were the homepage.
response = requests.get(news_url, timeout=10)
response.raise_for_status()
news_soup = BeautifulSoup(response.content, "html.parser")

In [27]:
# Collect headline elements from the parsed homepage, print each one with its
# link, then fetch the linked page and print its date and a short text snippet.
#
# Try multiple selectors for headlines, from most to least specific.
headlines = news_soup.find_all("h3", class_="gs-c-promo-heading__title")

# If not found, try anchor tags with class 'gs-c-promo-heading'
if not headlines:
    promo_anchors = news_soup.select("a.gs-c-promo-heading")
    headlines = [a for a in promo_anchors if a.text.strip()]

# If still not found, fallback to all anchor tags with '/news/' in href and non-empty text
if not headlines:
    headlines = [
        a for a in news_soup.find_all("a", href=True)
        if "/news/" in a["href"] and a.text.strip()
    ]

if not headlines:
    print("No headlines found using known selectors.")
else:
    for idx, headline in enumerate(headlines, start=1):
        # Get the headline text and link
        headline_text = headline.text.strip()
        # Take the URL from the tag itself when it is an anchor, otherwise
        # from the nearest enclosing anchor.
        link = None
        if headline.name == "a" and headline.has_attr("href"):
            link = headline["href"]
        else:
            parent_a = headline.find_parent("a", href=True)
            if parent_a:
                link = parent_a["href"]
        # Make sure the link is absolute. urljoin leaves absolute URLs alone
        # and also resolves protocol-relative ("//host/...") and bare
        # relative hrefs, which a plain startswith("/") prefix would mangle.
        if link:
            link = urljoin("https://www.bbc.com", link)
        # Print headline
        print(f"{idx}. {headline_text}")
        if link:
            print(f"   Link: {link}")
            # Fetch the news article page
            try:
                # Timeout so one stalled article cannot block the whole loop;
                # HTTP errors are reported through the handler below.
                article_resp = requests.get(link, timeout=10)
                article_resp.raise_for_status()
                article_soup = BeautifulSoup(article_resp.content, "html.parser")
                # Prefer paragraphs inside <article>, then role="main",
                # and finally every <p> on the page.
                article_tag = article_soup.find("article")
                if not article_tag:
                    article_tag = article_soup.find(attrs={"role": "main"})
                if article_tag:
                    paragraphs = article_tag.find_all("p")
                else:
                    paragraphs = article_soup.find_all("p")
                # Combine the text of all paragraphs
                article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
                # Keep a snippet (first 400 chars), marking truncation
                snippet = article_text[:400] + ("..." if len(article_text) > 400 else "")
                # Try to extract date and time
                date_str = ""
                # Look for <time> tag with datetime attribute
                time_tag = article_soup.find("time")
                if not time_tag:
                    # Try to find meta tag with property 'article:published_time'
                    meta_time = article_soup.find("meta", attrs={"property": "article:published_time"})
                    if meta_time and meta_time.has_attr("content"):
                        date_str = meta_time["content"]
                if not date_str and time_tag and time_tag.has_attr("datetime"):
                    date_str = time_tag["datetime"]
                elif not date_str and time_tag:
                    date_str = time_tag.get_text(strip=True)
                if date_str:
                    try:
                        # `datetime` is imported as a module, so the class has
                        # to be reached as datetime.datetime. Calling
                        # fromisoformat on the module raises AttributeError,
                        # which the previous broad handler silently swallowed,
                        # so dates were never normalised.
                        dt = datetime.datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                        # %Z is empty for naive datetimes; strip the trailing space.
                        date_str = dt.strftime("%Y-%m-%d %H:%M:%S %Z").strip()
                    except ValueError:
                        # Not an ISO 8601 string (e.g. free text taken from the
                        # <time> tag): keep the raw value as found.
                        pass
                print(f"   Date: {date_str if date_str else '(No date found)'}")
                print(f"   News: {snippet}")
            except Exception as e:
                # Best effort: one failing article must not stop the listing.
                print("   Date: (No date found)")
                print(f"   News: (Could not fetch article: {e})")
        else:
            print("   Link: (No link found)")
            print("   Date: (No date found)")
            print("   News: (No article found)")

1. Israel-Gaza War
   Link: https://www.bbc.com/news/topics/c2vdnvdg6xxt
   Date: (No date found)
2. War in Ukraine
   Link: https://www.bbc.com/news/war-in-ukraine
   Date: (No date found)
   News: Commander-in-chief Oleksandr Syrsky disputed Russian claims that Ukraine had been pushed out of Kursk. Higher defence spending will top the agenda when members of the Western alliance gather in The Hague. Kyiv says it received 1,245 bodies on Monday, while Moscow says 78 of its dead soldiers were repatriated. Dmitriy Kurashov is the first Russian soldier to stand trial in Ukraine over a battlefiel...
3. US & Canada
   Link: https://www.bbc.com/news/us-canada
   Date: (No date found)
   News: The US hit three nuclear sites in Iran overnight - but it remains unclear whether the strikes did destroy all of Iran's nuclear capabilities. As well as retaliation against US troops, a president who styled himself as "peacemaker" could face party dissent. In a televised address, Trump called on Iran to

In [28]:
# Save headlines to CSV: one row per headline with its 1-based position,
# its text, and its absolute link (None when no link could be found).
headlines_data = []
for idx, headline in enumerate(headlines, start=1):
    headline_text = headline.text.strip()
    # Take the URL from the tag itself when it is an anchor, otherwise from
    # the nearest enclosing anchor.
    link = None
    if headline.name == "a" and headline.has_attr("href"):
        link = headline["href"]
    else:
        parent_a = headline.find_parent("a", href=True)
        if parent_a:
            link = parent_a["href"]
    # urljoin leaves absolute URLs alone and also resolves protocol-relative
    # and bare relative hrefs, not only those starting with "/".
    if link:
        link = urljoin("https://www.bbc.com", link)
    headlines_data.append({
        "Index": idx,
        "Headline": headline_text,
        "Link": link
    })
# Create a DataFrame and save to CSV.
# Naming the columns explicitly guarantees a header row even when no headlines
# were scraped; a column-less frame would write a CSV with no header, which
# pd.read_csv cannot parse when the file is loaded back.
headlines_df = pd.DataFrame(headlines_data, columns=["Index", "Headline", "Link"])
headlines_df.to_csv("bbc_news_headlines.csv", index=False)


In [29]:
# Read the exported CSV back in and preview its first five rows as a sanity check
print("\nHeadlines saved to bbc_news_headlines.csv")
bbc_news_headlines = pd.read_csv("bbc_news_headlines.csv")
print(bbc_news_headlines.iloc[:5])


Headlines saved to bbc_news_headlines.csv
   Index         Headline                                          Link
0      1  Israel-Gaza War  https://www.bbc.com/news/topics/c2vdnvdg6xxt
1      2   War in Ukraine       https://www.bbc.com/news/war-in-ukraine
2      3      US & Canada            https://www.bbc.com/news/us-canada
3      4               UK                   https://www.bbc.com/news/uk
4      5           Africa         https://www.bbc.com/news/world/africa


# **India-Pakistan War news scrape**

In [30]:
# Search query for BBC
query = "pakistan india war"
# query = input("Enter search query: ")
# URL for BBC search.
# quote_plus() turns spaces into "+" and percent-encodes every other reserved
# character, so a query containing "&", "#", "+" or non-ASCII text cannot
# corrupt the URL. Replacing spaces alone does not guarantee that.
search_url = f"https://www.bbc.co.uk/search?q={quote_plus(query)}"

# The timeout prevents an indefinite hang; raise_for_status() stops an HTTP
# error page from being parsed as if it were a results page.
response = requests.get(search_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [33]:
# Accumulator for matching search hits, plus case-insensitive whole-word
# matchers for the two country names the filter looks for.
results = list()
pattern = re.compile(r"\bPakistan\b", flags=re.I)
pattern2 = re.compile(r"\bIndia\b", flags=re.I)

In [34]:
# Scan every <article>/<li> element on the search page and keep those whose
# headline or snippet mentions both countries.
#
# Start from an empty list so that re-running this cell rebuilds the results
# instead of appending a second copy of every hit.
results = []
# find_all() also returns matching elements nested inside other matches
# (e.g. an <article> inside an <li>), which can yield the same story twice;
# track (headline, link) pairs so that each story is recorded once.
seen_hits = set()
for item in soup.find_all(["article", "li"]):
    # Try to get headline and snippet
    headline_tag = item.find(["h1", "h2", "h3", "span"])
    snippet_tag = item.find("p")
    headline = headline_tag.get_text(strip=True) if headline_tag else ""
    snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
    # Both 'Pakistan' and 'India' must appear together in the headline, or
    # together in the snippet.
    in_headline = pattern.search(headline) and pattern2.search(headline)
    in_snippet = pattern.search(snippet) and pattern2.search(snippet)
    if not (in_headline or in_snippet):
        continue
    link_tag = item.find("a", href=True)
    link = link_tag["href"] if link_tag else ""
    # urljoin leaves absolute URLs alone and also resolves protocol-relative
    # and bare relative hrefs, not only those starting with "/".
    if link:
        link = urljoin("https://www.bbc.co.uk", link)
    if (headline, link) in seen_hits:
        continue
    seen_hits.add((headline, link))
    results.append({
        "headline": headline,
        "snippet": snippet,
        "link": link
    })

In [35]:
# Report every filtered search hit (numbered from 1), or say so when the
# filter matched nothing.
if results:
    for idx, res in enumerate(results, 1):
        # One print call per hit; the trailing "\n" on the last field leaves a
        # blank line between entries.
        print(
            f"{idx}. {res['headline']}",
            f"   Link: {res['link']}",
            f"   Snippet: {res['snippet']}\n",
            sep="\n",
        )
else:
    print("No results found for 'Pakistan India war' on BBC.")

1. India-Pakistan conflict: How real is the risk of nuclear war?
   Link: https://www.bbc.co.uk/news/articles/c2e373yzndro
   Snippet: India-Pakistan conflict: How real is the risk of nuclear war?

2. India-Pakistan tensions: 'Whether there is war or ceasefire, our children will not come back'
   Link: https://www.bbc.co.uk/news/articles/clyg2rv81pvo
   Snippet: India-Pakistan tensions: 'Whether there is war or ceasefire, our children will not come back'

3. India and Pakistan: The first drone war between nuclear-armed neighbours
   Link: https://www.bbc.co.uk/news/articles/cwy6w6507wqo
   Snippet: India and Pakistan: The first drone war between nuclear-armed neighbours

4. The World Tonight. Can Pakistan and India avoid war? Listen NowThe World TonightCan Pakistan and India avoid war?
   Link: https://www.bbc.co.uk/sounds/play/m002btyv
   Snippet: The World Tonight. Can Pakistan and India avoid war? Listen NowThe World Tonight

5. The Briefing Room. Are India and Pakistan on the brink

In [37]:
# Save the India-Pakistan war search results to CSV: one row per hit with its
# 1-based position, headline, link and snippet.
india_pak_war_data = [
    {
        "Index": idx,
        "Headline": res["headline"],
        "Link": res["link"],
        "Snippet": res["snippet"],
    }
    for idx, res in enumerate(results, start=1)
]
# Create a DataFrame and save to CSV.
# Naming the columns explicitly guarantees a header row even when the search
# matched nothing; a column-less frame would write a CSV with no header, which
# pd.read_csv cannot parse when the file is loaded back.
india_pak_war_df = pd.DataFrame(
    india_pak_war_data,
    columns=["Index", "Headline", "Link", "Snippet"],
)
india_pak_war_df.to_csv("bbc_india_pakistan_war.csv", index=False)

In [38]:
# Read the exported search results back in and preview the first five rows
print("\nSearch results saved to bbc_india_pakistan_war.csv")
bbc_india_pakistan_war = pd.read_csv("bbc_india_pakistan_war.csv")
print(bbc_india_pakistan_war.iloc[:5])


Search results saved to bbc_india_pakistan_war.csv
   Index                                           Headline  \
0      1  India-Pakistan conflict: How real is the risk ...   
1      2  India-Pakistan tensions: 'Whether there is war...   
2      3  India and Pakistan: The first drone war betwee...   
3      4  The World Tonight. Can Pakistan and India avoi...   
4      5  The Briefing Room. Are India and Pakistan on t...   

                                               Link  \
0  https://www.bbc.co.uk/news/articles/c2e373yzndro   
1  https://www.bbc.co.uk/news/articles/clyg2rv81pvo   
2  https://www.bbc.co.uk/news/articles/cwy6w6507wqo   
3        https://www.bbc.co.uk/sounds/play/m002btyv   
4        https://www.bbc.co.uk/sounds/play/m002bj77   

                                             Snippet  
0  India-Pakistan conflict: How real is the risk ...  
1  India-Pakistan tensions: 'Whether there is war...  
2  India and Pakistan: The first drone war betwee...  
3  The World Tonig