In [4]:
from bs4 import BeautifulSoup
import requests

def get_news_text(url, timeout=10):
    """Download an article page and return the text of its third ``<p>`` element.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole notebook cell.

    Returns:
        The whitespace-stripped text of the third paragraph on the page, or
        an empty string when the page has fewer than three ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')
    # The [2:3] slice keeps only the third paragraph and, unlike indexing,
    # yields an empty list instead of raising on short pages.
    # NOTE(review): presumably the first two <p> tags are page boilerplate — confirm.
    news_paragraphs = soup.find_all('p')[2:3]
    return '\n'.join(p.text.strip() for p in news_paragraphs)

def scrape_and_summarize(category_url, timeout=10):
    """Print the headline, an article excerpt and the link for each linked ``<h2>``.

    The category page is downloaded once; every ``<h2>`` that wraps an anchor
    with a non-empty ``href`` is treated as a news item, and its article is
    fetched through ``get_news_text``.

    Args:
        category_url: Address of the category listing page.
        timeout: Seconds to wait for the category page before giving up.

    Returns:
        None. Results are written to standard output.

    Raises:
        requests.exceptions.RequestException: If a download fails or times out.
    """
    from urllib.parse import urljoin

    r = requests.get(category_url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'lxml')

    for h2_tag in soup.find_all('h2'):
        # Not every <h2> is a story link: ``h2_tag.a`` is None when the heading
        # has no anchor, and an anchor may lack ``href``. Skip those instead of
        # crashing with TypeError/KeyError halfway through the page.
        anchor = h2_tag.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Resolve relative links against the listing page; absolute links
        # pass through unchanged.
        link = urljoin(category_url, href)
        headline = h2_tag.text.strip()

        print("Headline:", headline)
        print("News Text:")
        print(get_news_text(link))
        print("Link:", link)
        print()

# Category listing pages that the driver loop below passes to
# scrape_and_summarize, in this order.
categories_urls = [
    "https://www.ndtv.com/india/",
    # Elections listing is disabled; the reason is not recorded here.
    # "https://www.ndtv.com/elections/elections-news/",
    "https://www.ndtv.com/latest/",
    "https://www.ndtv.com/cities/",
    "https://www.ndtv.com/education/",
    "https://www.ndtv.com/trends",
    "https://www.ndtv.com/offbeat/",
    "https://www.ndtv.com/south/"
]

# Print a banner line with the category URL, then let scrape_and_summarize
# print the items it finds on that page.
for category_url in categories_urls:
    print("Category:", category_url)
    scrape_and_summarize(category_url)


Category: https://www.ndtv.com/india/
Headline: In PM Modi's Open Letter, A List Of Achievements, And Gratitude
News Text:
Pointing out that his partnership with the people is "at the threshold of completing a decade" he has also sought suggestions to help fulfil the resolve of a Viksit Bharat, or developed India.
Link: https://www.ndtv.com/india-news/in-pm-modis-open-letter-a-list-of-achievements-and-gratitude-5246713

Headline: "Cannot Give Up On Principles No Matter...": US Envoy On Citizenship Law CAA Implementation
News Text:
In response to a question on whether an Indian national, facing charge for a murder-for-hire plot to kill a Khalistani separatist on American soil, will affect Indo-US ties, he said, "The pace of our relationship only continues to accelerate amidst this."
Link: https://www.ndtv.com/india-news/cannot-give-up-on-principles-no-matter-us-envoy-eric-garcetti-on-citizenship-law-caa-implementation-5246708

Headline: BJP's Suvendu Adhikari Mocks Mamata Banerjee's Inj

In [6]:
from bs4 import BeautifulSoup
import requests
from joblib import Parallel, delayed

def get_news_text(url, timeout=10):
    """Download an article page and return its first two ``<p>`` texts, reversed.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the worker running this scrape.

    Returns:
        The stripped text of the second paragraph, a newline, then the
        stripped text of the first paragraph. Pages with fewer than two
        ``<p>`` elements yield whatever is available (possibly ``''``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')
    # Slicing (rather than indexing) tolerates pages with fewer than two <p>.
    news_paragraphs = soup.find_all('p')[:2]
    # Emitted in reverse document order: second paragraph first.
    return '\n'.join(p.text.strip() for p in reversed(news_paragraphs))

def scrape_category(url_pattern, tag_name, pages, timeout=10):
    """Collect (headline, link, article text) tuples from a paginated listing.

    Page URLs are built by appending the page number (1..``pages``) directly
    to ``url_pattern``. On each page every ``tag_name`` element that wraps an
    anchor with a non-empty ``href`` is treated as a news item, and its
    article is fetched through ``get_news_text``.

    Args:
        url_pattern: URL prefix to which the page number is appended.
        tag_name: Name of the HTML tag that wraps each headline link.
        pages: Number of listing pages to visit, starting at page 1.
        timeout: Seconds to wait for each listing page before giving up.

    Returns:
        A set of ``(headline, link, news_text)`` tuples; duplicates across
        pages collapse into a single entry.

    Raises:
        requests.exceptions.RequestException: If a download fails or times out.
    """
    from urllib.parse import urljoin

    news_data = set()
    for page_num in range(1, pages + 1):
        page_url = f'{url_pattern}{page_num}'
        r = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'lxml')

        for item in soup.find_all(tag_name):
            # ``item.a`` is None for headings without an anchor, and an anchor
            # may lack ``href``; skip those rather than aborting the whole
            # category with TypeError/KeyError.
            anchor = item.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue

            # Resolve relative links against the listing page; absolute
            # links pass through unchanged.
            link = urljoin(page_url, href)
            headline = item.text.strip()
            news_data.add((headline, link, get_news_text(link)))
    return news_data

def process_category(category_params):
    """Run ``scrape_category`` with its arguments supplied as one sequence.

    Args:
        category_params: Positional arguments for ``scrape_category``,
            unpacked in order.

    Returns:
        Whatever ``scrape_category`` returns for those arguments.
    """
    scraped = scrape_category(*category_params)
    return scraped

# Each entry is (URL prefix, headline tag, number of pages); scrape_category
# appends the page number directly to the prefix.
# NOTE(review): the trends prefix has no "page-" suffix, so the URL requested
# is ".../trends1" — confirm that this is the intended address.
categories = [
    ('https://www.ndtv.com/india/page-', 'h2', 14),
    ('https://www.ndtv.com/latest/page-', 'h2', 8),
    ('https://www.ndtv.com/cities/page-', 'h2', 14),
    ('https://www.ndtv.com/education/page-', 'h2', 14),
    ('https://www.ndtv.com/trends', 'h3', 1),
    ('https://www.ndtv.com/offbeat/page-', 'h2', 14),
    ('https://www.ndtv.com/south/page-', 'h2', 14)
]

# One joblib task per category; results come back in the order of `categories`.
results = Parallel(n_jobs=32, verbose=100)(
    delayed(process_category)(category_params)
    for category_params in categories
)

# Dump every scraped item under a banner line naming its category prefix.
for category, data in zip(categories, results):
    print(f"{category[0]}:")
    for headline, link, news_text in data:
        # Headline, link and text on separate lines, then a blank separator.
        print(headline, link, news_text, sep="\n")
        print()


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.


[Parallel(n_jobs=32)]: Done   1 tasks      | elapsed:    5.8s
[Parallel(n_jobs=32)]: Done   2 out of   7 | elapsed:  1.4min remaining:  3.6min
[Parallel(n_jobs=32)]: Done   3 out of   7 | elapsed:  1.6min remaining:  2.1min
[Parallel(n_jobs=32)]: Done   4 out of   7 | elapsed:  2.3min remaining:  1.7min
[Parallel(n_jobs=32)]: Done   5 out of   7 | elapsed:  2.3min remaining:   55.5s
[Parallel(n_jobs=32)]: Done   7 out of   7 | elapsed:  2.6min finished
https://www.ndtv.com/india/page-:
Government Blocks 18 Streaming Platforms In India Over 'Obscene' Content
https://www.ndtv.com/india-news/government-blocks-18-streaming-platforms-in-india-over-obscene-content-5236230
The Ministry of Information & Broadcasting (I&B) has taken action, in coordination with various intermediaries, to block 18 OTT platforms publishing "obscene," "vulgar," and, in some instances, "pornographic content".
18 OTT platforms published "obscene," "vulgar," and, in some instances, "pornographic content".

Supreme Co