In [10]:
import requests
from bs4 import BeautifulSoup

In [186]:
def _clean_text(text):
    """Replace newlines, tabs and non-breaking spaces with plain spaces and trim the ends.

    Returns None unchanged so that missing fields stay None.
    """
    if text is None:
        return None
    for char in ('\n', '\xa0', '\t'):
        text = text.replace(char, ' ')
    return text.strip()


def _first_text(soup, tag):
    """Return the text of the first `tag` element in `soup`, or None if `tag` is None or not found."""
    if tag is None:
        return None
    element = soup.find(tag)
    # soup.find returns None when nothing matches; don't crash on `.text` in that case.
    return element.text if element is not None else None


def scrape_news(url, paragraph_tag = 'p', title_tag = 'h1', subtitle_tag = None, exclude = None):
    """Download a page and extract its title, subtitle and paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to fetch with ``requests.get``.
    paragraph_tag : str or None
        Tag name whose elements make up the body text. Their stripped texts are
        joined with single spaces. ``None`` skips body extraction.
    title_tag : str or None
        Tag name of the title; the first matching element is used. ``None`` skips it.
    subtitle_tag : str or None
        Tag name of the subtitle; the first matching element is used. ``None`` skips it.
    exclude : iterable of str, str or None
        CSS class names to leave out. A paragraph is dropped when it carries one of
        these classes itself or has an ancestor that does. ``None`` (the default) or
        an empty iterable keeps every paragraph. A single string is treated as one
        class name.

    Returns
    -------
    dict or None
        ``{'title': ..., 'subtitle': ..., 'content': ...}`` where each value is a
        cleaned string, or ``None`` if that part was not requested or (for title and
        subtitle) not present on the page. Returns ``None`` and prints a message when
        the HTTP status code is not 200.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve content from {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Normalise `exclude` once so the default (None) no longer breaks iteration and a
    # bare string is not mistaken for an iterable of one-character class names.
    if exclude is None:
        excluded_classes = []
    elif isinstance(exclude, str):
        excluded_classes = [exclude]
    else:
        excluded_classes = list(exclude)

    content = None
    if paragraph_tag is not None:
        kept_paragraphs = []
        for p in soup.find_all(paragraph_tag):
            # Only filter when there is something to exclude: a None/empty class_
            # filter is not guaranteed to mean "match nothing" in find_parent.
            if excluded_classes:
                own_classes = p.get('class') or []
                if set(own_classes).intersection(excluded_classes):
                    continue
                if p.find_parent(class_=excluded_classes) is not None:
                    continue
            kept_paragraphs.append(p.text.strip())
        content = ' '.join(kept_paragraphs)

    return {
        'title': _clean_text(_first_text(soup, title_tag)),
        'subtitle': _clean_text(_first_text(soup, subtitle_tag)),
        'content': _clean_text(content),
    }

In [187]:
def scrape_the_sun(url):
    """Scrape a page from The Sun: `<p>` elements as body text, `<h1>` as title."""
    return scrape_news(url, paragraph_tag='p', title_tag='h1')

def scrape_nbc_universal(url):
    """Scrape an NBCUniversal page: `<p>` elements as body text, `<h1>` as title."""
    return scrape_news(url, paragraph_tag='p', title_tag='h1')

def scrape_cbs_news(url):
    """Scrape a CBS News page, skipping paragraphs in or under the listed CSS classes."""
    skipped_classes = ['content__meta', 'content__meta--byline', 'breaking-news__headline']
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped_classes)

def scrape_abc_news(url):
    """Scrape an ABC News page, skipping paragraphs in or under the listed CSS classes."""
    skipped_classes = ['Article__Header__Branding', 'PinnedPostsContainer']
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped_classes)


# Scraping not allowed    
# def scrape_the_guardian(url):
#     return scrape_news(url, 'p', 'h1')

# def scrape_buzfeed(url):
#     return scrape_news(url, 'p', 'h1')


In [188]:
# Live check of the CBS News scraper; this fetches the page over the network,
# so the displayed result depends on what the site serves at run time.
scrape_cbs_news('https://www.cbsnews.com/news/israel-palestine-hamas-alshifa-hospital-gaza/')

{'title': 'Hamas-run health ministry releases video inside Al-Shifa hospital as Israeli forces encircle northern Gaza',
 'subtitle': None,
 'content': 'Tel Aviv — The Hamas-run Gaza Health Ministry has released harrowing new footage of what it says are conditions inside Al-Shifa hospital. Israeli forces are continuing to pummel northern Gaza in what the IDF says is an effort to wipe out the Islamist militant group once and for all. In a seven-minute graphic video posted on the ministry\'s Facebook page, an unnamed doctor is seen describing his surroundings as wounded and injured people are strewn across the hospital floor. "The department is completely full of wounded and injured," the unnamed doctor says in the video. "They are on the ground taking their last breaths due to the collapse of the health system, due to the lack of medicines due to the occupation besieging the hospitals," the man says as the camera focuses on seemingly lifeless bodies lying on the floor. Young children are

In [136]:
# Live check of the ABC News scraper; this fetches the page over the network,
# so the displayed result depends on what the site serves at run time.
scrape_abc_news('https://abcnews.go.com/International/live-updates/israel-gaza-hamas/?id=104617602')

{'title': 'Israel-Gaza live updates: Mass exodus from Gaza hospital',
 'subtitle': None,
 'content': 'The hospital has been treating thousands of wounded people. Israeli airstrikes trigger mass evacuations in Gaza Thousands of people have died and thousands more have been injured since the militant group Hamas launched an unprecedented surprise attack on Israel on Oct. 7 and Israel retaliated with a bombing campaign and total siege of the neighboring Gaza Strip, leaving the region on the verge of all-out war. Click here for updates from previous days. Hezbollah issued two statements claiming two attacks on Israel amid daily clashes between Hezbollah and Israel forces along Lebanon’s southern border. For the first time since the Oct. 7 Hamas attacks, Israel carried-out a strike 40 km past its northern border into Lebanon, hitting a truck in Zahrani. For now, this is contained within the the Blue Line, the UN-defined demarcation line between the two countries. As the fighting continues b

In [57]:
# Live check of The Sun scraper; this fetches the page over the network,
# so the displayed result depends on what the site serves at run time.
scrape_the_sun('https://www.thesun.co.uk/news/24703278/police-guard-remembrance-day-clashes')

{'title': 'Ninety-two protesters arrested after mob hurls missiles at cops in Cenotaph chaos as pro-Palestinian march begins',
 'subtitle': None,
 'content': 'NINETY-two protesters have been arrested after chaos erupted near London\'s Cenotaph today as missile-hurling yobs clashed with cops.  Protesters, who took to the streets over Pro-Palestinian marches being held on Armistice Day, launched missiles at police as shocking scenes unfolded on the streets of the capital this morning. The counter-protests had been organised amid fears pro-Palestine marches would interrupt Remembrance services but fights broke out as cops tried to maintain a ring of steel around the Cenotaph. While police had doubled their numbers in the capital ahead of fears of violence, they struggled to maintain their staunch guard around the memorial - as chants of "England till I die" and "Let us through" echoed through the streets. But as the pro-Palestine group\'s march was underway - with about 300,000 - cops wer

In [58]:
# Live check of the NBCUniversal scraper; this fetches the page over the network,
# so the displayed result depends on what the site serves at run time.
scrape_nbc_universal('https://www.nbcuniversal.com/article/nbc-announces-robust-slate-festive-programming-ahead-holiday-season')

{'title': 'NBC Announces Robust Slate Of Festive Programming Ahead Of The Holiday Season',
 'subtitle': None,
 'content': 'Specials Include “Macy’s Thanksgiving Day Parade,” “Christmas in Rockefeller Center,” “Christmas at Graceland,” “National Dog Show Presented by Purina,” “Christmas at the Opry,” “Barry Manilow’s A Very Barry Christmas” and So Much More   November 03, 2023  NBC is bringing families together to celebrate the holiday season with its beloved specials and additional new programming.   NBC’s holiday programming reached nearly 95 million viewers last year, more than any other broadcast network.  Viewers can once again look forward to a robust slate of their favorite specials including the “Macy’s Thanksgiving Day Parade,” “Christmas in Rockefeller Center” and several others. New programming for 2023 includes “Christmas at the Opry,” hosted by Grammy Award-winning country superstar Wynonna Judd, “Christmas at Graceland” and more. “We take enormous pride in delivering a ple