In [2]:
import requests
from bs4 import BeautifulSoup

In [70]:
def _first_tag_text(soup, tag_name):
    """Return the text of the first ``tag_name`` element in ``soup``.

    Returns None when ``tag_name`` is None (field disabled) or when the page
    contains no such element, instead of raising AttributeError.
    """
    if tag_name is None:
        return None
    element = soup.find(tag_name)
    return element.text if element is not None else None


def _is_excluded(element, exclude):
    """Return True if ``element`` or one of its ancestors matches ``exclude``.

    ``exclude`` is a non-empty list of strings; each entry is compared against
    both class values and id values.
    """
    element_classes = element.get('class') or []
    if isinstance(element_classes, str):
        # Some parsers may hand back the raw attribute string instead of a list.
        element_classes = element_classes.split()

    # Any single class token of the element itself is excluded.
    if set(element_classes).intersection(exclude):
        return True
    # The element's full class value (e.g. 'copyright print-no') is excluded;
    # this mirrors how find_parent(class_=...) treats such entries on ancestors.
    if ' '.join(element_classes) in exclude:
        return True

    # An id is a single string: compare it as a whole. Wrapping it in set()
    # would split it into characters and never match a real id.
    element_id = element.get('id')
    if element_id is not None and element_id in exclude:
        return True

    # Finally look for an excluded wrapper anywhere up the tree.
    return (
        element.find_parent(class_=exclude) is not None
        or element.find_parent(id=exclude) is not None
    )


def _clean_text(text):
    """Replace newlines, tabs and non-breaking spaces with spaces, then strip."""
    for char in ('\n', '\xa0', '\t'):
        text = text.replace(char, ' ')
    return text.strip()


def scrape_news(url, paragraph_tag='p', title_tag='h1', subtitle_tag=None, exclude=None, timeout=10):
    """Download a news article and extract its title, subtitle and body text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    paragraph_tag : str or None
        Tag whose occurrences are joined (space-separated) into ``content``.
        None disables content extraction (``content`` is None).
    title_tag : str or None
        Tag whose first occurrence provides ``title``. None disables it.
    subtitle_tag : str or None
        Tag whose first occurrence provides ``subtitle``. None disables it.
    exclude : list of str, str or None
        Class or id values. A paragraph is skipped when it, or any of its
        ancestors, carries one of them. A single string is treated as a
        one-element list; None or an empty list disables filtering.
    timeout : float or tuple
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely.

    Returns
    -------
    dict or None
        ``{'title': ..., 'subtitle': ..., 'content': ...}`` with whitespace
        normalised. A field is None when its tag is disabled or (for title and
        subtitle) not present in the page. Returns None, after printing a
        message, when the HTTP status code is not 200.

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve content from {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Normalise ``exclude`` once so the per-paragraph check always sees a list.
    if exclude is None:
        exclude_values = []
    elif isinstance(exclude, str):
        exclude_values = [exclude]
    else:
        exclude_values = list(exclude)

    if paragraph_tag is not None:
        paragraphs = [
            p.text.strip()
            for p in soup.find_all(paragraph_tag)
            if not (exclude_values and _is_excluded(p, exclude_values))
        ]
        content = ' '.join(paragraphs)
    else:
        content = None

    res_dict = {
        'title': _first_tag_text(soup, title_tag),
        'subtitle': _first_tag_text(soup, subtitle_tag),
        'content': content,
    }

    # Whitespace clean-up. The former replace("\'", "'") step was dropped:
    # both literals are the same one-character string, so it never did anything.
    return {
        key: _clean_text(value) if value is not None else None
        for key, value in res_dict.items()
    }

In [78]:
def scrape_the_sun(url):
    """Scrape an article from The Sun, skipping the copyright footer text."""
    skipped = ['copyright-text']
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped)

def scrape_nbc_universal(url):
    """Scrape an NBCUniversal article, skipping social and copyright blocks."""
    skipped = [
        'block-socialblock',
        'social-btns',
        'social-links-margin',
        'block-copyrightblock',
        'copyright print-no',
    ]
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped)

def scrape_cbs_news(url):
    """Scrape a CBS News article, skipping meta/byline and breaking-news blocks."""
    skipped = [
        'content__meta',
        'content__meta--byline',
        'breaking-news__headline',
    ]
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped)

def scrape_abc_news(url):
    """Scrape an ABC News article, skipping header branding and pinned posts."""
    skipped = ['Article__Header__Branding', 'PinnedPostsContainer']
    return scrape_news(url, paragraph_tag='p', title_tag='h1', exclude=skipped)


# Scraping not allowed    
# def scrape_the_guardian(url):
#     return scrape_news(url, 'p', 'h1')

# def scrape_buzzfeed(url):
#     return scrape_news(url, 'p', 'h1')


In [79]:
# Example run of the NBCUniversal wrapper on a sample article (performs a live HTTP request).
scrape_nbc_universal('https://www.nbcuniversal.com/article/nbcuniversal-puppy-training-program-begins-30-rock')

{'title': 'NBCUniversal Partners with America’s Vet Dogs: Service Dog Training at 30 Rock Begins',
 'subtitle': None,
 'content': "NBCUniversal collaborates with America’s Vet Dogs and their sister organization the Guide Dog Foundation for a unique Puppy-in-Training Program at 30 Rock. Discover how this initiative promotes accessibility and the long-standing commitment of NBCUniversal of supporting service animals. November 10, 2023 In an effort to promote accessibility and educate employees, NBCUniversal has entered into an exciting partnershp with America’s Vet Dogs and their sister organization, the Guide Dog Foundation, on the Puppy-In-Training program. This initiative provides a comforting and valuable experience for employees, and also speaks to NBCUniversal’s commitment to accessibility. For 20 years, America’s VetDogs has trained and placed guide and service dogs to provide independence, enhanced mobility, and companionship to veterans with disabilities from all eras. In 2015, 

In [80]:
# Example run of the CBS News wrapper on a sample article (performs a live HTTP request).
scrape_cbs_news('https://www.cbsnews.com/news/israel-palestine-hamas-alshifa-hospital-gaza/')

{'title': 'Hamas-run health ministry releases video inside Al-Shifa hospital as Israeli forces encircle northern Gaza',
 'subtitle': None,
 'content': 'Tel Aviv — The Hamas-run Gaza Health Ministry has released harrowing new footage of what it says are conditions inside Al-Shifa hospital. Israeli forces are continuing to pummel northern Gaza in what the IDF says is an effort to wipe out the Islamist militant group once and for all. In a seven-minute graphic video posted on the ministry\'s Facebook page, an unnamed doctor is seen describing his surroundings as wounded and injured people are strewn across the hospital floor. "The department is completely full of wounded and injured," the unnamed doctor says in the video. "They are on the ground taking their last breaths due to the collapse of the health system, due to the lack of medicines due to the occupation besieging the hospitals," the man says as the camera focuses on seemingly lifeless bodies lying on the floor. Young children are

In [81]:
# Example run of the ABC News wrapper on a live-updates page (performs a live HTTP request).
scrape_abc_news('https://abcnews.go.com/International/live-updates/israel-gaza-hamas/?id=104617602')

{'title': 'Israel-Gaza live updates: Mass exodus from Gaza hospital',
 'subtitle': None,
 'content': 'The hospital has been treating thousands of wounded people. Israeli airstrikes trigger mass evacuations in Gaza Thousands of people have died and thousands more have been injured since the militant group Hamas launched an unprecedented surprise attack on Israel on Oct. 7 and Israel retaliated with a bombing campaign and total siege of the neighboring Gaza Strip, leaving the region on the verge of all-out war. Click here for updates from previous days. Hezbollah issued two statements claiming two attacks on Israel amid daily clashes between Hezbollah and Israel forces along Lebanon’s southern border. For the first time since the Oct. 7 Hamas attacks, Israel carried-out a strike 40 km past its northern border into Lebanon, hitting a truck in Zahrani. For now, this is contained within the the Blue Line, the UN-defined demarcation line between the two countries. As the fighting continues b

In [82]:
# Example run of The Sun wrapper on a sample article (performs a live HTTP request).
scrape_the_sun('https://www.thesun.co.uk/news/24703278/police-guard-remembrance-day-clashes')


{'title': 'Ninety-two protesters arrested after violent clashes with cops protecting the Cenotaph and pro-Palestinian march',
 'subtitle': None,
 'content': 'NINETY-two protesters have been arrested after chaos erupted near London\'s Cenotaph today as missile-hurling yobs clashed with cops. Protesters, who took to the streets over pro-Palestinian marches being held on Armistice Day, launched missiles at police as shocking scenes unfolded on the streets of the capital this morning. The counter-protests had been organised amid fears pro-Palestine marches would interrupt Remembrance services but fights broke out as cops tried to maintain a ring of steel around the Cenotaph. While police had doubled their numbers in the capital ahead of fears of violence, they struggled to maintain their staunch guard around the memorial - as chants of "England till I die" and "Let us through" echoed through the streets. But as the pro-Palestine group\'s march was underway - with about 300,000 - cops were 