## HTML Beautiful Soup Attempt

In [4]:
import requests
from bs4 import BeautifulSoup

# Define the URL of the CNN page you want to scrape
url = "https://edition.cnn.com/world/cnn-climate"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find and extract article titles and dates
    articles = soup.find_all("div", class_="card")
    
    # Initialize lists to store titles and dates
    article_titles = []
    article_dates = []

    # Loop through the articles and extract titles and dates
    for article in articles:
        title = article.find("h3", class_="card__headline")
        date = article.find("span", class_="card__timestamp")
        
        if title and date:
            article_titles.append(title.text.strip())
            article_dates.append(date.text.strip())

    # Print the extracted titles and dates
    for i in range(len(article_titles)):
        print(f"Title: {article_titles[i]}")
        print(f"Date: {article_dates[i]}")
        print()

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [5]:
article_titles

[]

In [3]:
article_dates

[]

In [6]:
? soup.find_all

[0;31mSignature:[0m
 [0msoup[0m[0;34m.[0m[0mfind_all[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mattrs[0m[0;34m=[0m[0;34m{[0m[0;34m}[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrecursive[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstring[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlimit[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Look in the children of this PageElement and find all
PageElements that match the given criteria.

All find_* methods take a common set of arguments. See the online
documentation for detailed explanations.

:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param recursive: If this is True, find_all() will perform a
    r

In [25]:
import requests
from bs4 import BeautifulSoup

# Define the URL of the CNN page you want to scrape
url = "https://edition.cnn.com/world/cnn-climate"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find and extract article titles and dates
    articles = soup.find_all("div", class_="card")

    # Initialize lists to store titles and dates
    article_titles = []
    article_dates = []

    # Loop through the articles and extract titles and dates
    for article in articles:
        print(article)
        title = article.find("div", attrs= {"class":"container__item container__item--type-section"})
        print(title)
        date = article.find("time", class_="card__timestamp")
        
        if title and date:
            article_titles.append(title.text.strip())
            article_dates.append(date["datetime"])

    # Print the extracted titles and dates
    for i in range(len(article_titles)):
        print(f"Title: {article_titles[i]}")
        print(f"Date: {article_dates[i]}")
        print()

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

<div class="card container__item container__item--type-section container_lead-plus-headlines-with-images__item container_lead-plus-headlines-with-images__item--type-section" data-component-name="card" data-created-updated-by="true" data-open-link="/videos/world/2022/04/21/zimbabwe-fishing-climate-change-as-equals-lon-orig.cnn" data-unselectable="true" data-uri="cms.cnn.com/_components/card/instances/clezurg57000i68nztsslz64p_fill_1@published" data-video-duration="04:37">
<a class="container__link container_lead-plus-headlines-with-images__link" data-link-type="video" href="/videos/world/2022/04/21/zimbabwe-fishing-climate-change-as-equals-lon-orig.cnn">
<div class="container__item-media-wrapper container_lead-plus-headlines-with-images__item-media-wrapper" data-breakpoints='{"card--media-large": 596}'>
<div class="container__item-media container_lead-plus-headlines-with-images__item-media">
<div class="image image__hide-placeholder" data-breakpoints='{"image--eq-extra-small": 115, "ima

## RSS Beautiful Soup Attempt

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# install lxml to parse xml code in RSS sites
# RSS sites are in xml format and make it easy to parse the articl title, description, link and date (downside is they don't contain full article)
# list of CNN's RSS sites: https://edition.cnn.com/services/rss/
!pip install lxml



In [3]:
# Function to get the various attributes of the article
def getArticles(articles):
    all_articles = []
    for article in articles:
        article_title = article.find('title').text
        article_link = getattr(article.find('link'), 'text', None)
        article_desc = getattr(article.find('description'), 'text', None)
        article_published = getattr(article.find('pubDate'), 'text', None)
        all_articles.append({
            'title':article_title,
            'link':article_link,
            'description':article_desc,
            'published':article_published
        })
    return all_articles
    
# Function to invoke CNN Scrapper
def cnn_news_scrapper(URL):
    try:
        r = requests.get(URL)
        soupContent = BeautifulSoup(r.content,'xml')
        print('Job Succeeded returning Status Code: ', r.status_code)
        items = soupContent.findAll('item')
        print('Total News Content')
        print(len(items))
        print(items)
        return getArticles(soupContent.findAll('item'))
    except Exception as e:
        print('Scraping failed due to the below exception')
        print(e)

In [4]:
print('Starting scraping')
data = cnn_news_scrapper('https://feeds.bbci.co.uk/news/science_and_environment/rss.xml?edition=uk')
print('Finished scraping')

Starting scraping
Job Succeeded returning Status Code:  200
Total News Content
16
[<item>
<title>Nature crisis: One in six species at risk of extinction in Great Britain</title>
<description>The loss of Britain's wildlife is outpacing efforts to conserve and protect nature, a major report reveals.</description>
<link>https://www.bbc.co.uk/news/science-environment-66923930?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66923930</guid>
<pubDate>Wed, 27 Sep 2023 18:07:20 GMT</pubDate>
</item>, <item>
<title>Scientists get closer to solving mystery of antimatter</title>
<description>The elusive substance holds the key to discovering how the Universe was formed.</description>
<link>https://www.bbc.co.uk/news/science-environment-66890649?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66890649</guid>
<pubDate>Wed, 27 Sep 2023 15:06:19 GMT</pubDate>
</it

In [5]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,link,description,published
0,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT"
1,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT"
2,Government to delay new environmental building...,https://www.bbc.co.uk/news/science-environment...,Environmentalists say delaying the implementat...,"Tue, 26 Sep 2023 23:00:49 GMT"
3,Climate change: Six young people take 32 count...,https://www.bbc.co.uk/news/world-europe-669235...,They claim governments' slow action on climate...,"Wed, 27 Sep 2023 00:53:18 GMT"
4,Water firms forced to pay back customers for p...,https://www.bbc.co.uk/news/business-66922070?a...,Regulator Ofwat orders companies in England an...,"Tue, 26 Sep 2023 11:51:13 GMT"


In [6]:
df.shape

(16, 4)

In [7]:
df.head(30)

Unnamed: 0,title,link,description,published
0,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT"
1,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT"
2,Government to delay new environmental building...,https://www.bbc.co.uk/news/science-environment...,Environmentalists say delaying the implementat...,"Tue, 26 Sep 2023 23:00:49 GMT"
3,Climate change: Six young people take 32 count...,https://www.bbc.co.uk/news/world-europe-669235...,They claim governments' slow action on climate...,"Wed, 27 Sep 2023 00:53:18 GMT"
4,Water firms forced to pay back customers for p...,https://www.bbc.co.uk/news/business-66922070?a...,Regulator Ofwat orders companies in England an...,"Tue, 26 Sep 2023 11:51:13 GMT"
5,Osiris-Rex: Nasa confirms return of asteroid B...,https://www.bbc.co.uk/news/science-environment...,A capsule carrying debris from asteroid Bennu ...,"Mon, 25 Sep 2023 02:27:12 GMT"
6,"Richest oil states should pay climate tax, say...",https://www.bbc.co.uk/news/uk-politics-6690639...,The former Labour PM wants the wealthiest oil ...,"Mon, 25 Sep 2023 05:00:05 GMT"
7,Africa proposes global carbon taxes to fight c...,https://www.bbc.co.uk/news/world-africa-667335...,Despite suffering some of the worst impacts of...,"Thu, 07 Sep 2023 07:39:58 GMT"
8,Kenya's Lake Baringo: Surviving hippo and croc...,https://www.bbc.co.uk/news/world-africa-667075...,Residents say that with Lake Baringo getting b...,"Wed, 06 Sep 2023 00:58:05 GMT"
9,"Ocean heat record broken, with grim implicatio...",https://www.bbc.co.uk/news/science-environment...,The oceans are a vital regulator for the clima...,"Fri, 04 Aug 2023 04:00:03 GMT"


In [27]:
from xml.etree import ElementTree as et

class XMLCombiner(object):
    def __init__(self, filenames):
        assert len(filenames) > 0, 'No filenames!'
        # save all the roots, in order, to be processed later
        self.roots = [et.parse(f).getroot() for f in filenames]

    def combine(self):
        for r in self.roots[1:]:
            # combine each element with the first one, and update that
            self.combine_element(self.roots[0], r)
        # return the string representation
        return et.tostring(self.roots[0])

    def combine_element(self, one, other):
        """
        This function recursively updates either the text or the children
        of an element if another element is found in `one`, or adds it
        from `other` if not found.
        """
        # Create a mapping from tag name to element, as that's what we are fltering with
        mapping = {el.tag: el for el in one}
        for el in other:
            if len(el) == 0:
                # Not nested
                try:
                    # Update the text
                    mapping[el.tag].text = el.text
                except KeyError:
                    # An element with this name is not in the mapping
                    mapping[el.tag] = el
                    # Add it
                    one.append(el)
            else:
                try:
                    # Recursively process the element, and update it in the same way
                    self.combine_element(mapping[el.tag], el)
                except KeyError:
                    # Not in the mapping
                    mapping[el.tag] = el
                    # Just add it
                    one.append(el)

if __name__ == '__main__':
    r = XMLCombiner(('../../../Downloads/bbc_rss/20100714185956/feeds.bbci.co.uk/news/science_and_environment/rss.xml', '../../../Downloads/bbc_rss/20100715032343/feeds.bbci.co.uk/news/science_and_environment/rss.xml')).combine()
    print('-'*20)
    print(r)

--------------------


In [28]:
import os
import xml.etree.ElementTree as ET

# Define the root directory containing date-based subdirectories
root_directory = '../../../Downloads/bbc_rss'
branch_directory = 'feeds.bbci.co.uk/news/science_and_environment'

# Initialize a list to store XML data from all files
all_xml_paths = []

# Iterate through subdirectories by date
for date_directory in os.listdir(root_directory):
    date_directory_path = os.path.join(root_directory, date_directory)
    new_directory_path = os.path.join(date_directory_path, branch_directory)

    # Check if the item is a directory
    if os.path.isdir(new_directory_path):
        # Iterate through XML files within the date directory
        for xml_file_name in os.listdir(new_directory_path):
            if xml_file_name.endswith('.xml'):
                xml_file_path = os.path.join(new_directory_path, xml_file_name)
                all_xml_paths.append(xml_file_path)


In [29]:
all_xml_paths

['../../../Downloads/bbc_rss/20120822102438/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20140909173342/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20120817063010/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20121021180644/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20120522071126/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20110811155019/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20120917042246/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20110123045422/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20120105000719/feeds.bbci.co.uk/news/science_and_environment/rss.xml',
 '../../../Downloads/bbc_rss/20121123220046/feeds.bbci.co.uk/news/science_and_environment/rss.xml',


In [36]:
import os
from xml.etree import ElementTree as et

class XMLCombiner(object):
    def __init__(self, *filenames):
        assert len(filenames) > 0, 'No filenames!'
        # Save all the roots from the provided filenames in order
        self.roots = [ET.parse(f).getroot() for f in filenames]

    def combine(self):
        # Loop through each root except the first one
        for r in self.roots[1:]:
            # Combine each element with the first one and update it
            self.combine_element(self.roots[0], r)
        # Return the string representation of the combined root
        return ET.tostring(self.roots[0])

    def combine_element(self, one, other):
        # Create a mapping from tag name to element
        mapping = {el.tag: el for el in one}
        for el in other:
            if len(el) == 0:
                # Not nested, try to update the text or add it
                try:
                    mapping[el.tag].text = el.text
                except KeyError:
                    mapping[el.tag] = el
                    one.append(el)
            else:
                # Recursively process the nested element
                try:
                    self.combine_element(mapping[el.tag], el)
                except KeyError:
                    mapping[el.tag] = el
                    one.append(el)

if __name__ == '__main__':

    # Define the root directory containing date-based subdirectories
    root_directory = '../../../Downloads/bbc_rss'
    branch_directory = 'feeds.bbci.co.uk/news/science_and_environment'
    
    # Initialize a list to store XML data from all files
    all_xml_paths = []
    
    # Iterate through subdirectories by date
    for date_directory in os.listdir(root_directory):
        date_directory_path = os.path.join(root_directory, date_directory)
        new_directory_path = os.path.join(date_directory_path, branch_directory)
    
        # Check if the item is a directory
        if os.path.isdir(new_directory_path):
            # Iterate through XML files within the date directory
            for xml_file_name in os.listdir(new_directory_path):
                if xml_file_name.endswith('.xml'):
                    xml_file_path = os.path.join(new_directory_path, xml_file_name)
                    all_xml_paths.append(xml_file_path)
    
    # Provide a list of filenames instead of just two
    r = XMLCombiner(all_xml_paths).combine()
    print('-'*20)
    print(r)

TypeError: expected str, bytes or os.PathLike object, not list