## Use Beautiful Soup to Scrape data from RSS feeds

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# install lxml to parse xml code in RSS sites
# !pip install lxml



The **new_directory_path** below contains all the paths of the historical RSS feeds by date from the BBC climate and science located here: https://feeds.bbci.co.uk/news/science_and_environment/rss.xml?edition=uk.  

In order to run this code, you can download the historical RSS feeds by installing the waybackpack package: https://github.com/jsvine/waybackpack  
by running ```pip install waybackpack``` in your terminal.
Then, run the command ```waybackpack https://feeds.bbci.co.uk/news/science_and_environment/rss.xml?edition=uk -d [your path] --from-date 2010``` in your terminal, selecting a specific file path on your local computer where you want to download the RSS feed files.

In [3]:
import os
from xml.etree import ElementTree as ET

class XMLAppender(object):
    """This class appends the separate .xml files from each respective folder.

    The functions in this class loop through all of the .xml files from the listed file paths,
    parse the data in each file, and append the data into one string so that the content of the
    .xml files can be parsed all at once for article titles, links, descriptions, and dates.
    """
    def __init__(self, *filenames):
        assert len(filenames) > 0, 'No filenames!'
        self.filenames = filenames

    def append(self):
        """Loop through files and read content of each .xml file. Append all content to one 
        string object."""
        appended_content = ""
        for filename in self.filenames:
            xml_content = self.read_xml_file(filename)
            appended_content += xml_content
        return appended_content

    def read_xml_file(self, filename):
        """Open the .xml file and read the content"""
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            print(f"Error reading XML file '{filename}': {e}")
            return ""

if __name__ == '__main__':
    # Define the root directory containing date-based subdirectories
    root_directory = '../../../../Downloads/bbc_rss'
    branch_directory = 'feeds.bbci.co.uk/news/science_and_environment'
    
    # Initialize a list to store XML data from all files
    all_xml_paths = []
    
    # Iterate through subdirectories by date
    for date_directory in os.listdir(root_directory):
        #print(date_directory)
        date_directory_path = os.path.join(root_directory, date_directory)
        new_directory_path = os.path.join(date_directory_path, branch_directory)
    
        # Check if the item is a directory
        if os.path.isdir(new_directory_path):
            # Iterate through XML files within the date directory
            for xml_file_name in os.listdir(new_directory_path):
                if xml_file_name.endswith('.xml'):
                    xml_file_path = os.path.join(new_directory_path, xml_file_name)
                    all_xml_paths.append(xml_file_path)
        else:
            print(f"Path {new_directory_path} is not a directory.")

    # Create an instance of XMLAppender and append the XML files
    appender = XMLAppender(*all_xml_paths)
    appended_xml = appender.append()

Path ../../../../Downloads/bbc_rss/.DS_Store/feeds.bbci.co.uk/news/science_and_environment is not a directory.


In [4]:
# Function to get the various attributes of the article
def getArticles(articles):
    
    """Loop through article items and parse the title, description, link, and date of
    each article, creating a dictionary. Append each dictionary to a list."""
    
    all_articles = []
    for article in articles:
        article_title = article.find('title').text
        article_link = getattr(article.find('link'), 'text', None)
        article_desc = getattr(article.find('description'), 'text', None)
        article_published = getattr(article.find('pubDate'), 'text', None)
        all_articles.append({
            'title':article_title,
            'link':article_link,
            'description':article_desc,
            'published':article_published
        })
    return all_articles

# Initialize a list to store parsed results from each <rss> section
all_parsed_results = []

# Split the XML string into individual <rss> sections
rss_sections = appended_xml.split('</rss>')[:-1]

# Iterate through each <rss> section and parse it
for rss_section in rss_sections:
    # Add the opening <rss> tag back to form a valid XML document
    rss_section = '<rss>' + rss_section
    soup = BeautifulSoup(rss_section, 'xml')

    # Parse and process the content within the <rss> section as needed
    # Example: Extract items, titles, descriptions, etc.
    items = soup.find_all('item')

    # Append the parsed result to the list
    all_parsed_results.append(items)
    

# Combine the parsed results from all <rss> sections as needed
# Example: Flatten the list, merge, or process the data further
combined_results = [item for items in all_parsed_results for item in items]

# Print or process the combined results
# print(combined_results)

# items_2 = combined_results.findAll('item')
articles_dict = getArticles(combined_results)
#print(next)

In [5]:
df = pd.DataFrame(articles_dict)
df.tail()

Unnamed: 0,title,link,description,published
193541,'Hide the decline' revisited,http://www.bbc.co.uk/go/rss/int/news/-/news/sc...,Does Earth's new temperature record hide a dec...,"Wed, 02 Nov 2011 11:15:47 GMT"
193542,What's happened to Thalidomide babies?,http://www.bbc.co.uk/go/rss/int/news/-/news/ma...,"Thalidomide victims search for truth, 50 years on","Thu, 03 Nov 2011 01:18:34 GMT"
193543,Colourful remembrances of lost birds,http://www.bbc.co.uk/go/rss/int/news/-/news/ma...,Bright visions of birds that are no longer wit...,"Wed, 02 Nov 2011 10:46:25 GMT"
193544,7 questions on national animals,http://www.bbc.co.uk/go/rss/int/news/-/news/ma...,Test yourself on birds and beasts used as symbols,"Wed, 02 Nov 2011 10:55:50 GMT"
193545,Five things Scott found in Antarctica,http://www.bbc.co.uk/go/rss/int/news/-/news/ma...,Five strange things Captain Scott found in the...,"Wed, 02 Nov 2011 04:44:08 GMT"


In [6]:
df.shape

(193546, 4)

In [7]:
df_unique = df.drop_duplicates(subset= 'title')

In [8]:
df_unique.shape

(20102, 4)

In [9]:
df_unique.to_csv("../data/bbc_science_and_climate_articles_2010-2023.csv")