## HTML Beautiful Soup Attempt

In [4]:
import requests
from bs4 import BeautifulSoup

# Scrape article titles and timestamps from CNN's climate landing page.
url = "https://edition.cnn.com/world/cnn-climate"

# Create the result lists up front so the inspection cells below always find
# them defined (and empty) when the request fails, instead of raising a
# NameError or silently showing values left over from an earlier run.
article_titles = []
article_dates = []

# Send an HTTP GET request. Without a timeout, requests can wait forever on an
# unresponsive server and hang the kernel.
response = requests.get(url, timeout=10)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Candidate articles: every <div> carrying the "card" CSS class
    articles = soup.find_all("div", class_="card")

    # NOTE(review): in the recorded run both lists came back empty, so the
    # headline/timestamp selectors below presumably do not match the page's
    # current markup -- confirm against the live HTML.
    for article in articles:
        title = article.find("h3", class_="card__headline")
        date = article.find("span", class_="card__timestamp")

        # Keep only cards exposing both fields so the two lists stay aligned
        if title and date:
            article_titles.append(title.text.strip())
            article_dates.append(date.text.strip())

    # Print the extracted titles and dates
    for headline, timestamp in zip(article_titles, article_dates):
        print(f"Title: {headline}")
        print(f"Date: {timestamp}")
        print()

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [5]:
# Inspect the titles collected by the HTML scraping cell
article_titles

[]

In [3]:
# Inspect the timestamps collected by the HTML scraping cell
article_dates

[]

In [6]:
# IPython help lookup: show the signature and docstring of soup.find_all
? soup.find_all

Signature:
 soup.find_all(
    name=None,
    attrs={},
    recursive=True,
    string=None,
    limit=None,
    **kwargs,
)
Docstring:
Look in the children of this PageElement and find all
PageElements that match the given criteria.

All find_* methods take a common set of arguments. See the online
documentation for detailed explanations.

:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param recursive: If this is True, find_all() will perform a
    r

In [25]:
import requests
from bs4 import BeautifulSoup

# Second attempt at scraping CNN's climate landing page, with debug output
# to inspect the markup of each card.
url = "https://edition.cnn.com/world/cnn-climate"

# Create the result lists up front so they exist (empty) even when the
# request fails, rather than being undefined or stale for later cells.
article_titles = []
article_dates = []

# Send an HTTP GET request. Without a timeout, requests can wait forever on an
# unresponsive server and hang the kernel.
response = requests.get(url, timeout=10)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Candidate articles: every <div> carrying the "card" CSS class
    articles = soup.find_all("div", class_="card")

    for article in articles:
        # Debug output: dump the raw card markup to work out the selectors
        print(article)
        # NOTE(review): find() searches only below `article`, and a
        # multi-class string is compared against the whole class attribute.
        # The printed card carries these classes itself (plus others), so this
        # lookup presumably returns None -- confirm and adjust the selector.
        title = article.find("div", attrs= {"class":"container__item container__item--type-section"})
        print(title)
        date = article.find("time", class_="card__timestamp")

        # Keep only cards exposing both fields so the two lists stay aligned
        if title and date:
            article_titles.append(title.text.strip())
            # A <time> tag without a datetime attribute used to raise
            # KeyError here; fall back to its visible text instead.
            article_dates.append(date.get("datetime") or date.text.strip())

    # Print the extracted titles and dates
    for headline, timestamp in zip(article_titles, article_dates):
        print(f"Title: {headline}")
        print(f"Date: {timestamp}")
        print()

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

<div class="card container__item container__item--type-section container_lead-plus-headlines-with-images__item container_lead-plus-headlines-with-images__item--type-section" data-component-name="card" data-created-updated-by="true" data-open-link="/videos/world/2022/04/21/zimbabwe-fishing-climate-change-as-equals-lon-orig.cnn" data-unselectable="true" data-uri="cms.cnn.com/_components/card/instances/clezurg57000i68nztsslz64p_fill_1@published" data-video-duration="04:37">
<a class="container__link container_lead-plus-headlines-with-images__link" data-link-type="video" href="/videos/world/2022/04/21/zimbabwe-fishing-climate-change-as-equals-lon-orig.cnn">
<div class="container__item-media-wrapper container_lead-plus-headlines-with-images__item-media-wrapper" data-breakpoints='{"card--media-large": 596}'>
<div class="container__item-media container_lead-plus-headlines-with-images__item-media">
<div class="image image__hide-placeholder" data-breakpoints='{"image--eq-extra-small": 115, "ima

## RSS Beautiful Soup Attempt

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [80]:
# Install lxml so BeautifulSoup can parse the XML used by RSS feeds.
# RSS feeds are XML, which makes it easy to parse each article's title, description, link and date
# (the downside is that they don't contain the full article).
# List of CNN's RSS feeds: https://edition.cnn.com/services/rss/
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install lxml



In [70]:
# Function to get the various attributes of the article
def getArticles(articles):
    """Convert parsed RSS ``<item>`` elements into plain dictionaries.

    Args:
        articles: Iterable of parsed elements exposing ``find(name)``; each
            lookup is expected to return a child with a ``.text`` attribute,
            or ``None`` when the child is absent.

    Returns:
        A list with one dict per element, keyed by ``'title'``, ``'link'``,
        ``'description'`` and ``'published'``. A field whose child element is
        missing is stored as ``None``.
    """
    # Output key -> name of the RSS child tag it is read from.
    fields = {
        'title': 'title',
        'link': 'link',
        'description': 'description',
        'published': 'pubDate',
    }
    # getattr(..., 'text', None) tolerates find() returning None, so an item
    # without a <title> yields None for that field (as the other fields
    # already did) instead of raising AttributeError and aborting the batch.
    return [
        {key: getattr(article.find(tag), 'text', None) for key, tag in fields.items()}
        for article in articles
    ]
    
# Function to invoke CNN Scrapper
def cnn_news_scrapper(URL, timeout=10):
    """Download a CNN RSS feed and return its articles as a list of dicts.

    Args:
        URL: Address of the RSS feed to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook.

    Returns:
        The list built by ``getArticles`` from the feed's ``<item>``
        elements, or ``None`` when the download or parsing fails (the
        exception is printed rather than raised).
    """
    try:
        r = requests.get(URL, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so an error page is never
        # reported as a successful job or parsed as if it were a feed.
        r.raise_for_status()
        soupContent = BeautifulSoup(r.content,'xml')
        print('Job Succeeded returning Status Code: ', r.status_code)
        # find_all is the current spelling of the deprecated findAll alias
        items = soupContent.find_all('item')
        print('Total News Content')
        print(len(items))
        print(items)
        # Reuse the items found above rather than searching the tree again
        return getArticles(items)
    except Exception as e:
        print('Scraping failed due to the below exception')
        print(e)
        # Best-effort contract: callers receive None on any failure
        return None

In [76]:
# Run the scraper against one CNN RSS feed and keep the result in `data`
# for the DataFrame cells that follow.
rss_feed_url = 'http://rss.cnn.com/rss/money_news_international.rss'

print('Starting scraping')
data = cnn_news_scrapper(rss_feed_url)
print('Finished scraping')

Starting scraping
Job Succeeded returning Status Code:  200
Total News Content
20
[<item>
<title>Mexico ready to retaliate by hurting US farmers</title>
<link>http://money.cnn.com/2017/02/13/news/economy/mexico-trump-us-corn/index.html?section=money_news_international</link>
<guid>http://money.cnn.com/2017/02/13/news/economy/mexico-trump-us-corn/index.html?section=money_news_international</guid>
<media:thumbnail height="90" url="http://i2.cdn.turner.com/money/dam/assets/170213111654-us-corn-field-nafta-120x90.jpg" width="120"/>
<description>Mexican Senator Armando Rios Piter told CNN that he plans to introduce a bill this week that would stop Mexican purchases of American corn. </description>
<pubDate>Mon, 13 Feb 2017 12:37:06 EST</pubDate>
</item>, <item>
<title>Will the next iPhone charge wirelessly? </title>
<link>http://money.cnn.com/2017/02/13/technology/apple-wireless-charging/index.html?section=money_news_international</link>
<guid>http://money.cnn.com/2017/02/13/technology/appl

In [77]:
# Tabulate the scraped article dicts (one row per article) and preview the
# first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,title,link,description,published
0,Mexico ready to retaliate by hurting US farmers,http://money.cnn.com/2017/02/13/news/economy/m...,Mexican Senator Armando Rios Piter told CNN th...,"Mon, 13 Feb 2017 12:37:06 EST"
1,Will the next iPhone charge wirelessly?,http://money.cnn.com/2017/02/13/technology/app...,The days of plugging in an iPhone to charge wo...,"Mon, 13 Feb 2017 14:42:14 EST"
2,How 'America First' could turn into to 'India ...,http://money.cnn.com/2017/02/13/technology/ind...,"Nandan Nilekani, co-founder of one of India's ...","Mon, 13 Feb 2017 14:20:08 EST"
3,Swiss voters reject corporate tax overhaul,http://money.cnn.com/2017/02/13/pf/taxes/switz...,Read full story for latest details.,"Mon, 13 Feb 2017 10:45:35 EST"
4,Stocks hit record again. Is Trump the reason?,http://money.cnn.com/2017/02/13/investing/stoc...,The market is at all-time highs and many say T...,"Mon, 13 Feb 2017 12:35:07 EST"


In [78]:
# (rows, columns) of the scraped-articles frame
df.shape

(20, 4)

In [79]:
# Request up to 30 rows; head() returns fewer when the frame is shorter
# (the recorded run reported 20 rows via df.shape).
df.head(30)

Unnamed: 0,title,link,description,published
0,Mexico ready to retaliate by hurting US farmers,http://money.cnn.com/2017/02/13/news/economy/m...,Mexican Senator Armando Rios Piter told CNN th...,"Mon, 13 Feb 2017 12:37:06 EST"
1,Will the next iPhone charge wirelessly?,http://money.cnn.com/2017/02/13/technology/app...,The days of plugging in an iPhone to charge wo...,"Mon, 13 Feb 2017 14:42:14 EST"
2,How 'America First' could turn into to 'India ...,http://money.cnn.com/2017/02/13/technology/ind...,"Nandan Nilekani, co-founder of one of India's ...","Mon, 13 Feb 2017 14:20:08 EST"
3,Swiss voters reject corporate tax overhaul,http://money.cnn.com/2017/02/13/pf/taxes/switz...,Read full story for latest details.,"Mon, 13 Feb 2017 10:45:35 EST"
4,Stocks hit record again. Is Trump the reason?,http://money.cnn.com/2017/02/13/investing/stoc...,The market is at all-time highs and many say T...,"Mon, 13 Feb 2017 12:35:07 EST"
5,Apple stock nears record high,http://money.cnn.com/2017/02/13/technology/app...,Apple stock is less than $1 away from its all-...,"Mon, 13 Feb 2017 12:24:58 EST"
6,"America's NAFTA nemesis: Canada, not Mexico",http://money.cnn.com/2017/02/13/news/economy/n...,President-elect Donald Trump focuses his criti...,"Mon, 13 Feb 2017 11:59:43 EST"
7,"Verizon's plan: Consumers win, investors lose",http://money.cnn.com/2017/02/13/investing/veri...,Verizon has decided to bring back unlimited da...,"Mon, 13 Feb 2017 11:32:13 EST"
8,Oil prices have doubled in a year. Here's why,http://money.cnn.com/2017/02/13/investing/oil-...,The price of crude oil has more than doubled o...,"Mon, 13 Feb 2017 09:39:58 EST"
9,Tesla will sell electric cars in the Middle East,http://money.cnn.com/2017/02/13/investing/tesl...,The automaker announced Monday that its first ...,"Mon, 13 Feb 2017 11:18:47 EST"
