In [1]:
import json
import re
from datetime import datetime, timedelta
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scraping CNET News
**Home URL**: https://www.cnet.com/news  
**Task Description**: Get the news articles published on March 11-12.  
**Attributes per news**: datetime, title, author, category, full article and link

# Discovering the structure of the Website

In [2]:
# Site root, used as the base when building full article links from listing hrefs.
root_url = 'https://www.cnet.com'
# Listing page whose article cards are scraped below; later cells also append a page number to it.
news_url = 'https://www.cnet.com/news'

## Requesting the home page then parsing it to BeautifulSoup

In [3]:
# Fetch the news listing page and parse it.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): fail loudly on a 4xx/5xx instead of parsing an error page as news.
result = requests.get(news_url, timeout=30)
result.raise_for_status()
soup = BeautifulSoup(result.content, 'html.parser')

## Getting the list of news

In [4]:
# The listing is looked up by the 'fdListingContainer' class; every article card
# inside it is a <div class="riverPost">.
list_container = soup.find(class_='fdListingContainer')
list_of_news = list_container.find_all('div', class_='riverPost')
len(list_of_news)  # number of article cards found on this page

15

## Getting the attributes per news

In [5]:
# Position (in list_of_news) of the sample card used by the exploratory cells below.
index = 3

### Datetime
The date isn't shown in the list of news. However, how long ago each article was posted is shown. You can pass that value to `timedelta` and subtract it from the current datetime to get the date posted.

In [6]:
# The card's 'timeAgo' element is unpacked as exactly two <span>s:
# a number and a unit label (e.g. 'minutes ago').
value, unit = list_of_news[index].find('div', class_='timeAgo').find_all('span')
value = int(value.text)
unit = unit.text
print(value, '-', unit)

30 - minutes ago


In [7]:
def get_date_posted(value, unit):
    """Convert a relative "time ago" label into the calendar date it refers to.

    Parameters
    ----------
    value : int
        How many units ago the article was posted.
    unit : str
        Unit label as shown on the listing, e.g. 'minutes ago' or 'hour ago'.
        Surrounding whitespace and letter case are ignored.

    Returns
    -------
    str
        The posting date formatted as '%m/%d/%Y'. It is computed relative to
        the moment of the call, so the result changes with when this is run.

    Raises
    ------
    ValueError
        If ``unit`` is not one of the recognised labels. Without this check
        the function would fail later with an opaque AttributeError on None.
    """
    # Each label (singular and plural) maps to the timedelta keyword it stands for.
    unit_to_keyword = {
        'second ago': 'seconds',
        'seconds ago': 'seconds',
        'minute ago': 'minutes',
        'minutes ago': 'minutes',
        'hour ago': 'hours',
        'hours ago': 'hours',
        'day ago': 'days',
        'days ago': 'days',
    }

    keyword = unit_to_keyword.get(unit.strip().lower())
    if keyword is None:
        raise ValueError('Unrecognised time unit: %r' % (unit,))

    date_posted = datetime.today() - timedelta(**{keyword: value})
    return date_posted.strftime('%m/%d/%Y')

# Try it on the sample value/unit parsed from the card above.
get_date_posted(value, unit)

'04/12/2022'

### Title

In [8]:
def get_title_from_list(article):
    """Return the headline of a listing card.

    The headline is the text of the card's <h3> element with leading and
    trailing whitespace removed.
    """
    heading = article.find('h3')
    headline = heading.text
    return headline.strip()

# Try it on the sample card.
get_title_from_list(list_of_news[index])

'Save 25% on Fleur & Bee Products Today'

### Link

In [9]:
def get_link_from_list(article, base_url=None):
    """Return the absolute URL of the article a listing card links to.

    Parameters
    ----------
    article
        Listing card element; the ``href`` of its first <a> is used.
    base_url : str, optional
        Base URL the href is resolved against. Defaults to the notebook-level
        ``root_url`` when omitted, which matches the original behaviour for
        site-relative hrefs such as '/news/some-article/'.

    Returns
    -------
    str
        The resolved absolute link.
    """
    base = root_url if base_url is None else base_url
    href = article.find('a').get('href')
    # urljoin resolves site-relative hrefs against the base and leaves hrefs
    # that are already absolute untouched; plain string concatenation would
    # produce a malformed URL in the latter case.
    return urljoin(base, href)

# Try it on the sample card.
get_link_from_list(list_of_news[index])

'https://www.cnet.com/health/personal-care/save-25-on-fleur-bee-products-today/'

### Category

In [10]:
def get_category_from_list(article):
    """Return the category shown on a listing card.

    The category is the text of the card's <a class="topicName"> element,
    returned as-is (no whitespace stripping).
    """
    topic_anchor = article.find('a', class_='topicName')
    return topic_anchor.text

# Try it on the sample card.
get_category_from_list(list_of_news[index])

'Skin Care'

### Author

In [11]:
def get_author_from_list(article):
    """Return the author name shown on a listing card.

    The name is the text of the <a class="name"> element nested inside the
    card's <span class="assetAuthor">.
    """
    byline = article.find('span', class_='assetAuthor')
    name_anchor = byline.find('a', class_='name')
    return name_anchor.text

# Try it on the sample card.
get_author_from_list(list_of_news[index])

'Robin Mosley'

### Full Article
The full article is not shown in the list of news; you have to open the article's own page to get its full contents.

In [12]:
# Fetch and parse the sample article's own page.
# NOTE: this rebinds `result` and `soup`, which held the listing page until now.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): fail loudly on a 4xx/5xx instead of parsing an error page.
result = requests.get(get_link_from_list(list_of_news[index]), timeout=30)
result.raise_for_status()
soup = BeautifulSoup(result.content, 'html.parser')

In [13]:
def get_official_datetime_from_page(article):
    """Return the publication timestamp text of a parsed article page.

    The value is the raw text of the page's <time> element, e.g.
    'April 12, 2022 7:19 a.m. PT'; it is not parsed into a datetime.
    """
    time_element = article.find('time')
    return time_element.text

# Try it on the sample article page fetched above.
get_official_datetime_from_page(soup)

'April 12, 2022 7:19 a.m. PT'

In [14]:
def _join_tag_children(children):
    """Serialise the tag children of a container into a single HTML string."""
    parts = []
    for element in children:
        # Only keep real tags; bare strings between tags are skipped.
        if type(element).__name__ != 'Tag':
            continue
        # Collapse every run of whitespace (including newlines) into one space.
        stringed_element = re.sub(r'\s+', ' ', str(element).strip())
        # Filter out embedded widgets that are not part of the article itself.
        if 'myfinance-news' in stringed_element or 'lazyloadElement' in stringed_element:
            continue
        parts.append(stringed_element)
    return ''.join(parts)


def get_full_article_from_page(article):
    """Return the article body of a parsed article page as an HTML string.

    Two page layouts are handled:

    * regular articles, whose content is in <div class="article-main-body">;
      the last 5 children of that container are dropped because they are not
      part of the article;
    * gallery pages, whose content is in <div class="c-galleryVertical">;
      all of its children are used.

    Returns
    -------
    str
        The concatenated, whitespace-normalised HTML of the kept tags, or an
        empty string when the page has neither container.
    """
    # Choose the layout with explicit checks instead of a bare `except:`,
    # which would also swallow KeyboardInterrupt and hide genuine bugs.
    main_body = article.find('div', class_='article-main-body')
    if main_body is not None:
        return _join_tag_children(main_body.contents[:-5])

    gallery = article.find('div', class_='c-galleryVertical')
    if gallery is not None:
        return _join_tag_children(gallery.contents)

    return ''

# Try it on the sample article page fetched above.
get_full_article_from_page(soup)

'<figure class="image image-large pull-none shortcode" section="shortcodeImage"><span class="imageContainer"><span><img alt="fleur-bee.png" class="" height="615" src="https://www.cnet.com/a/img/resize/cefdd139784264d4e8c3bd5cbd35226349ec1a93/2022/04/12/a90b22f2-0939-4c15-9cc7-8e9594d6595c/fleur-bee.png?auto=webp&amp;width=1092" width="1092"/></span></span><figcaption><span class="credit"> Amazon </span></figcaption></figure><p class="speakableTextP1">Some moms admire makeup, some love fashion and others love skin care. In honor of Mother\'s Day, you can <span class="comLink nolinks norewrite" data-track="commerceLink" section="commerce-link"><a data-component="leadsTracker" data-leads-tracker-options=\'{"appendCreditCardParams":false,"numNodes":1,"fireProductViewed":false,"trackingData":{"asid":"","assetguid":"aa89413e-cddb-44fd-836d-bd4e544dd328","contype":"review","destUrl":"https:\\/\\/assoc-redirect.amazon.com\\/g\\/r\\/https:\\/\\/www.amazon.com\\/s?k=Fleur+%26+Bee&amp;ref=bl_dp_s

`[:-5]` drops the last 5 children of the article body container, because those trailing elements are not part of the full article.

# Finding then saving the articles which were posted on March 11-12

In [15]:
# Walk the paginated listing (newest first) and collect every article posted on
# March 11-12, 2022. Posting dates are derived from "time ago" labels, so they
# are relative to the moment this cell runs and the starting page is only valid
# around the original run date.
has_found_march_12 = False # starts in march 12 since the articles are sorted by date in a descending order
has_finished_march_11 = False
page = 100 # starts at page 100

news_articles = []
seen_links = set()  # links already collected; gives an O(1) duplicate check

# Compare real dates rather than '%m/%d/%Y' strings: string comparison orders by
# month first and gives wrong answers as soon as two different years are involved.
date_format = '%m/%d/%Y'
newest_wanted = datetime.strptime('03/12/2022', date_format).date()
first_too_old = datetime.strptime('03/10/2022', date_format).date()

while(not(has_found_march_12 and has_finished_march_11)):
    # timeout avoids hanging forever; raise_for_status avoids parsing an error page.
    result = requests.get('%s/%d' % (news_url, page), timeout=30)
    result.raise_for_status()
    soup = BeautifulSoup(result.content, 'html.parser')

    list_container = soup.find(class_='fdListingContainer')
    list_of_news = list_container.find_all('div', class_='riverPost') if list_container is not None else []
    if not list_of_news:
        # Ran past the last listing page (or the layout changed): stop instead of
        # crashing on dates[0] below or looping forever.
        print('=== Page %d has no articles, stopping ===' % page)
        break

    dates = []

    for article in list_of_news:
        value, unit = article.find('div', class_='timeAgo').find_all('span')
        extracted_datetime = get_date_posted(int(value.text), unit.text)
        dates.append(extracted_datetime)
        posted_on = datetime.strptime(extracted_datetime, date_format).date()
        if(posted_on <= newest_wanted):
            has_found_march_12 = True
        if(posted_on <= first_too_old):
            # First article older than March 11: everything after it is older too.
            has_finished_march_11 = True
            break
        if(has_found_march_12 and not has_finished_march_11): # add to news_articles
            link = get_link_from_list(article)
            if(link not in seen_links):
                seen_links.add(link)
                news_articles.append({
                    'title': get_title_from_list(article),
                    'author': get_author_from_list(article),
                    'category': get_category_from_list(article),
                    'link': link,
                })
    print('=== Page %d ===' % page)
    print('Max Date:', dates[0])
    print('Min Date:', dates[-1])
    print('==============')
    page += 1

=== Page 100 ===
Max Date: 03/14/2022
Min Date: 03/13/2022
=== Page 101 ===
Max Date: 03/13/2022
Min Date: 03/12/2022
=== Page 102 ===
Max Date: 03/12/2022
Min Date: 03/11/2022
=== Page 103 ===
Max Date: 03/11/2022
Min Date: 03/11/2022
=== Page 104 ===
Max Date: 03/11/2022
Min Date: 03/11/2022
=== Page 105 ===
Max Date: 03/11/2022
Min Date: 03/11/2022
=== Page 106 ===
Max Date: 03/11/2022
Min Date: 03/10/2022


In [16]:
# Preview the first few collected listing entries.
news_articles[:5]

[{'title': 'California Wine Ruined by Wildfires Leads Chemists to Analyze Grapes for Smoke',
  'author': 'Monisha Ravisetti',
  'category': 'Climate',
  'link': 'https://www.cnet.com/science/climate/california-wine-ruined-by-wildfires-leads-chemists-to-analyze-grapes-for-smoke/'},
 {'title': 'Check Out How Porsche and Pixar Are Creating a Modern Sally Carrera 911',
  'author': 'Daniel Golson',
  'category': 'Coupes',
  'link': 'https://www.cnet.com/roadshow/pictures/porsche-911-sally-carrera-pixar-cars-design-process/'},
 {'title': 'You Can Snag a Ring Video Doorbell for $30 Right Now -- Today Only (Update: Expired)',
  'author': 'Adrian Marlow',
  'category': 'Security Cameras',
  'link': 'https://www.cnet.com/home/security/you-can-snag-a-ring-video-doorbell-for-30-right-now-today-only/'},
 {'title': "Prime Members Can Grab Amazon's Halo Fitness and Sleep-Tracking Band for Just $55",
  'author': 'Adrian Marlow',
  'category': 'Fitness Accessories',
  'link': 'https://www.cnet.com/heal

In [17]:
# How many distinct articles were collected from the listing pages.
len(news_articles)

67

# Going through each page and getting the full attributes

In [18]:
# Visit every collected article page and build the final records, combining the
# listing attributes with the timestamp and body taken from the page itself.
data = []
for i, article in enumerate(news_articles):
    print('[%d] %s' % (i, article['title']))
    # timeout: one stalled connection must not hang the whole crawl.
    # raise_for_status(): stop on a 4xx/5xx rather than scraping an error page.
    result = requests.get(article['link'], timeout=30)
    result.raise_for_status()
    soup = BeautifulSoup(result.content, 'html.parser')
    data.append({
        'Title': article['title'],
        'Author': article['author'],
        'Datetime Posted': get_official_datetime_from_page(soup),
        'Category': article['category'],
        'Contents': get_full_article_from_page(soup),
        'Link': article['link'],
    })

[0] California Wine Ruined by Wildfires Leads Chemists to Analyze Grapes for Smoke
[1] Check Out How Porsche and Pixar Are Creating a Modern Sally Carrera 911
[2] You Can Snag a Ring Video Doorbell for $30 Right Now -- Today Only (Update: Expired)
[3] Prime Members Can Grab Amazon's Halo Fitness and Sleep-Tracking Band for Just $55
[4] Bring Home a Massive 75-Inch 4K Screen for Just $690 Today
[5] 6 Things That Didn't Make an Appearance at the Apple Event
[6] Apple's Missing Out by Not Having an Under-$300 iPhone SE
[7] 9 Great Reads From CNET This Week: iPhone SE, Magic Leap 2, Tech in Texas and More
[8] Chase Center, Home of the Golden State Warriors, Adds New Plant-Based Eatery
[9] Robot Battery-Swapping Might Be the Best Way to Recharge an EV
[10] The Real Cost of Setting Up an Amazon Alexa Smart Home
[11] Apple Studio Display vs. Pro Display XDR: The Same, Yet Not
[12] The White House Issues First Crypto Order. This Week's Top Bitcoin and Crypto News
[13] You Can Rearrange Your Sp

In [19]:
# Preview the first few full records.
data[:5]

[{'Title': 'California Wine Ruined by Wildfires Leads Chemists to Analyze Grapes for Smoke',
  'Author': 'Monisha Ravisetti',
  'Datetime Posted': 'March 12, 2022 2:14 p.m. PT',
  'Category': 'Climate',
  'Contents': '<figure class="image image-large pull-none hasCaption shortcode" section="shortcodeImage"><span class="imageContainer"><span><img alt="gettyimages-1228760182" class="" height="729" src="https://www.cnet.com/a/img/resize/3ef6723033173240150cd81213edcc956cc08252/2022/03/11/04a698ed-d218-4f8e-997c-1a95683da384/gettyimages-1228760182.jpg?auto=webp&amp;width=1092" width="1092"/></span></span><figcaption><span class="caption"><p>The Glass Fire burns near vineyards in Napa Valley, California, in September 2020.</p></span><span class="credit"> Samuel Corum/Getty Images </span></figcaption></figure><p class="speakableTextP1">The year 2020 left <a data-component="externalLink" href="https://www.winespectator.com/articles/how-did-2020-s-wildfires-impact-california-wine" rel="noopene

In [20]:
# One record is appended per entry in news_articles, so the counts should match.
len(data)

67

# Export the data to a json file

In [21]:
# Serialise the scraped records to a JSON array and write it to disk in one go.
output_path = 'cnet 03-11-22 till 03-12-22.json'
serialised = json.dumps(data)
with open(output_path, 'w') as file:
    file.write(serialised)

# Further Analysis (Optional)

In [22]:
# Load the scraped records into a DataFrame (one row per article) and preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Title,Author,Datetime Posted,Category,Contents,Link
0,California Wine Ruined by Wildfires Leads Chem...,Monisha Ravisetti,"March 12, 2022 2:14 p.m. PT",Climate,"<figure class=""image image-large pull-none has...",https://www.cnet.com/science/climate/californi...
1,Check Out How Porsche and Pixar Are Creating a...,Daniel Golson,"March 11, 2022 1:57 p.m. PT",Coupes,"<div class=""c-galleryItem""><div class=""c-galle...",https://www.cnet.com/roadshow/pictures/porsche...
2,You Can Snag a Ring Video Doorbell for $30 Rig...,Adrian Marlow,"March 12, 2022 9:33 a.m. PT",Security Cameras,"<figure class=""image image-large pull-none sho...",https://www.cnet.com/home/security/you-can-sna...
3,Prime Members Can Grab Amazon's Halo Fitness a...,Adrian Marlow,"March 12, 2022 7:56 a.m. PT",Fitness Accessories,"<figure class=""image image-large pull-none sho...",https://www.cnet.com/health/fitness/prime-memb...
4,Bring Home a Massive 75-Inch 4K Screen for Jus...,Adrian Marlow,"March 12, 2022 7:02 a.m. PT",TVs,"<figure class=""image image-large pull-none sho...",https://www.cnet.com/tech/home-entertainment/b...
