# Lab 1: Data Scraping from the Web

In [92]:
from __future__ import print_function

from bs4 import BeautifulSoup
from bs4 import ResultSet

import requests
import json 

import time
import datetime

In [65]:
# Download the article once; the parsed `soup` is what the Task 1 and Task 2 cells query.
url = 'https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
req.raise_for_status()
source = req.text
soup = BeautifulSoup(source, 'html.parser')

## Web scraping using Beautiful Soup 

### Task 1: Find all in body links

Open the above URL in your browser. You will notice that parts of the main news story are hyperlinked
to other news stories published previously. For instance, the first paragraph is linked to a news story on
Anthony Joseph winning the TS Eliot prize. Your first task is to extract the links to these other news
stories in the main news. 

In [66]:
# In-body hyperlinks are the anchors tagged data-link-name="in body link".
in_body_selector = {'data-link-name': 'in body link'}
articles = soup.find_all('a', attrs=in_body_selector)

# Show each target URL next to the first 20 characters of its link text.
for link in articles:
    target, label = link['href'], link.text
    print(target, label[:20])

https://www.theguardian.com/books/2023/jan/16/anthony-joseph-wins-ts-eliot-prize-for-luminous-poetry-collection TS Eliot prize
https://www.theguardian.com/books/2018/aug/04/kitch-anthony-joseph-review-windrush-trinidad-calypso in his Guardian revi
https://www.theguardian.com/books/poetry Poetry
https://guardianbookshop.com/sonnets-for-albert-9781526649942 guardianbookshop.com


### Task 2. Extracting Topics or Categories

In [67]:
# Topic/category links are selected by the 'dcr-viu5to' CSS class on the anchor.
topic_selector = {'class': 'dcr-viu5to'}
articles = soup.find_all('a', attrs=topic_selector)

# One line per topic: its href followed by the first 20 characters of its label.
for topic in articles:
    href = topic['href']
    label = topic.text
    print(href, label[:20])

/books/books Books
/books/ts-eliot-prize-for-poetry TS Eliot prize for p
/books/poetry Poetry
/culture/awards-and-prizes Awards and prizes
/tone/interview interviews


### Task 3. Listing All News Stories in a Section 

In [68]:
# Download the technology section front page; `soup` is rebound to this page.
url = 'https://www.theguardian.com/uk/technology'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
req.raise_for_status()
source = req.text
soup = BeautifulSoup(source, 'html.parser')

In [69]:
# Every headline anchor on the section page carries the 'js-headline-text' class.
headline_selector = {'class': 'js-headline-text'}
articles = soup.find_all('a', attrs=headline_selector)

# Only links that live under the technology section are printed.
prefix = 'https://www.theguardian.com/technology'
for headline in articles:
    href = headline['href']
    if not href.startswith(prefix):
        continue
    print(href, headline.text[:20])

https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Tesla CEO ‘doesn’t s
https://www.theguardian.com/technology/2023/jan/26/state-linked-hackers-in-russia-and-iran-are-targeting-uk-groups-ncsc-warns State-linked hackers
https://www.theguardian.com/technology/2023/jan/25/first-uk-industrial-action-against-amazon-is-making-an-impact-says-gmb First UK industrial 
https://www.theguardian.com/technology/2023/jan/25/microsoft-investigates-outage-affecting-teams-and-outlook-users-worldwide Company investigates
https://www.theguardian.com/technology/2022/oct/31/google-pixel-7-review-camera-price-android-software Cracking camera at a
https://www.theguardian.com/technology/2022/oct/10/iphone-14-review-apple-price-design Familiar design but 
https://www.theguardian.com/technology/2022/sep/05/samsung-galaxy-z-fold-4-review-cutting-edge-excellence-at-eye-watering-price Cutting-edge excelle
https://www.theguardian.com/technology/2

### Task 4. List 50 Most Recent Technology-Related News Stories 

In [70]:
# Collect the `numArticles` most recent stories by walking the paginated
# technology listing (page 1, 2, 3, ...) until enough links have been gathered.
articles = []          # plain list of <a> tags, in listing order
numArticles = 50
pageNum = 1
seenLinks = set()      # hrefs already collected, so a story is never listed twice

while len(articles) < numArticles:
    url = f'https://www.theguardian.com/technology?page={pageNum}'
    req = requests.get(url, timeout=30)  # timeout: never hang on a stalled connection
    if req.status_code == 404:
        # Treated as having walked past the last listing page.
        break
    req.raise_for_status()
    source = req.text
    soup = BeautifulSoup(source, 'html.parser')

    pageResults = soup.find_all('a', attrs={
        'data-link-name' : "article",
        'class' : 'u-faux-block-link__overlay js-headline-text'
    })
    if not pageResults:
        # No matching links on this page: stop instead of looping forever.
        break

    for result in pageResults:
        href = result['href']
        if href not in seenLinks:
            seenLinks.add(href)
            articles.append(result)

    # Advance to the next page; without this the same page is fetched repeatedly.
    pageNum += 1

# The last page usually overshoots the target, so trim to the requested count.
articles = articles[:numArticles]
print(f'There are {len(articles)} articles.')

for article in articles: 
    print(article['href'], article.text[:20])

There are 50 articles.
https://www.theguardian.com/us-news/commentisfree/2023/jan/27/trump-return-facebook-meta-hypocrisy-democracy Facebook is allowing
https://www.theguardian.com/fashion/2023/jan/27/what-bereal-selfies-have-taught-me-about-my-fashion-choices The camera never lie
https://www.theguardian.com/tv-and-radio/2023/jan/28/mumbling-actors-bad-speakers-or-lazy-listeners-why-everyone-is-watching-tv-with-subtitles-on Mumbling actors, bad
https://www.theguardian.com/stage/2023/jan/27/saint-jude-review-swamp-motel-london Saint Jude review – 
https://www.theguardian.com/environment/2023/jan/27/daily-wire-google-ads-climate-crisis-deniers Google let Daily Wir
https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Elon Musk ‘doesn’t s
https://www.theguardian.com/us-news/2023/jan/26/uber-lyft-new-york-zero-emission-2030 Uber and Lyft in New
https://www.theguardian.com/society/2023/jan/27/covid-lockdowns-created-online-

### Task 4 Bonus. List All Technology-Related News Stories Published After January 20, 2023

In [86]:
def getTimestamp(date):
    """Convert a ``YYYY/mon/DD`` date string into a Unix timestamp.

    Parameters
    ----------
    date : str
        Date such as ``'2023/jan/20'``: year, three-letter English month
        abbreviation (any letter case) and day, separated by ``/``. Any parts
        after the third are ignored.

    Returns
    -------
    float
        Seconds since the epoch for midnight at the start of that day, as
        computed by ``time.mktime`` (i.e. interpreted in local time).

    Raises
    ------
    IndexError
        If ``date`` has fewer than three ``/``-separated parts.
    KeyError
        If the month abbreviation is not recognised.
    ValueError
        If the year and day do not form a valid calendar date.
    """
    months = {
        'jan': '01',
        'feb': '02',
        'mar': '03',
        'apr': '04',
        'may': '05',
        'jun': '06',
        'jul': '07',
        'aug': '08',
        'sep': '09',
        'oct': '10',
        'nov': '11',
        'dec': '12'
    }
    parts = date.split('/')
    year, monthName, day = parts[0], parts[1], parts[2]

    # Lower-case the abbreviation so 'Jan' and 'JAN' resolve like 'jan'.
    numericDate = year + '/' + months[monthName.lower()] + '/' + day

    parsed = datetime.datetime.strptime(numericDate, "%Y/%m/%d")

    # mktime interprets the struct_time as local time.
    return time.mktime(parsed.timetuple())

In [91]:
# List every technology story published after `dateAfter`, walking the paginated
# listing until the first story dated on or before the cut-off is reached.
articles = []
numArticles = 50  # not used in this cell: the date cut-off, not a count, ends the search
pageNum = 1
dateAfter = '2023/jan/20'
afterTimestamp = getTimestamp(dateAfter)
dateReached = False
sitePrefix = 'https://www.theguardian.com/'

# Loop through multiple pages if required 
while not dateReached:
    # Scrape a page of the all technology page
    url = f'https://www.theguardian.com/technology?page={pageNum}'
    req = requests.get(url, timeout=30)  # timeout: never hang on a stalled connection
    if req.status_code == 404:
        # Treated as having walked past the last listing page.
        break
    req.raise_for_status()
    source = req.text
    soup = BeautifulSoup(source, 'html.parser')
    
    # Find the desired HTML blocks 
    pageResults = soup.find_all('a', attrs={
        'data-link-name' : "article",
        'class' : 'u-faux-block-link__overlay js-headline-text'
    })
    if not pageResults:
        # No matching links on this page: stop instead of paging forever.
        break

    # Add each result if it is after the date
    for result in pageResults:
        # The publication date is the YYYY/mon/DD run that starts at the first
        # all-digit path segment (section names of varying depth come before it).
        splitURL = result['href'].removeprefix(sitePrefix).split('/')
        dateIndex = next(
            (i for i, part in enumerate(splitURL) if part.isnumeric()),
            None
        )
        if dateIndex is None or len(splitURL) < dateIndex + 3:
            # Link without a date in its path: it cannot be compared, so skip it.
            continue
        articleDate = '/'.join(splitURL[dateIndex:dateIndex + 3])
        try:
            articleTimestamp = getTimestamp(articleDate)
        except (KeyError, ValueError):
            # The digits were not followed by a valid month/day; skip the link.
            continue
        
        # Add the article if it is after the date otherwise stop
        if articleTimestamp > afterTimestamp:
            articles.append(result['href'] + ' ' + articleDate)
        else:
            dateReached = True
            break

    pageNum += 1

print(f'There are {len(articles)} articles.')

for article in articles: 
    print(article)

There are 57 articles.
https://www.theguardian.com/us-news/commentisfree/2023/jan/27/trump-return-facebook-meta-hypocrisy-democracy 2023/jan/27
https://www.theguardian.com/fashion/2023/jan/27/what-bereal-selfies-have-taught-me-about-my-fashion-choices 2023/jan/27
https://www.theguardian.com/tv-and-radio/2023/jan/28/mumbling-actors-bad-speakers-or-lazy-listeners-why-everyone-is-watching-tv-with-subtitles-on 2023/jan/28
https://www.theguardian.com/stage/2023/jan/27/saint-jude-review-swamp-motel-london 2023/jan/27
https://www.theguardian.com/environment/2023/jan/27/daily-wire-google-ads-climate-crisis-deniers 2023/jan/27
https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder 2023/jan/27
https://www.theguardian.com/us-news/2023/jan/26/uber-lyft-new-york-zero-emission-2030 2023/jan/26
https://www.theguardian.com/society/2023/jan/27/covid-lockdowns-created-online-backdoor-for-child-abusers-says-charity 2023/jan/27
https://www

## Using the Guardian API

### Task 6. Response Statistics