# Web Scraping: The Guardian

In [60]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

URL definition:

In [61]:
# url definition
url = "https://www.theguardian.com/uk"

List of news:

In [62]:
# Request
r1 = requests.get(url)
r1.status_code

# cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification
coverpage_news = soup1.find_all('h3', class_='fc-item__title')
len(coverpage_news)

116

list, where every element is a news article:

In [63]:
coverpage_news[4]

<h3 class="fc-item__title"><a class="fc-item__link" data-link-name="article" href="https://www.theguardian.com/australia-news/2021/mar/03/australias-attorney-general-comes-forward-to-deny-historical-allegation"><span class="fc-item__kicker">Australia </span> <span class="u-faux-block-link__cta fc-item__headline"> <span class="js-headline-text">Attorney general comes forward to deny historical rape allegation</span></span> </a></h3>

#### Extract the text from the articles:

First, we define the number of articles we want:

In [64]:
number_of_articles = 1

In [65]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []

for n in np.arange(0, number_of_articles):
        
    
    if "live" in coverpage_news[n].find('a')['href']:  
        continue
    
    # Getting the link of the article
    link = coverpage_news[n].find('a')['href']
    list_links.append(link)
    
    #
    
    # Getting the title
    title = coverpage_news[n].find('a').get_text()
    list_titles.append(title)
    
    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('div', class_='content__article-body from-content-api js-article__body')
    x = body[0].find_all('p')
    
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    news_contents.append(final_article)

In [66]:
# df_features
df_features = pd.DataFrame(
     {'Article Content': news_contents 
    })

# df_show_info
df_show_info = pd.DataFrame(
    {'Article Title': list_titles,
     'Article Link': list_links})

In [67]:
df_features = df_features
df_features

Unnamed: 0,Article Content
0,Rishi Sunak will announce on Wednesday that th...


In [68]:
df_show_info = df_show_info
df_show_info

Unnamed: 0,Article Title,Article Link
0,Budget 2021 UK budget to extend furlough unti...,https://www.theguardian.com/uk-news/2021/mar/0...


### Time Elapsed

We find, In how much time the script takes to get the news.

In [69]:
def get_news_theguardian():
    
    # url definition
    url = "https://www.theguardian.com/uk"
    
    # Request
    r1 = requests.get(url)
    r1.status_code

    # We'll save in coverpage the cover page content
    coverpage = r1.content

    # Soup creation
    soup1 = BeautifulSoup(coverpage, 'html5lib')

    # News identification
    coverpage_news = soup1.find_all('h3', class_='fc-item__title')
    len(coverpage_news)
    
    number_of_articles = 1

    # Empty lists for content, links and titles
    news_contents = []
    list_links = []
    list_titles = []

    for n in np.arange(0, number_of_articles):

        # We need to ignore "live" pages since they are not articles
        if "live" in coverpage_news[n].find('a')['href']:  
            continue

        # Getting the link of the article
        link = coverpage_news[n].find('a')['href']
        list_links.append(link)

        # Getting the title
        title = coverpage_news[n].find('a').get_text()
        list_titles.append(title)

        # Reading the content (it is divided in paragraphs)
        article = requests.get(link)
        article_content = article.content
        soup_article = BeautifulSoup(article_content, 'html5lib')
        body = soup_article.find_all('div', class_='content__article-body from-content-api js-article__body')
        x = body[0].find_all('p')

        # Unifying the paragraphs
        list_paragraphs = []
        for p in np.arange(0, len(x)):
            paragraph = x[p].get_text()
            list_paragraphs.append(paragraph)
            final_article = " ".join(list_paragraphs)

        news_contents.append(final_article)

    # df_features
    df_features = pd.DataFrame(
         {'Content': news_contents 
        })

    # df_show_info
    df_show_info = pd.DataFrame(
        {'Article Title': list_titles,
         'Article Link': list_links,
         'Newspaper': 'The Guardian'})

    
    return (df_features, df_show_info)

In [70]:
start = time.time()
x, y = get_news_theguardian()
end =time.time()
te = end-start
print("The time elapsed is %f seconds" %(te))

The time elapsed is 2.299036 seconds


In [71]:
time_elapsed = te
time_elapsed

2.2990355491638184

### Connect pymongo

In [72]:
import pymongo
from pymongo import MongoClient

In [73]:
client = MongoClient('localhost', 27017)

In [74]:
dbs = client['test']
dbs

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test')

#### Create the database(The_Guardian_Data) where all the required results will be stored

In [75]:
db = client['The_Guardian_Data']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'The_Guardian_Data')

Insert article content in The_Guardian_Data

In [76]:
Article_Content = db['df_features']
Article_Content

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'The_Guardian_Data'), 'df_features')

Insert article features in The_Guardian_Data

In [77]:
Article_Features = db['df_show_info']
Article_Features

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'The_Guardian_Data'), 'df_show_info')

Insert article time elapsed in The_Guardian_Data

In [78]:
TimeElapsed = db['te']
TimeElapsed

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'The_Guardian_Data'), 'te')

### Create API

In [79]:
from flask import Flask
from flask import jsonify
from flask import request
from flask_pymongo import PyMongo

In [80]:
app = Flask(__name__)

app.config['MONGO_DBNAME'] = 'The_Guardian_Data'
app.config['MONGO_URI'] = 'mongodb://localhost:27017/The_Guardian_Data'

mongo = PyMongo(app)

In [81]:
@app.route('/news', methods=['GET'])
def get_all_news():
  star = mongo.db.all_news
  output = []
  for n in news.find():
    output.append({'Article_Content' : n['Article_Content'], 'Article_Features' : n['Article_Features'], 'TimeElapsed' : n['TimeElapsed']})
  return jsonify({'result' : output})

In [82]:
@app.route('/news/', methods=['GET'])
def get_one_news(Article_Content):
  star = mongo.db.stars
  n = news.find_one({'name' : name})
  if n:
    output = {'Article_Content' : n['Article_Content'], 'Article_Features' : n['Article_Features'], 'TimeElapsed' : n['TimeElapsed']}
  else:
    output = "No such name"
  return jsonify({'result' : output})

In [83]:
@app.route('/news', methods=['POST'])
def add_news():
  news = mongo.db.all_news
  Article_Content = request.json['Article_Content']
  Article_Features = request.json['Article_Features']
  TimeElapsed = request.json['TimeElapsed']
  return jsonify({'result' : output})

In [84]:
if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with windowsapi reloader


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
