# Web-Scraping data using BeautifulSoup
### By: Anas Puthawala

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
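# TODO: paginate over every blog page instead of only page 1.
# Sketch only (not executed) -- it would visit pages 1 through 8:
# page = 1
# while page != 9:
#     url = f'https://deepmind.com/blog?page={page}'
#     page += 1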
    

In [3]:

# Download the first page of the DeepMind blog listing.
url = 'https://deepmind.com/blog?page=1'
# requests applies no timeout by default, so pass one explicitly to keep the
# cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
response.raise_for_status()
html_source = response.text

# Parse the HTML with Python's built-in parser via BeautifulSoup.
soup = BeautifulSoup(html_source, 'html.parser')
# Show only the beginning of the prettified document; the full page is far too
# long to be useful as cell output.
soup.prettify()[:1000]

'<!DOCTYPE html>\n<html class="loading" dir="ltr" lang="ALL">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1, minimum-scale=1" name="viewport"/>\n  <base href="/"/>\n  <!-- Facebook base meta tags. -->\n  <meta content="600" property="og:image:width"/>\n  <meta content="600" property="og:image:height"/>\n  <meta content="Deepmind" property="og:site_name"/>\n  <meta content="website" property="og:type"/>\n  <!-- Twitter base meta tags. -->\n  <meta content="summary_large_image" name="twitter:card"/>\n  <meta content="@DeepMindAI" name="twitter:creator"/>\n  <!-- Set in Angular. -->\n  <title>\n   Blog | DeepMind\n  </title>\n  <meta content="Read the latest articles and stories from DeepMind and find out more about our latest breakthroughs in cutting-edge AI research." name="description"/>\n  <link href="https://lh3.googleusercontent.com" rel="preconnect"/>\n  <link href="assets/favicon.ico" rel="icon"/>\n  <link href="styles.4ac422d2d6faf63ca7

In [4]:
# CSS class of the <div> elements selected as individual blog-post cards.
res = 'content-card-body'

match = soup.find_all('div', class_=res)

# Loop through the matched cards and take the text of each card's first link
# as its title.
# find() returns None when a card contains no <a> tag; record an empty string
# in that case rather than raising AttributeError, so every card still
# contributes exactly one entry.
titlelist = []
for content in match:
    title = content.find('a')
    titlelist.append(title.text.strip() if title is not None else '')
titlelist

['Language modelling at scale',
 'Exploring the beauty of pure mathematics in novel ways',
 'Real-World Challenges for AGI',
 'Opening up a physics simulator for robotics',
 'Stacking our way to more general robots',
 'Predicting gene expression with AI',
 'Nowcasting',
 'Building architectures that can handle the world’s data',
 'Generally capable agents emerge from open-ended play',
 'Putting the power of AlphaFold into the world’s hands',
 'An update on our racial justice efforts',
 'Advancing sports analytics through AI research',
 'Game theory as an engine for large-scale data analysis',
 'MuZero: Mastering Go, chess, shogi and Atari without rules',
 'Using JAX to accelerate our research',
 'AlphaFold: a solution to a 50-year-old grand challenge in biology',
 'Breaking down global barriers to access',
 'FermiNets: Quantum Physics and Chemistry from First Principles',
 'Fast reinforcement learning through the composition of behaviours',
 'Traffic prediction with advanced Graph Neur

In [5]:
# Next, collect each card's summary: the text of its first <p class="medium">.
# find() returns None for a card without such a paragraph; store '' instead of
# raising AttributeError so the list keeps one entry per card.
summarylist = []
for content in match:
    summary = content.find('p', class_='medium')
    summarylist.append(summary.text.strip() if summary is not None else '')

summarylist
    

['We are releasing three papers on language models, Gopher, ethical considerations, and retrieval.',
 'Discovering new patterns in the fields of topology and representation theory with machine learning',
 'Koray Kavukcuoglu, VP of Research, discusses why addressing real-world challenges now helps advance the development of...',
 "As part of DeepMind's mission of advancing science, we have acquired the MuJoCo physics simulator and are making it...",
 'Introducing RGB-Stacking as a new benchmark for vision-based robotic manipulation.',
 'Our new Enformer architecture advances genetic research by improving the ability to predict how DNA sequence influences...',
 'Our latest research and state-of-the-art model advances the science of Precipitation Nowcasting.',
 'Perceiver IO, a more general version of the Perceiver architecture, can produce a wide variety of outputs from many...',
 'In new work, algorithmic advances and new training environments lead to agents which exhibit general heuris

In [6]:
# Also collect the publication date: the text of the first <p> inside each
# card's footer <div>.
dates = []
for content in match:
    footer = content.find('div', class_='footer')  # footer block of the card
    # Either lookup may return None (no footer, or a footer without a <p>);
    # record '' in that case so every card still yields exactly one entry.
    date_tag = footer.find('p') if footer is not None else None
    dates.append(date_tag.text.strip() if date_tag is not None else '')
dates

['08 Dec 2021',
 '01 Dec 2021',
 '02 Nov 2021',
 '18 Oct 2021',
 '11 Oct 2021',
 '04 Oct 2021',
 '29 Sep 2021',
 '03 Aug 2021',
 '27 Jul 2021',
 '22 Jul 2021',
 '04 Jun 2021',
 '07 May 2021',
 '06 May 2021',
 '23 Dec 2020',
 '04 Dec 2020',
 '30 Nov 2020',
 '05 Nov 2020',
 '19 Oct 2020',
 '12 Oct 2020',
 '03 Sep 2020']

In [7]:
# Last but not least, record each article's type label (e.g. 'Research', 'News').
researchlist = []
for content in match:
    # Passing a list to class_ matches a <div> that carries any one of the
    # listed classes.
    header = content.find('div', class_=['header', 'category'])
    # Guard both failure modes: no header at all (find() -> None) and a header
    # without caption paragraphs (indexing [-1] on an empty result).
    captions = header.find_all('p', class_="caption") if header is not None else []
    # The last caption paragraph is taken as the article type; its text may be
    # blank, which is stored as ''.
    researchlist.append(captions[-1].text.strip() if captions else '')

researchlist

['Research',
 'Research',
 '',
 'News',
 'Research',
 'Research',
 '',
 '',
 'Research',
 'Research',
 'News',
 '',
 '',
 '',
 '',
 'Research',
 '',
 'Research',
 '',
 'Research']

## Sculpting the DataFrame

Now that we have the titles, summaries, dates, and article types, let's combine them into a pandas DataFrame and export it to an Excel file at the end.

In [8]:
# Assemble the per-card lists into one table, one column per list.
# Building from a dict of columns (rather than zip) means pandas raises a
# ValueError if the lists ever differ in length, instead of zip silently
# dropping the trailing rows of the longer lists.
df = pd.DataFrame({
    'Title': titlelist,
    'Summary': summarylist,
    'Date': dates,
    'Type of Article': researchlist,
})
df

Unnamed: 0,Title,Summary,Date,Type of Article
0,Language modelling at scale,We are releasing three papers on language mode...,08 Dec 2021,Research
1,Exploring the beauty of pure mathematics in no...,Discovering new patterns in the fields of topo...,01 Dec 2021,Research
2,Real-World Challenges for AGI,"Koray Kavukcuoglu, VP of Research, discusses w...",02 Nov 2021,
3,Opening up a physics simulator for robotics,As part of DeepMind's mission of advancing sci...,18 Oct 2021,News
4,Stacking our way to more general robots,Introducing RGB-Stacking as a new benchmark fo...,11 Oct 2021,Research
5,Predicting gene expression with AI,Our new Enformer architecture advances genetic...,04 Oct 2021,Research
6,Nowcasting,Our latest research and state-of-the-art model...,29 Sep 2021,
7,Building architectures that can handle the wor...,"Perceiver IO, a more general version of the Pe...",03 Aug 2021,
8,Generally capable agents emerge from open-ende...,"In new work, algorithmic advances and new trai...",27 Jul 2021,Research
9,Putting the power of AlphaFold into the world’...,"In partnership with EMBL-EBI, we’re incredibly...",22 Jul 2021,Research


In [10]:
# Save the scraped table as an Excel workbook in the current working directory.
# With pandas' defaults, the DataFrame index is written as the first column.
filename = 'deepmind_webscrape.xlsx'
df.to_excel(excel_writer=filename)