In [61]:
# Import important libraries 

# Standard Python Libraries 

from collections import Counter, defaultdict
import html # for html manipulations
import json # for key-value pair data structures
import os
import re # regular expression
import time 


# 3rd Party Libraries

from bs4 import BeautifulSoup # HTML parsing
from IPython.display import HTML # render HTML in the notebook
import matplotlib.pyplot as plt # plotting library
import numpy as np
import pandas as pd # tabular data (Dataframes)
import requests # get/post HTML requests

In [62]:
# URL of the page that lists every Nobel Prize
all_prizes_url = 'https://www.nobelprize.org/prizes/lists/all-nobel-prizes/all/'

# Fetch the page. requests.get returns a Response object; its attributes
# (status_code, text, headers, ...) describe the server's reply.
# The timeout is explicit because requests otherwise waits indefinitely on an
# unresponsive server, which would hang the notebook kernel.
response = requests.get(all_prizes_url, timeout=10)

print(f"The status code of the URL we selected is: {response.status_code}")

The status code of the URL we selected is: 200


In [63]:
# Demonstrate what a GET request returns for a page that does not exist
fake_url = 'https://www.google.com/fake'

# requests does not raise on HTTP error statuses by default, so the failure is
# only visible on the Response object. The explicit timeout keeps the cell from
# hanging if the server never answers.
google = requests.get(fake_url, timeout=10)

google.status_code  # 404 means the server could not find the requested page

404

In [None]:
# NOTE(review): this rebinds the name `html`, which the imports cell bound to the
# standard-library `html` module. From here on `html` is the page source as a string,
# and the regex and BeautifulSoup cells read it under this name -- rename both together.
html = response.text # the HTML source of the fetched page, as a string



In [65]:
# Find every occurrence of my first name, "Aaron", followed by one more token.
#   \S+  one or more non-whitespace characters; notice it also grabs the closing anchor tag
#   \w+  one or more word characters (letters, digits, underscores); stops before the tag
name_patterns = (r'Aaron \S+', r'Aaron \w+')
for name_pattern in name_patterns:
    print(re.findall(name_pattern, html))

# two people who share my first name have won the Nobel Prize

['Aaron Ciechanover</a>,', 'Aaron Klug</a>']
['Aaron Ciechanover', 'Aaron Klug']


In [71]:
# Regular expressions are fine for simple text matches, but they are not suited to
# navigating the DOM or extracting structured data from it.

# That is where BeautifulSoup comes in: it parses the HTML into a tree of Python objects.
# The parser is named explicitly. When it is omitted, BeautifulSoup picks whichever
# parser happens to be installed and emits a GuessedAtParserWarning, so the resulting
# tree can differ between machines. 'html.parser' ships with Python itself.
soup = BeautifulSoup(html, 'html.parser')

print(type(soup))
soup

<class 'bs4.BeautifulSoup'>


 <!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<meta charset="utf-8"/><script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.eu01.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"212354677",accountID:"3196970",trustKey:"3196970",xpid:"VwcOV19UCBACVVNRAgUCUlc=",licenseKey:"NRJS-0940f7eaf4cb46e1cfe",applicationID:"212354543"};;/*! For license information please see nr-loader-spa-1.296.0.min.js.LICENSE.txt */
<meta content="width=device-width, initial-scale=1" name="viewport">
<link href="http://gmpg.org/xfn/11" rel="profile"/>
<script>(function(html){html.className = html.className.replace(/\bno-js\b/,'js')})(document.documentElement);</script>
<script type="text/javascript">
/* <![CDATA[ */
window.JetpackScriptData = {"site":{"icon":"https://i0.wp.com/www.nobelprize.org/uploads/2018/08/Nobel-favicon.png?w=64\u0026ssl=1","title":"NobelPrize.org","host":"unknown",

In [None]:
# prettify() returns the document as a string with one tag per line,
# indented by nesting depth, which is much easier to read than the raw source.
indented_html = soup.prettify()

# Print only the beginning: the full page is far too long to be useful as cell
# output. The complete string remains available in `indented_html`.
preview_chars = 2000
print(indented_html[:preview_chars])

<!DOCTYPE html>
<html class="no-js" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.eu01.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"212354677",accountID:"3196970",trustKey:"3196970",xpid:"VwcOV19UCBACVVNRAgUCUlc=",licenseKey:"NRJS-0940f7eaf4cb46e1cfe",applicationID:"212354543"};;/*! For license information please see nr-loader-spa-1.296.0.min.js.LICENSE.txt */
  </script>
  <meta content="width=device-width, initial-scale=1" name="viewport">
   <link href="http://gmpg.org/xfn/11" rel="profile"/>
   <script>
    (function(html){html.className = html.className.replace(/\bno-js\b/,'js')})(document.documentElement);
   </script>
   <script type="text/javascript">
    /* <![CDATA[ */
window.JetpackScriptData = {"site":{"icon":"https://i0.wp.com/www.nobelprize.org/uploads/2018/08/Nobel-favicon.png?w=64\u0026ssl=1

In [74]:
# select() takes a CSS selector and returns every match as a list-like collection
title_tags = soup.select('title')
print(title_tags)

# select_one() returns only the first match -- here, the first anchor tag on the page
soup.select_one('a')

[<title>All Nobel Prizes - NobelPrize.org</title>, <title>Open the search menu</title>, <title>Close the search menu</title>, <title>Open the search menu</title>, <title>Close the search menu</title>]


<a class="skip-link screen-reader-text" href="#content">
			Skip to content		</a>

In [81]:
# 'h3 a' matches anchor tags nested inside an h3 element; show the first three
h3_links = soup.select('h3 a')
print(h3_links[:3])

# the first five anchor tags anywhere on the page
soup.select('a')[:5]

[<a href="https://www.nobelprize.org/prizes/physics/2024/summary/">
			Nobel Prize in Physics 2024		</a>, <a href="https://www.nobelprize.org/prizes/chemistry/2024/summary/">
			Nobel Prize in Chemistry 2024		</a>, <a href="https://www.nobelprize.org/prizes/medicine/2024/summary/">
			Nobel Prize in Physiology or Medicine 2024		</a>]


[<a class="skip-link screen-reader-text" href="#content">
 			Skip to content		</a>,
 <a aria-label="Nobel Prize" href="https://www.nobelprize.org" itemprop="url" name="top">
 <svg class="site-logo-icon site-logo-icon--big" fill="none" height="60" viewbox="0 0 77 60" width="77" xmlns="http://www.w3.org/2000/svg"><path d="M25.298 1.997h-4.631v14.66h-2.21V1.997h-4.674V0h11.515v1.997ZM42.126 0v16.657h-2.21V8.711h-9.179v7.946h-2.21V0h2.21v6.714h9.179V0h2.21Zm14.617 16.657V14.66h-7.691V8.711h7.011V6.714h-7.011V1.997h7.564V0h-9.731v16.657h9.858Zm-43.045 5.014V38.33h-1.53L2.225 25.41v12.918H.1V21.67h1.7l9.773 12.578V21.671h2.125Zm12.025-.34c-4.801 0-8.71 3.867-8.71 8.669 0 4.802 3.909 8.669 8.71 8.669 4.802 0 8.711-3.867 8.711-8.669 0-4.802-3.91-8.669-8.71-8.669Zm.043 2.083c3.527 0 6.459 2.932 6.459 6.586 0 3.654-2.932 6.586-6.46 6.586-3.526 0-6.458-2.932-6.458-6.586 0-3.654 2.932-6.586 6.459-6.586Zm23.116 9.943c0 2.932-2.125 4.972-5.227 4.972h-5.779V21.67h5.27c3.059 0 4.971 1.53 4.971 3.995 