# Web Scrapping in Python
### Keyword: always to try to get permission before scrapping
### Component: HTML, CSS, JS
### HTML: basic structure, CSS: design style, JS: define interactive elements of a webpage

In [None]:
# install necessary libraries
# pip install requests
# pip install lxml
# pip install bs4

In [1]:
# import modules
import requests
import bs4
import lxml

## Grabbing a Webpage Title

In [7]:
# Grabbing a Webpage Title, and assign to variable result
result = requests.get('http://www.example.com')

In [3]:
# know the type
type(result)

requests.models.Response

In [6]:
# display the text of the result
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [8]:
# use bs4
soup = bs4.BeautifulSoup(result.text,'lxml')

In [10]:
# call soup
soup  # the syntax of html more neat

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [13]:
# grab things like Title use soup and make it to a list
soup.select('title')   # grab title
soup.select('p')       # grab paragraph

[<title>Example Domain</title>]

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [14]:
# get the content of Title
soup.select('title')[0].getText()

'Example Domain'

In [16]:
# get the content of paragraph
site_paragraphs = soup.select('p')
site_paragraphs[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

## Grabbing all elements of a class
#### Keywords:
##### soup.select('div')
##### soup.select('#some_id')
##### soup.select('.some_class')
##### soup.select('div span')
##### soup.select('div > span')

In [17]:
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper')

In [18]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [22]:
soup.select('.vector-toc-numb')

[<span class="vector-toc-numb">1</span>,
 <span class="vector-toc-numb">2</span>,
 <span class="vector-toc-numb">2.1</span>,
 <span class="vector-toc-numb">2.2</span>,
 <span class="vector-toc-numb">2.3</span>,
 <span class="vector-toc-numb">2.4</span>,
 <span class="vector-toc-numb">3</span>,
 <span class="vector-toc-numb">4</span>,
 <span class="vector-toc-numb">5</span>,
 <span class="vector-toc-numb">6</span>,
 <span class="vector-toc-numb">7</span>,
 <span class="vector-toc-numb">8</span>,
 <span class="vector-toc-numb">8.1</span>,
 <span class="vector-toc-numb">8.2</span>,
 <span class="vector-toc-numb">9</span>,
 <span class="vector-toc-numb">9.1</span>,
 <span class="vector-toc-numb">9.2</span>,
 <span class="vector-toc-numb">9.3</span>,
 <span class="vector-toc-numb">9.3.1</span>,
 <span class="vector-toc-numb">10</span>,
 <span class="vector-toc-numb">11</span>,
 <span class="vector-toc-numb">12</span>,
 <span class="vector-toc-numb">13</span>,
 <span class="vector-toc-numb">

In [23]:
first_item = soup.select('.vector-toc-numb')[0]

In [25]:
for item in soup.select('.vector-toc-numb'):
    print(item.text)

1
2
2.1
2.2
2.3
2.4
3
4
5
6
7
8
8.1
8.2
9
9.1
9.2
9.3
9.3.1
10
11
12
13
14
15


## Grabbing an Image

In [28]:
res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)")

In [30]:
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [32]:
soup.select('img')[0]

<img alt="" aria-hidden="true" class="mw-logo-icon" height="50" src="/static/images/icons/wikipedia.png" width="50"/>

In [33]:
soup.select('.thumbimage')

[<img class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>,
 <img class="thumbimage" data-file-height="2756" data-file-width="2067" decoding="async" height="293" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/330px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg 1.5x, //upload.wikimedia.org/wiki

In [34]:
computer = soup.select('.thumbimage')[0]

In [35]:
computer

<img class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>

In [36]:
computer['class']
computer['src']

['thumbimage']

'//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png'

In [40]:
image_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png')

In [46]:
# image_link.content

In [48]:
f = open('my_computer_image.jpg','wb')

In [49]:
f.write(image_link.content) # to put the image we grab into the my_computer_image.jpg

85054

In [50]:
f.close()

## Book Examples Part 1

In [51]:
# GOAL: Get title of every book with a 2 star rating

In [52]:
import requests
import bs4

In [53]:
'http://books.toscrape.com/catalogue/page-2.html'

'http://books.toscrape.com/catalogue/page-2.html'

In [54]:
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

In [55]:
page_num = 12
base_url.format(page_num)

'http://books.toscrape.com/catalogue/page-12.html'

In [56]:
res = requests.get(base_url.format(1))

In [57]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [60]:
soup.select('.product_pod')[0] # select class product_pod

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

## Book Examples Part 2

In [61]:
products = soup.select('.product_pod')

In [62]:
example = products[0]
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [65]:
# dirty way
'star-rating Three' in str(example)

True

In [67]:
# cool way
example.select('.star-rating.Three')  # when use select, space we replace with . before Three

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [68]:
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [69]:
example.select('a')

[<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>,
 <a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>]

In [70]:
type(example.select('a')[1])

bs4.element.Tag

In [71]:
example.select('a')[1]['title']

'A Light in the Attic'

In [72]:
# We can check if something is 2 stars (string call in, example.select(rating))
# example.select('a')[1]['title'] to grab the book title

In [77]:
# Web Scrapping Final
two_star_titles = []

for n in range(1,51): # 51 because there is 51 pages
    
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    
    soup = bs4.BeautifulSoup(res.text,'lxml')
    books = soup.select('.product_pod')
    
    for book in books:
        if len(book.select('.star-rating.Two')) != 0:
            book_title = book.select('a')[1]['title']
            two_star_titles.append(book_title)

In [78]:
two_star_titles

['Starving Hearts (Triangular Trade Trilogy, #1)',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'How Music Works',
 'Maude (1883-1993):She Grew Up with the country',
 "You can't bury them all: Poems",
 'Reasons to Stay Alive',
 'Without Borders (Wanderlove #1)',
 'Soul Reader',
 'Security',
 'Saga, Volume 5 (Saga (Collected Editions) #5)',
 'Reskilling America: Learning to Labor in the Twenty-First Century',
 'Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics',
 'Obsidian (Lux #1)',
 'My Paris Kitchen: Recipes and Stories',
 'Masks and Shadows',
 'Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)',
 'Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)',
 'Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)',
 'I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)',
 'Giant Days, Vol. 2 (Giant Day