## Web Scraping Example


In [93]:
pip install beautifulsoup4



In [94]:
pip install requests



In [95]:
pip install webbrowser

[31mERROR: Could not find a version that satisfies the requirement webbrowser (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for webbrowser[0m[31m
[0m

In [96]:
pip install selenium



In [97]:
import requests as rqst
import webbrowser as wb

In [98]:
from bs4 import BeautifulSoup as bs
import pandas as pd

In [99]:
rqst.get("https://ww.google.com/")

<Response [200]>

In [100]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [101]:
soup = bs(html_doc, 'html.parser')

In [102]:
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [103]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [104]:
soup.title

<title>The Dormouse's story</title>

In [105]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [106]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [107]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [108]:
paragraph =[]
for a in soup.find_all('p'):
  paragraph.append(a.text)

In [109]:
paragraph

["The Dormouse's story",
 'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.',
 '...']

In [110]:
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [111]:
url =[]
for link in soup.find_all('a'):
    url.append(link.get('href'))

In [112]:
url


['http://example.com/elsie',
 'http://example.com/lacie',
 'http://example.com/tillie']

In [113]:
title = soup.title.text

In [114]:
data = {
    'paragraph': paragraph,
    'url':url,
    'title':title

}

In [115]:
df = pd.DataFrame(data)

In [116]:
df

Unnamed: 0,paragraph,url,title
0,The Dormouse's story,http://example.com/elsie,The Dormouse's story
1,Once upon a time there were three little siste...,http://example.com/lacie,The Dormouse's story
2,...,http://example.com/tillie,The Dormouse's story


In [117]:
df.title

0    The Dormouse's story
1    The Dormouse's story
2    The Dormouse's story
Name: title, dtype: object

## Books Scraping Example

In [170]:
import numpy as np
pages = np.arange(1,51)

In [202]:
page_num = 2
book_title = []
book_rating=[]
book_price = []
stock = []

for page_num in range(1,51):
  print(f" I am scraping page no: {page_num}")
  URI = f'http://books.toscrape.com/catalogue/category/books_1/page-5.html'
  web=rqst.get(URI)
  soup = bs(web.content, 'html.parser')
  for title in soup.find_all('h3'):
      book_title.append(title.a['title'])

  for i in soup.find_all('article'):
      book_rating.append(i.p['class'][1])

  for i in soup.find_all('p',{'class':'price_color'}):
      book_price.append(i.text)

  for i in soup.find_all('p',{'class': 'instock availability'}):
      stock.append(i.text.split('\n')[3].strip())


 I am scraping page no: 1
 I am scraping page no: 2
 I am scraping page no: 3
 I am scraping page no: 4
 I am scraping page no: 5
 I am scraping page no: 6
 I am scraping page no: 7
 I am scraping page no: 8
 I am scraping page no: 9
 I am scraping page no: 10
 I am scraping page no: 11
 I am scraping page no: 12
 I am scraping page no: 13
 I am scraping page no: 14
 I am scraping page no: 15
 I am scraping page no: 16
 I am scraping page no: 17
 I am scraping page no: 18
 I am scraping page no: 19
 I am scraping page no: 20
 I am scraping page no: 21
 I am scraping page no: 22
 I am scraping page no: 23
 I am scraping page no: 24
 I am scraping page no: 25
 I am scraping page no: 26
 I am scraping page no: 27
 I am scraping page no: 28
 I am scraping page no: 29
 I am scraping page no: 30
 I am scraping page no: 31
 I am scraping page no: 32
 I am scraping page no: 33
 I am scraping page no: 34
 I am scraping page no: 35
 I am scraping page no: 36
 I am scraping page no: 37
 I am scra

In [203]:
web.status_code

200

In [205]:

from selenium import webdriver

In [206]:
soup.article.p['class'][1]

'Five'

In [207]:
soup.p['class'][1]

'Five'

In [208]:
data = {
   'Book_title':book_title,
   'Book_Rating':book_rating,
   'Book_Price':book_price,
   'Availability': stock
}

In [209]:
df = pd.DataFrame(data)

In [210]:
df.head()

Unnamed: 0,Book_title,Book_Rating,Book_Price,Availability
0,"Princess Jellyfish 2-in-1 Omnibus, Vol. 01 (Pr...",Five,£13.61,In stock
1,Princess Between Worlds (Wide-Awake Princess #5),Five,£13.34,In stock
2,"Pop Gun War, Volume 1: Gift",One,£18.97,In stock
3,"Political Suicide: Missteps, Peccadilloes, Bad...",Two,£36.28,In stock
4,Patience,Three,£10.16,In stock


In [211]:
df.to_csv('BOOK_DATA.csv', index=False)

In [212]:
df

Unnamed: 0,Book_title,Book_Rating,Book_Price,Availability
0,"Princess Jellyfish 2-in-1 Omnibus, Vol. 01 (Pr...",Five,£13.61,In stock
1,Princess Between Worlds (Wide-Awake Princess #5),Five,£13.34,In stock
2,"Pop Gun War, Volume 1: Gift",One,£18.97,In stock
3,"Political Suicide: Missteps, Peccadilloes, Bad...",Two,£36.28,In stock
4,Patience,Three,£10.16,In stock
...,...,...,...,...
995,Lumberjanes Vol. 3: A Terrible Plan (Lumberjan...,Two,£19.92,In stock
996,"Layered: Baking, Building, and Styling Spectac...",One,£40.11,In stock
997,Judo: Seven Steps to Black Belt (an Introducto...,Two,£53.90,In stock
998,Join,Five,£35.67,In stock


In [201]:
import numpy as np
df['Book_title'].value_counts().sum()

20

In [140]:
df['Book_Price']

0     £51.77
1     £53.74
2     £50.10
3     £47.82
4     £54.23
5     £22.65
6     £33.34
7     £17.93
8     £22.60
9     £52.15
10    £13.99
11    £20.66
12    £17.46
13    £52.29
14    £35.02
15    £57.25
16    £23.88
17    £37.59
18    £51.33
19    £45.17
Name: Book_Price, dtype: object

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Book_title    20 non-null     object
 1   Book_Rating   20 non-null     object
 2   Book_Price    20 non-null     object
 3   Availability  20 non-null     object
dtypes: object(4)
memory usage: 768.0+ bytes


In [142]:
df['Book_Rating'].unique()

array(['Three', 'One', 'Four', 'Five', 'Two'], dtype=object)