In [1]:
import requests

In [2]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [3]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>>


In [5]:
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [6]:
[type(item) for item in list(soup.children)] # List Comprehension
# In below output, there are 3 tags i.e.  1st line html, 2nd line newline and 3rd line html tag.

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

## Breakdown tags

In [7]:
# store 3rd tag in a element html
html = list(soup.children)[2]

In [8]:
# now make list and show
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [9]:
body = list(html.children)[3]

In [10]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [12]:
p = list(body.children)[1]
list(p.children)   # showing in a form of list

['Here is some simple content for this page.']

In [13]:
# to show in a form of text
p.get_text()

'Here is some simple content for this page.'

## To show all instances at once

In [14]:
# it will show all the paragraphs tags in the form of list
p = soup.find_all('p')
p

[<p>Here is some simple content for this page.</p>]

In [15]:
# it will show all the paragraphs tags in the form of text and it is fast process as compared to breakdown process
# as you can see only 1 step is used for this whole process
p = soup.find_all('p')[0].get_text()
p

'Here is some simple content for this page.'

## Example using classs or id's


In [16]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [17]:
soup.find_all('p', class_='outer-text')  # in this line 'p' is unnecessary as it can be search only by class name

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [18]:
soup.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

### css selectors

In [19]:
# take all p tags from div tag
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

In [20]:
# take all p tags of first-item class  --------------   one is inside div class and other is outside div class
soup.select("p.first-item")  # in this line it will take only p tags. To take anchor tags we will use a.first-item

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>]

In [21]:
# it will take all instances of first-item
soup.select('.first-item')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>]

## Wikipedia Web Scrapping

In [22]:
page = requests.get("https://en.wikipedia.org/wiki/Web_scraping")
page.status_code

200

In [23]:
soup = BeautifulSoup(page.content, "html.parser")

title = soup.find("h1", id="firstHeading")     #    || title = soup.find("h1", id="firstHeading").text     
print(title.text)                              #    ||    print(title)

Web scraping


In [24]:
data = soup.find("div", id="mw-content-text").text
print(data)

Data scraping used for extracting data from websites
This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.Find sources: "Web scraping" – news · newspapers · books · scholar · JSTOR (June 2017) (Learn how and when to remove this template message)
For broader coverage of this topic, see Data scraping.
Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. The web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis.
Web scraping a web

In [25]:
import pandas as pd

In [26]:
data = pd.read_excel("data.xlsx")
data

Unnamed: 0,Names
0,Artificial Intelligence
1,Blockchain
2,Internet of things
3,Cloud Computing
4,Amazon
5,Laptop
6,Mobile
7,Computer
8,Windows
9,Steve Jobs


In [27]:
data['Names']

0    Artificial Intelligence
1                 Blockchain
2         Internet of things
3            Cloud Computing
4                     Amazon
5                     Laptop
6                     Mobile
7                   Computer
8                    Windows
9                 Steve Jobs
Name: Names, dtype: object

In [28]:
# in wikipedia, url usually look like Artificial_Intelligence, so in order to replace Artificial Intelligence
# into the former one we use below function
# Method number one
#data['Names changed'] = data['Names'].str.replace(" ","_")

In [29]:
#data['Names changed']

In [30]:
# Method number two
def change(val):
    val2 = val.split(" ")
    val2 = "_".join(val2)
    print(val2)
    return val2

In [31]:
data["Names New"] = data["Names"].apply(change)

Artificial_Intelligence
Blockchain
Internet_of_things
Cloud_Computing
Amazon
Laptop
Mobile
Computer
Windows
Steve_Jobs


In [32]:
data

Unnamed: 0,Names,Names New
0,Artificial Intelligence,Artificial_Intelligence
1,Blockchain,Blockchain
2,Internet of things,Internet_of_things
3,Cloud Computing,Cloud_Computing
4,Amazon,Amazon
5,Laptop,Laptop
6,Mobile,Mobile
7,Computer,Computer
8,Windows,Windows
9,Steve Jobs,Steve_Jobs


In [33]:
# put Name New column in a list
# Method one
name = data['Names'].to_list()
name

['Artificial Intelligence',
 'Blockchain',
 'Internet of things',
 'Cloud Computing',
 'Amazon',
 'Laptop',
 'Mobile',
 'Computer',
 'Windows',
 'Steve Jobs']

In [34]:
# Method two
names = list(data['Names'])
names

['Artificial Intelligence',
 'Blockchain',
 'Internet of things',
 'Cloud Computing',
 'Amazon',
 'Laptop',
 'Mobile',
 'Computer',
 'Windows',
 'Steve Jobs']

In [35]:
titles = []
data2 = []
for i in names:
    page = requests.get(f"https://en.wikipedia.org/wiki/{i}")
    soup = BeautifulSoup(page.content,'html.parser')
    
    title = soup.find("h1", id="firstHeading").text
    content = soup.find("div", id="mw-content-text").text
    titles.append(title)
    data2.append(content)


print(titles)

['Artificial intelligence', 'Blockchain', 'Internet of things', 'Cloud computing', 'Amazon', 'Laptop', 'Mobile', 'Computer', 'Microsoft Windows', 'Steve Jobs']


In [56]:
# print(data2)

In [37]:
all_data = {"Names":titles, "Data":data2}

In [58]:
# all_data

In [59]:
df = pd.DataFrame(all_data)
df.to_csv("scrapped_data.csv",index=False)