# Extracting Data using Web Scraping.

### Note: it is recommended to obtain permissions from websites' administrators before attempting any scraping on their sites. Due to these reasons, we will just be scraping from a little html markup. The steps remain the same for a website

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
html_script = """
<!doctype html>
<html lang=en>
<head>
   <title>Jupyter Notebook magic functions</title>
<body>
    <h1 style="color:DarkBlue">Exploring Jupyter Notebook magic functions</h1>
    <p id="author">Author :: Carlos St Valery Vouking.</p>
    <p id="description"> These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here</p>
     <h3 style="color:red">Magical Functions</h3>
     <table id="function">
     <tr>
        <th>Function</th>
        <th>Representation</th>
        <th>Usage</th>
     </tr>
     <tr>
        <td> Matplotlib</td>
        <td>%matplotlib inline</td>
        <td>Matplotlib is a very used data vizualization package in python. 'inline' to show the visualizations inside jupyter notebook itself.</td>
     </tr>
     <tr>
        <td>time</td>
        <td>%time x = range(100000)</td>
        <td>time() function helps in measuring estimation time.</td>
     </tr>
     <tr>
        <td>writefile</td>
        <td>%%writefile write_file.py</td>
        <td>writefile creates a file and write in it. If the file exists it is overwritten otherwise it is created.</td>
     </tr>
     <tr>
        <td>HTML</td>
        <td>%%HTML -- <i> Image embedded in jupyter notebook </i></td>
        <td>HTML() is used to embed html markup in jypyter notebook</td>
     </tr>
     <tr>
        <td>Latex</td>
        <td>%%latex F(x)= Ax + B</td>
        <td>Embed equations in notebook with latex() magical function</td>
     </tr>
      <tr>
        <td>load_ext</td>
        <td>%load_ext sql</td>
        <td>load_ext() is used to load other extensions. For instance, a sql code can be plugged right in the notebook</td>
     </tr>     
     
     </table>
</body>
</head>

</html>
"""

In [3]:
from IPython.core.display import display, HTML
display(HTML(html_script))

Function,Representation,Usage
Matplotlib,%matplotlib inline,Matplotlib is a very used data vizualization package in python. 'inline' to show the visualizations inside jupyter notebook itself.
time,%time x = range(100000),time() function helps in measuring estimation time.
writefile,%%writefile write_file.py,writefile creates a file and write in it. If the file exists it is overwritten otherwise it is created.
HTML,%%HTML -- Image embedded in jupyter notebook,HTML() is used to embed html markup in jypyter notebook
Latex,%%latex F(x)= Ax + B,Embed equations in notebook with latex() magical function
load_ext,%load_ext sql,"load_ext() is used to load other extensions. For instance, a sql code can be plugged right in the notebook"


In [4]:
ps = BeautifulSoup(html_script)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [5]:
ps

<!DOCTYPE html>
<html lang="en">
<head>
<title>Jupyter Notebook magic functions</title>
</head><body>
<h1 style="color:DarkBlue">Exploring Jupyter Notebook magic functions</h1>
<p id="author">Author :: Carlos St Valery Vouking.</p>
<p id="description"> These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here</p>
<h3 style="color:red">Magical Functions</h3>
<table id="function">
<tr>
<th>Function</th>
<th>Representation</th>
<th>Usage</th>
</tr>
<tr>
<td> Matplotlib</td>
<td>%matplotlib inline</td>
<td>Matplotlib is a very used data vizualization package in python. 'inline' to show the visualizations inside jupyter notebook itself.</td>
</tr>
<tr>
<td>time</td>
<td>%time x = range(100000)</td>
<td>time() function helps in measuring estimation time.</td>
</tr>
<tr>
<td>writefile</td>
<td>%%writefile write_f

In [6]:
# use the name parameter to extract by tag name: 'body'
body = ps.find(name="body")

In [7]:
print(body)

<body>
<h1 style="color:DarkBlue">Exploring Jupyter Notebook magic functions</h1>
<p id="author">Author :: Carlos St Valery Vouking.</p>
<p id="description"> These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here</p>
<h3 style="color:red">Magical Functions</h3>
<table id="function">
<tr>
<th>Function</th>
<th>Representation</th>
<th>Usage</th>
</tr>
<tr>
<td> Matplotlib</td>
<td>%matplotlib inline</td>
<td>Matplotlib is a very used data vizualization package in python. 'inline' to show the visualizations inside jupyter notebook itself.</td>
</tr>
<tr>
<td>time</td>
<td>%time x = range(100000)</td>
<td>time() function helps in measuring estimation time.</td>
</tr>
<tr>
<td>writefile</td>
<td>%%writefile write_file.py</td>
<td>writefile creates a file and write in it. If the file exists it is overwritten 

In [8]:
# use text attribute to get the content of the tag
print(body.find(name="h1").text)

Exploring Jupyter Notebook magic functions


In [9]:
# get the first element
print(body.find(name="p"))

<p id="author">Author :: Carlos St Valery Vouking.</p>


In [10]:
# get all elements
print(body.findAll(name="p"))

[<p id="author">Author :: Carlos St Valery Vouking.</p>, <p id="description"> These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here</p>]


In [11]:
# iterate through each element
all_elements = [elt for elt in body.findAll(name="p")]
for p in all_elements:
    print(p.text)

Author :: Carlos St Valery Vouking.
 These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here


* Add attributes in the selection process

In [12]:
print(body.find(name="p", attrs={"id":"author"}))

<p id="author">Author :: Carlos St Valery Vouking.</p>


In [13]:
print(body.find(name="p", attrs ={"id":"description"}))

<p id="description"> These so called 'magical functions' are used to extend the development environment to an entire new level.
       'Magical functions' always begin with a '%' sign for a single line and '%%' for a multiline. We will explore some of them here</p>


* set up the complete scraping process.

In [17]:
# body
body = ps.find(name = "body")
# table
function_table = ps.find(name='table', attrs={"id": "function"})
# Iterate through each row ine table,,,skip the first row
All_rows_first_row_skipped = function_table.findAll(name="tr")[1:]

for row in All_rows_first_row_skipped:
    #function
    function = row.findAll(name = 'td')[0].text
    #representation
    representation = row.findAll(name = 'td')[1].text
    #usage
    usage = row.findAll(name = 'td')[2].text
    
    print(function, representation)    

 Matplotlib %matplotlib inline
time %time x = range(100000)
writefile %%writefile write_file.py
HTML %%HTML --  Image embedded in jupyter notebook 
Latex %%latex F(x)= Ax + B
load_ext %load_ext sql
