## Data Scrapping using Beautiful Soup
- Import Beautiful Soap
- Make a Get request to fetch Page Data
- Parse HTML
- Filter relevant parts

### Installation
pip install bs4

In [1]:
from urllib.request import urlopen

In [2]:
# Wikipedia article whose tables are scraped in this notebook.
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [3]:
# Send the GET request. The response object stays open until it is read and
# explicitly closed in the cells that follow.
android_data = urlopen(android_url)

In [4]:
# Check what kind of object urlopen handed back.
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [5]:
# Read the complete response body; it is passed to the HTML parser later on.
android_html = android_data.read()
# Uncomment to inspect the raw payload (very long output):
# print(android_html)

In [7]:
# The body has been read into android_html, so the response can be closed.
android_data.close()

## Parsing Data

In [8]:
from bs4 import BeautifulSoup as soup

In [9]:
# Parse the downloaded markup using the 'html.parser' backend.
android_soup = soup(android_html, 'html.parser')
# Uncomment to inspect the parsed document (very long output):
# print(android_soup)

In [11]:
# Check what kind of object the parser produced.
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


In [12]:
# Attribute-style access returns the first <h1> tag of the document.
android_soup.h1

<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>

In [13]:
# List every <h1> tag in the document. find_all is the current method name;
# findAll is the legacy BS3 spelling kept only as a deprecated alias.
android_soup.find_all('h1')

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]

In [14]:
# Collect every <table> carrying the CSS class "wikitable".
# find_all replaces the deprecated findAll alias; class_ is the keyword form
# of the {'class': ...} attribute filter.
tables = android_soup.find_all('table', class_='wikitable')

In [15]:
# Number of wikitable elements found on the page.
print(len(tables))

31


In [16]:
# Work with the first wikitable on the page; its markup is shown below.
android_table = tables[0]
# Uncomment to inspect the table markup (long output):
# print(android_table)

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Version number(s)
</th>
<th>Initial stable<br/>release date
</th>
<th>Supported (security fixes)
</th>
<th>API level
</th>
<th>References
</th></tr>
<tr>
<td rowspan="2">No official codename
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
</td></tr>
<tr>
<td>1.1
</td>
<td>February 9, 2009
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>2
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[14]</a></sup>
</td></tr>
<tr>
<td><a href="/wiki/Android_Cupcake" ti

## Extracting Useful Information
- Remove undesired tag
- Extract table header & data

In [17]:
# Gather every header cell (<th>) of the selected table.
# find_all replaces the deprecated findAll alias.
headers = android_table.find_all('th')
print(len(headers))
print(headers)

6
[<th>Name
</th>, <th>Version number(s)
</th>, <th>Initial stable<br/>release date
</th>, <th>Supported (security fixes)
</th>, <th>API level
</th>, <th>References
</th>]


In [18]:
# .text keeps the newline that sits before </th>, which is why an extra blank
# line shows up in the output.
print(headers[0].text)

Name



In [19]:
# Build the column titles from the header cells.
# get_text(separator=' ', strip=True) trims each text fragment and joins the
# fragments with one space. That handles the <br/> inside the release-date
# header without patching a hard-coded index, and it never drops a real
# character when a header has no trailing newline (which text[:-1] would do).
column_titles = [ct.get_text(separator=' ', strip=True) for ct in headers]
print(column_titles)

['Name', 'Version number(s)', 'Initial stable release date', 'Supported (security fixes)', 'API level', 'References']


In [20]:
# Confirm that the titles are held in a plain Python list.
type(column_titles)

list

In [21]:
# All table rows except the first one, which holds the column headers.
# find_all replaces the deprecated findAll alias.
rows_data = android_table.find_all('tr')[1:]
print(len(rows_data))

18


In [22]:
# Look at the raw markup of the first data row before extracting its cells.
first_row = rows_data[0]
print(first_row)

<tr>
<td rowspan="2">No official codename
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
</td></tr>


In [23]:
# Print the plain text of every data cell (<td>) in the first row.
# find_all replaces the deprecated findAll alias. strip() removes the newline
# before </td> but, unlike text[:-1], never cuts off a real character when
# that newline is missing.
first_row = rows_data[0].find_all('td')
for cell in first_row:
    print(cell.text.strip())

No official codename
1.0
September 23, 2008
No
1
[9]


In [24]:
# Print the plain text of every data cell (<td>) in the second row. The name
# cell is absent here because the first row's cell spans two rows.
# find_all replaces the deprecated findAll alias. strip() removes the newline
# before </td> but, unlike text[:-1], never cuts off a real character when
# that newline is missing.
second_row = rows_data[1].find_all('td')
for cell in second_row:
    print(cell.text.strip())

1.1
February 9, 2009
No
2
[9][14]


In [25]:
# Build one list of cell texts per table row.
#
# A <td rowspan="N"> is present only in the first of the N rows it covers, so
# its text is remembered per column and re-inserted into the rows below. This
# replaces the earlier special case that patched only the second row with a
# hard-coded string. Note: colspan is not expanded.
table_rows = []
pending_spans = {}  # column index -> (cell text, further rows still to fill)
for row in rows_data:
    cells = row.find_all('td')
    current_row = []
    next_cell = 0
    column = 0
    while next_cell < len(cells) or column in pending_spans:
        if column in pending_spans:
            # This column is still covered by a cell from an earlier row.
            text, rows_left = pending_spans[column]
            if rows_left > 1:
                pending_spans[column] = (text, rows_left - 1)
            else:
                del pending_spans[column]
        else:
            cell = cells[next_cell]
            next_cell += 1
            # strip() removes the newline before </td> without eating a real
            # character when that newline is absent.
            text = cell.text.strip()
            span = str(cell.get('rowspan', '1')).strip()
            if span.isdigit() and int(span) > 1:
                pending_spans[column] = (text, int(span) - 1)
        current_row.append(text)
        column += 1
    table_rows.append(current_row)

In [26]:
# Show the extracted rows: one list of cell texts per table row.
print(table_rows)

[['No official codename', '1.0', 'September 23, 2008', 'No', '1', '[9]'], ['No official codename', '1.1', 'February 9, 2009', 'No', '2', '[9][14]'], ['Cupcake', '1.5', 'April 27, 2009', 'No', '3', '[15]'], ['Donut', '1.6', 'September 15, 2009', 'No', '4', '[16]'], ['Eclair', '2.0 – 2.1', 'October 26, 2009', 'No', '5 – 7', '[17]'], ['Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'No', '8', '[18]'], ['Gingerbread', '2.3 – 2.3.7', 'December 6, 2010', 'No', '9 – 10', '[19]'], ['Honeycomb', '3.0 – 3.2.6', 'February 22, 2011', 'No', '11 – 13', '[20]'], ['Ice Cream Sandwich', '4.0 – 4.0.4', 'October 18, 2011', 'No', '14 – 15', '[21]'], ['Jelly Bean', '4.1 – 4.3.1', 'July 9, 2012', 'No', '16 – 18', '[22]'], ['KitKat', '4.4 – 4.4.4', 'October 31, 2013', 'No', '19 – 20', '[23]'], ['Lollipop', '5.0 – 5.1.1', 'November 12, 2014', 'No', '21 – 22', '[24]'], ['Marshmallow', '6.0 – 6.0.1', 'October 5, 2015', 'No', '23', '[25]'], ['Nougat', '7.0 – 7.1.2', 'August 22, 2016', 'No', '24 – 25', '[26][27][28][29]'

## Writing & Reading CSV Files
- CSV stands for Comma Separated File

In [27]:
import csv

filename = 'android_version_history.csv'
# newline='' hands line-ending control to the csv module, as its documentation
# requires for files passed to csv.writer.
with open(filename, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field containing a delimiter, quote character or
    # line break, so such values can no longer corrupt the file layout.
    writer = csv.writer(f, lineterminator='\n')

    # Write the header
    writer.writerow(column_titles)

    for row in table_rows:
        # Commas are still removed from the values, so the stored text is the
        # same as before (e.g. "September 23 2008").
        writer.writerow([w.replace(',', '') for w in row])

## Data Cleaning
- Removing unwanted commas & symbols
- undesired information

In [28]:
import pandas as pd

In [29]:
# Load the CSV file written earlier back into a DataFrame.
df = pd.read_csv('android_version_history.csv')

In [30]:
# Preview the first ten rows of the loaded table.
df.head(10)

Unnamed: 0,Name,Version number(s),Initial stable release date,Supported (security fixes),API level,References
0,No official codename,1.0,September 23 2008,No,1,[9]
1,No official codename,1.1,February 9 2009,No,2,[9][14]
2,Cupcake,1.5,April 27 2009,No,3,[15]
3,Donut,1.6,September 15 2009,No,4,[16]
4,Eclair,2.0 – 2.1,October 26 2009,No,5 – 7,[17]
5,Froyo,2.2 – 2.2.3,May 20 2010,No,8,[18]
6,Gingerbread,2.3 – 2.3.7,December 6 2010,No,9 – 10,[19]
7,Honeycomb,3.0 – 3.2.6,February 22 2011,No,11 – 13,[20]
8,Ice Cream Sandwich,4.0 – 4.0.4,October 18 2011,No,14 – 15,[21]
9,Jelly Bean,4.1 – 4.3.1,July 9 2012,No,16 – 18,[22]


In [31]:
# Value at row position 0, column position 1.
# A single iloc[row, col] lookup avoids the chained df.iloc[0][1], which
# indexes a label-indexed Series with an integer (deprecated positional
# fallback in pandas).
print(df.iloc[0, 1])

1.0


In [32]:
# Value at row position 4, column position 2.
# A single iloc[row, col] lookup avoids the chained df.iloc[4][2], which
# indexes a label-indexed Series with an integer (deprecated positional
# fallback in pandas).
print(df.iloc[4, 2])

October 26 2009
