1. Import libraries

In [26]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

2. Set URL

In [27]:
# Address of the sample page that holds the "Monthly Savings" HTML table scraped below
url = "https://avinashjairam.github.io/tableExample1.html"

3. Make a request to the server the website is hosted on

In [28]:
# Send an HTTP GET request for the page and keep the response object.
# Without a timeout, requests waits forever if the server never answers;
# with one, the call raises requests.exceptions.Timeout after 10 seconds instead.
page = requests.get(url, timeout=10)

4. Check status code

In [37]:
# HTTP status code of the response: 200 means the request succeeded
page.status_code

200

5. Look at raw HTML

In [30]:
# Raw response body as bytes (note the b"..." prefix in the output), before any parsing
page.content

b"<html>\n<head>\n\n<title>Table Scraping: Ex. #1</title>\n\n<style>\ntable, th, td {\n  border: 1px solid black;\n  border-collapse: collapse;\n}\nth, td {\n  padding: 5px;\n  text-align: left;\n}\n</style>\n</head>\n\n\n<body>\n\n<h2>Monthly Savings</h2>\n<p>Let's Track Our Savings</p>\n\n<table>\n  <caption>Monthly Savings</caption>\n  <tr>\n    <th>Month</th>\n    <th>Savings</th>\n  </tr>\n  <tr>\n    <td>January</td>\n    <td>$100</td>\n  </tr>\n  <tr>\n    <td>February</td>\n    <td>$50</td>\n  </tr>\n\n <tr>\n    <td>March</td>\n    <td>$340</td>\n  </tr>\n  <tr>\n    <td>April</td>\n    <td>$50</td>\n  </tr>\n\n <tr>\n    <td>May</td>\n    <td>$10</td>\n  </tr>\n  <tr>\n    <td>June</td>\n    <td>$5</td>\n  </tr>\n\n <tr>\n    <td>July</td>\n    <td>$210</td>\n  </tr>\n  <tr>\n    <td>August</td>\n    <td>$54</td>\n  </tr>\n\n <tr>\n    <td>September</td>\n    <td>$20</td>\n  </tr>\n\n  <tr>\n    <td>October</td>\n    <td>$0</td>\n  </tr>\n\n  <tr>\n    <td>November</td>\n    

6. Import the raw HTML into BeautifulSoup

In [31]:
# Parse the downloaded bytes into a searchable tree using Python's built-in HTML parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

7. Look at formatted HTML via print statement and prettify()

In [32]:
# prettify() returns the parsed document as an indented string; print() is used so the
# line breaks are rendered instead of being shown as escaped \n characters
print(soup.prettify())

<html>
 <head>
  <title>
   Table Scraping: Ex. #1
  </title>
  <style>
   table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
th, td {
  padding: 5px;
  text-align: left;
}
  </style>
 </head>
 <body>
  <h2>
   Monthly Savings
  </h2>
  <p>
   Let's Track Our Savings
  </p>
  <table>
   <caption>
    Monthly Savings
   </caption>
   <tr>
    <th>
     Month
    </th>
    <th>
     Savings
    </th>
   </tr>
   <tr>
    <td>
     January
    </td>
    <td>
     $100
    </td>
   </tr>
   <tr>
    <td>
     February
    </td>
    <td>
     $50
    </td>
   </tr>
   <tr>
    <td>
     March
    </td>
    <td>
     $340
    </td>
   </tr>
   <tr>
    <td>
     April
    </td>
    <td>
     $50
    </td>
   </tr>
   <tr>
    <td>
     May
    </td>
    <td>
     $10
    </td>
   </tr>
   <tr>
    <td>
     June
    </td>
    <td>
     $5
    </td>
   </tr>
   <tr>
    <td>
     July
    </td>
    <td>
     $210
    </td>
   </tr>
   <tr>
    <td>
     August
    </td>

8. Locate all table rows on the page

In [33]:
# Collect every <tr> element in the document, header row included, in page order
rows = soup.find_all(name='tr')
rows

[<tr>
 <th>Month</th>
 <th>Savings</th>
 </tr>,
 <tr>
 <td>January</td>
 <td>$100</td>
 </tr>,
 <tr>
 <td>February</td>
 <td>$50</td>
 </tr>,
 <tr>
 <td>March</td>
 <td>$340</td>
 </tr>,
 <tr>
 <td>April</td>
 <td>$50</td>
 </tr>,
 <tr>
 <td>May</td>
 <td>$10</td>
 </tr>,
 <tr>
 <td>June</td>
 <td>$5</td>
 </tr>,
 <tr>
 <td>July</td>
 <td>$210</td>
 </tr>,
 <tr>
 <td>August</td>
 <td>$54</td>
 </tr>,
 <tr>
 <td>September</td>
 <td>$20</td>
 </tr>,
 <tr>
 <td>October</td>
 <td>$0</td>
 </tr>,
 <tr>
 <td>November</td>
 <td>$400</td>
 </tr>,
 <tr>
 <td>December</td>
 <td>$2</td>
 </tr>]

9. Use a for loop to iterate over the list of rows and extract each month and its savings

In [34]:
# Parallel lists: months[i] is the month name and savings[i] its amount as a float
months = []
savings = []

for row in rows:
  # Data cells are built with <td> tags; the header row uses <th>, so it yields
  # an empty list here and is skipped by the length check below
  cells = row.find_all('td')
  if len(cells) != 2:
    continue

  # get_text() returns the text inside the tag rather than the tag itself;
  # strip=True trims surrounding whitespace
  month = cells[0].get_text(strip=True)

  # Remove the leading "$" and any thousands separators so the text can be
  # converted to a float and used in arithmetic (e.g. "$1,200" -> 1200.0)
  amount_text = cells[1].get_text(strip=True).lstrip('$').replace(',', '')
  money = float(amount_text)

  months.append(month)
  savings.append(money)

10. Build a pandas DataFrame from the scraped lists

In [41]:
# Map each column name to the list that holds that column's values
monthly_savings = {
  'Months': months,
  'Savings': savings
}

# Build an in-memory table from the dictionary: one column per key, one row per list entry
df = pd.DataFrame(monthly_savings)

df

Unnamed: 0,Months,Savings
0,January,100.0
1,February,50.0
2,March,340.0
3,April,50.0
4,May,10.0
5,June,5.0
6,July,210.0
7,August,54.0
8,September,20.0
9,October,0.0


11. Export the DataFrame as a CSV file

In [39]:
# Running this cell creates (or overwrites) monthly_savings.csv on the local machine.
# The path is relative, so the file lands in the current working directory;
# index=False keeps the row labels out of the file.
df.to_csv(path_or_buf='monthly_savings.csv', index=False)

12. Get summary statistics of the table

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric Savings column
df.describe()

Unnamed: 0,Savings
count,12.0
mean,103.416667
std,138.105663
min,0.0
25%,8.75
50%,50.0
75%,127.5
max,400.0
