# WEB SCRAPING

## BeautifulSoup and Requests

In [3]:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd

In [4]:
# Page whose HTML is parsed in the following cells.
url = 'https://www.scrapethissite.com/pages/forms/'

# A timeout keeps the cell from hanging indefinitely on an unresponsive server,
# and raise_for_status() turns an HTTP error (4xx/5xx) into an exception here
# instead of letting the later cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [66]:
# Name the parser explicitly. The generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so the parsed tree can differ
# between environments; 'html.parser' ships with Python itself.
soup = BS(response.text, 'html.parser')
#print(soup.prettify())  # uncomment to inspect the full parsed document

### `find` and `find_all`

In [7]:
# find() returns only the first matching tag in the document.
soup.find(name='div')

<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

In [8]:
# find_all() returns a list of every <p> tag whose CSS class is "lead".
soup.find_all('p', attrs={'class': 'lead'})

[<p class="lead">
                             Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                             Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                         </p>]

In [9]:
# Text of the first "lead" paragraph with leading/trailing whitespace removed.
soup.find('p', attrs={'class': 'lead'}).get_text().strip()

'Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.\n                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.'

In [10]:
# Stripped text of the first table header cell on the page.
soup.find(name='th').get_text().strip()

'Team Name'

In [11]:
# Grab the first <table> on the page and all of its header cells.
table = soup.find(name='table')
headers = table.find_all(name='th')

# strip() removes any whitespace surrounding each header's text, giving clean
# column names.
formatted_headers = [header_cell.text.strip() for header_cell in headers]

# Empty frame that only defines the column layout; rows are added in the next cell.
df = pd.DataFrame(data=None, columns=formatted_headers)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


In [12]:
# Every <tr> after the first is treated as a data row; the first one is skipped
# as the header row.
column_data = table.find_all('tr')[1:]

# Collect the stripped text of each row's <td> cells. Rows that contain no <td>
# at all are skipped, because an empty row cannot be aligned with the columns.
scraped_rows = []
for row in column_data:
    formatted_row_data = [cell.text.strip() for cell in row.find_all('td')]
    if formatted_row_data:
        scraped_rows.append(formatted_row_data)

# Build the frame in a single step rather than appending with df.loc[len(df)]:
# that is quadratic in the number of rows, and re-running the cell would append
# every row a second time. Rebuilding from df.columns keeps the cell idempotent.
df = pd.DataFrame(scraped_rows, columns=df.columns)

In [13]:
# Display the frame after the scraped rows have been added.
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
5,Edmonton Oilers,1990,37,37,,0.463,272,272,0
6,Hartford Whalers,1990,31,38,,0.388,238,276,-38
7,Los Angeles Kings,1990,46,24,,0.575,340,254,86
8,Minnesota North Stars,1990,27,39,,0.338,256,266,-10
9,Montreal Canadiens,1990,39,30,,0.487,273,249,24


In [14]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
df.to_csv(path_or_buf='Hockey_team.csv', index=False)

### Scraping Data From A Real Website + Pandas

In [16]:
from bs4 import BeautifulSoup as BS
import requests

In [17]:
# Wikipedia page whose tables are scraped in the following cells.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() fails loudly on an HTTP error instead of letting the
# later cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser is installed, so results can differ between machines.
soup = BS(response.text, 'html.parser')
#print(soup.prettify())  # uncomment to inspect the full parsed document

In [18]:
# Select the first <table> element found on the page and print its raw HTML.
table = soup.find_all(name='table')[0]
print(table)

<table class="wikitable sortable">
<caption>
</caption>
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD millions)
</th>
<th>Revenue growth
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Walmart" title="Walmart">Walmart</a>
</td>
<td><a href="/wiki/Retail" title="Retail">Retail</a>
</td>
<td style="text-align:center;">648,125
</td>
<td style="text-align:center;"><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/></span></span> <span data-sort-value="7000300000000000000♠" style="display:none"></span> 6.0%
</td>
<td style="text-align:center;">2,100,000
</td>
<td><a href="/wiki/Bentonville,_Arkansa

In [19]:
# All header cells (<th>) of the selected table, still as tag objects.
world_titles = table.find_all(name='th')
print(world_titles)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


In [20]:
# Reduce each header tag to its text; strip() drops the trailing newline that
# each <th> carries in the printed markup.
world_table_titles = [header_cell.text.strip() for header_cell in world_titles]
print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [21]:
import pandas as pd

In [22]:
# Empty frame that only defines the column layout; rows are added in the next cell.
df = pd.DataFrame(data=None, columns=world_table_titles)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [23]:
# Every <tr> after the first is a data row; the first one holds the <th> headers.
column_data = table.find_all('tr')[1:]

# Collect the stripped text of each row's <td> cells. Rows that contain no <td>
# at all are skipped, because an empty row cannot be aligned with the columns.
scraped_rows = []
for row in column_data:
    individual_row_data = [cell.text.strip() for cell in row.find_all('td')]
    if individual_row_data:
        scraped_rows.append(individual_row_data)

# Build the frame in a single step rather than appending with df.loc[len(df)]:
# that is quadratic in the number of rows, and re-running the cell would append
# every row a second time. Rebuilding from df.columns keeps the cell idempotent.
df = pd.DataFrame(scraped_rows, columns=df.columns)

    

In [24]:
# Display the frame after the scraped rows have been added.
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


In [25]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
df.to_csv(path_or_buf='companies.csv', index=False)

## Scraping the second table

In [27]:
from bs4 import BeautifulSoup as BS
import requests

In [28]:
# Same Wikipedia page as the previous section; it is fetched again here so this
# section can run on its own.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() fails loudly on an HTTP error instead of letting the
# later cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser is installed, so results can differ between machines.
soup = BS(response.text, 'html.parser')


In [29]:
# Index 1 selects the second <table> element on the page.
table = soup.find_all(name='table')[1]
table

<table class="wikitable sortable">
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD billions)
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Cargill" title="Cargill">Cargill</a>
</td>
<td>Food industry
</td>
<td style="text-align:center;">177
</td>
<td style="text-align:center;">160,000
</td>
<td><a href="/wiki/Minnetonka,_Minnesota" title="Minnetonka, Minnesota">Minnetonka, Minnesota</a>
</td></tr>
<tr>
<td>2
</td>
<td><a class="mw-redirect" href="/wiki/Koch_Industries" title="Koch Industries">Koch Industries</a>
</td>
<td>Conglomerate
</td>
<td style="text-align:center;">125
</td>
<td style="text-align:center;">120,000
</td>
<td><a href="/wiki/Wichita,_Kansas" title="Wichita, Kansas">Wichita, Kansas</a>
</td></tr>
<tr>
<td>3
</td>
<td><a class="mw-redirect" href="/wiki/Publix_Super_Markets" title="Publix Super Markets">Publix Super Markets</a>
</td>
<td>Retail
</td>
<td style="text-align:center;">54.5
</td>


In [30]:
# Header cells (<th>) of the second table.
headers = table.find_all(name='th')

# Reduce each header tag to its text; strip() drops the trailing newline that
# each <th> carries in the markup shown above.
formated_headers = [header_cell.text.strip() for header_cell in headers]
print(formated_headers)

['Rank', 'Name', 'Industry', 'Revenue (USD billions)', 'Employees', 'Headquarters']


In [31]:
import pandas as pd

In [32]:
# Empty frame that only defines the column layout; rows are added in the next cell.
df = pd.DataFrame(data=None, columns=formated_headers)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters


In [33]:
# Every <tr> after the first is a data row; the first one holds the <th> headers.
column_data = table.find_all('tr')[1:]

# Collect the stripped text of each row's <td> cells. Rows that contain no <td>
# at all are skipped, because an empty row cannot be aligned with the columns.
scraped_rows = []
for row in column_data:
    formatted_row_data = [cell.text.strip() for cell in row.find_all('td')]
    if formatted_row_data:
        scraped_rows.append(formatted_row_data)

# Build the frame in a single step rather than appending with df.loc[len(df)]:
# that is quadratic in the number of rows, and re-running the cell would append
# every row a second time. Rebuilding from df.columns keeps the cell idempotent.
df = pd.DataFrame(scraped_rows, columns=df.columns)
    

In [34]:
# Display the frame after the scraped rows have been added.
df


Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters
0,1,Cargill,Food industry,177.0,160000,"Minnetonka, Minnesota"
1,2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
2,3,Publix Super Markets,Retail,54.5,250000,"Lakeland, Florida"
3,4,"Mars, Incorporated",Food industry,47.0,140000,"McLean, Virginia"
4,5,H-E-B,Retail,43.6,145000,"San Antonio, Texas"
5,6,Reyes Holdings,Wholesaling,40.0,36000,"Rosemont, Illinois"
6,7,Enterprise Holdings,Car rental,35.0,90000,"Clayton, Missouri"
7,8,C&S Wholesale Grocers,Wholesaling,34.7,15000,"Keene, New Hampshire"
8,9,Love's,Petroleum industry and Retail,26.5,40000,"Oklahoma City, Oklahoma"
9,10,Southern Glazer's Wine and Spirits,Food industry,26.0,24000,"Miramar, Florida"


In [35]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
df.to_csv(path_or_buf='List_of_largest_private_companies.csv', index=False)