# Scraping from a real website + Pandas (Part 2)

In [7]:
# Import BeautifulSoup to parse HTML and requests to download the page

from bs4 import BeautifulSoup
import requests

In [8]:
# Wikipedia page that hosts the company tables to scrape

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Download the page. The timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)

# Fail right here on a 4xx/5xx response instead of silently parsing an error page,
# which would otherwise surface later as a confusing IndexError when picking the table.
page.raise_for_status()

# 'html' lets BeautifulSoup choose among the HTML parsers that are installed,
# so the parser actually used can differ between environments.
soup = BeautifulSoup(page.text, 'html')

In [9]:
# Pick the table to scrape
# Gather every <table> element on the page, then keep the one at position 3
# (the fourth table). The index is tied to the page layout at the time of this run.

all_tables = soup.find_all('table')
table = all_tables[3]
print(table)

<table class="wikitable sortable">
<caption>
</caption>
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Profits<br/>(USD millions)
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>
<td>Electronics
</td>
<td style="text-align:center;">99,803
</td></tr>
<tr>
<td>2
</td>
<td><a href="/wiki/Microsoft" title="Microsoft">Microsoft</a>
</td>
<td>Technology
</td>
<td style="text-align:center;">72,738
</td></tr>
<tr>
<td>3
</td>
<td><a href="/wiki/Alphabet_Inc." title="Alphabet Inc.">Alphabet</a>
</td>
<td>Technology
</td>
<td style="text-align:center;">59,972
</td></tr>
<tr>
<td>4
</td>
<td><a href="/wiki/United_States_Postal_Service" title="United States Postal Service">United States Postal Service</a>
</td>
<td>Logistics
</td>
<td style="text-align:center;">56,046
</td></tr>
<tr>
<td>5
</td>
<td><a href="/wiki/ExxonMobil" title="ExxonMobil">ExxonMobil</a>
</td>
<td>Petroleum industry
</td>
<td style="text-align:center;">55,740
</td

In [10]:
# Extract all header cells (<th> elements) from the extracted table
# NOTE: this collects every <th> in the table, so it assumes headers only occur in the header row

world_titles = table.find_all('th')

In [11]:
# Inspect the raw <th> elements before reducing them to plain text
world_titles

[<th>Rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Profits<br/>(USD millions)
 </th>]

In [12]:
# Reduce each header cell to its visible text, trimming the surrounding
# whitespace/newlines, so the results can be used as column names

world_table_titles = []
for header_cell in world_titles:
    world_table_titles.append(header_cell.text.strip())

print(world_table_titles)

['Rank', 'Name', 'Industry', 'Profits(USD millions)']


In [14]:
# Import Pandas library to create dataframe

import pandas as pd

In [15]:
# Create an empty dataframe whose columns are the scraped header titles
df = pd.DataFrame(columns = world_table_titles)

# Display it to confirm the column names before any rows are added
df

Unnamed: 0,Rank,Name,Industry,Profits(USD millions)


In [17]:
# Extract each row from the extracted table
# find_all('tr') returns every <tr> in the table, including the header row at index 0

column_data = table.find_all('tr')

In [18]:
# Collect the text of every data row first, then build the dataframe in one step.
# Appending with df.loc[len(df)] inside the loop re-enlarges the frame on every
# iteration and, worse, appends duplicate rows whenever this cell is re-run.
rows = []
for row in column_data[1:]:  # index 0 is the header row: it holds <th> cells and no <td> cells
    cells = row.find_all('td')  # Extract the cells from each row
    if not cells:
        # A row without <td> cells (e.g. another header row) carries no data;
        # skip it instead of failing on a length mismatch.
        continue
    rows.append([cell.text.strip() for cell in cells])  # Keep only the trimmed text of each cell

# Rebuild df from scratch with the same columns, so running this cell again
# always yields the same table.
df = pd.DataFrame(rows, columns=df.columns)

In [19]:
# Display the populated dataframe to check the scraped rows
df

Unnamed: 0,Rank,Name,Industry,Profits(USD millions)
0,1,Apple,Electronics,99803
1,2,Microsoft,Technology,72738
2,3,Alphabet,Technology,59972
3,4,United States Postal Service,Logistics,56046
4,5,ExxonMobil,Petroleum industry,55740
5,6,JPMorgan Chase,Financials,37676
6,7,Chevron Corporation,Petroleum industry,35465
7,8,Pfizer,Pharmaceutical industry,31372
8,9,Bank of America,Financials,27528
9,10,Meta Platforms,Technology,23200


In [20]:
# Save the scraped table as CSV; index = False keeps the 0..n-1 row labels out of the file.
# NOTE: this is an absolute, machine-specific Windows path - adjust it before running elsewhere.
output_csv = r'C:\Users\f.pranata\Documents\Companies_by_Profit.csv'
df.to_csv(output_csv, index = False)