In [85]:
# Web scraping a Wikipedia page (largest companies in the United States by revenue)

### Importing libraries

In [74]:
import pandas as pd
import lxml
from bs4 import BeautifulSoup
import requests

### Getting a webpage from wikipedia

In [4]:
# Download the Wikipedia article and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# A descriptive User-Agent identifies this script to the server.
# NOTE(review): replace the placeholder with real contact details before heavy use.
request_headers = {'User-Agent': 'largest-us-companies-notebook/1.0 (educational scraping example)'}

# The timeout stops the cell from hanging forever on a stalled connection;
# raise_for_status() turns HTTP error responses (4xx/5xx) into an exception
# instead of silently parsing an error page.
response = requests.get(url, headers=request_headers, timeout=30)
response.raise_for_status()

content = response.text
soup = BeautifulSoup(content, 'lxml')

### Getting table details

In [76]:
# Locate the first table with the "wikitable" class on the page.
table_companies = soup.find('table', class_='wikitable')
if table_companies is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("No table with class 'wikitable' was found on the page")

# Header names / column names.
# Only the first row is searched for <th> cells, so any row-header <th>
# cells further down the table cannot leak into the column names.
table_body = table_companies.find('tbody')
header_row = table_body.find('tr')
table_header = header_row.find_all('th')
column_names = [head.text.strip() for head in table_header]
print(column_names)

# Clean the column names: lower-case, spaces -> underscores, parentheses removed.
# The loop variable has its own name so it does not shadow the result `col_names`.
col_names = [
    raw_name.lower().replace(' ', '_').replace('(', '').replace(')', '').strip()
    for raw_name in column_names
]
print(col_names)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']
['rank', 'name', 'industry', 'revenue_usd_millions', 'revenue_growth', 'employees', 'headquarters']


In [86]:
# Collect every <tr> in the table body. Index 0 is the header row,
# so the company records start at index 1.
table_rows = table_body.find_all('tr')

# Preview the first company (Walmart): print each cell's text on its own line.
walmart = table_rows[1].find_all('td')
walmart_values = [cell.text.strip() for cell in walmart]
for value in walmart_values:
    print(value)

1
Walmart
Retail
648,125
6.0%
2,100,000
Bentonville, Arkansas


In [78]:
# creating a dictionary with column names and company details
final_table = []
for data1 in table_rows[1:]:
    row_data = []
    for data2 in data1.find_all('td'):
        row_data.append(data2.text.strip())
    final_table.append(dict(zip(col_names,row_data)))

### Creating a dataframe using Pandas

In [79]:
# Turn the list of per-company dictionaries into a DataFrame
# (one row per dictionary, one column per key) and preview the first rows.
company_df = pd.DataFrame(data=final_table)
company_df.head(n=5)

Unnamed: 0,rank,name,industry,revenue_usd_millions,revenue_growth,employees,headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"


### Manipulating dataframe using Pandas

In [80]:
# Show the column labels, then the dtype pandas assigned to each column.
for summary in (company_df.columns, company_df.dtypes):
    print(summary)

Index(['rank', 'name', 'industry', 'revenue_usd_millions', 'revenue_growth',
       'employees', 'headquarters'],
      dtype='object')
rank                    object
name                    object
industry                object
revenue_usd_millions    object
revenue_growth          object
employees               object
headquarters            object
dtype: object


In [81]:
# Create two columns holding the city and state parts of "City, State".
# The vectorised .str accessor splits once, on a single separator, for both
# columns. Values without a comma yield NaN for the state instead of raising
# IndexError, and surrounding whitespace is stripped from both parts.
hq_parts = company_df['headquarters'].str.split(',')
company_df['city_name'] = hq_parts.str[0].str.strip()
company_df['state_name'] = hq_parts.str[1].str.strip()

In [82]:
# Preview the first five rows, now including city_name and state_name.
company_df.head(n=5)

Unnamed: 0,rank,name,industry,revenue_usd_millions,revenue_growth,employees,headquarters,city_name,state_name
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas",Bentonville,Arkansas
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington",Seattle,Washington
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California",Cupertino,California
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota",Minnetonka,Minnesota
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska",Omaha,Nebraska


In [83]:
# Convert the thousands-separated text columns (e.g. "648,125") to integers.
# Casting to str first keeps this cell idempotent: re-running it on columns
# that are already numeric no longer fails on a missing .replace method.
for numeric_col in ('revenue_usd_millions', 'employees'):
    company_df[numeric_col] = pd.to_numeric(
        company_df[numeric_col].astype(str).str.replace(',', '', regex=False)
    )

In [84]:
# Preview the first five rows after the numeric conversion.
company_df.head(n=5)

Unnamed: 0,rank,name,industry,revenue_usd_millions,revenue_growth,employees,headquarters,city_name,state_name
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas",Bentonville,Arkansas
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington",Seattle,Washington
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California",Cupertino,California
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota",Minnetonka,Minnesota
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska",Omaha,Nebraska
