## Project Description:
In this project, the list of the largest companies in the United States by revenue (2024) is scraped from Wikipedia and saved as a CSV file.

### Imports

In [39]:
# imports
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

### Data Gathering

In [40]:
# set url
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# Identify the client explicitly: Wikimedia's User-Agent policy asks for a
# descriptive agent, and generic library defaults may be rejected.
headers = {'User-Agent': 'largest-us-companies-scraper/1.0 (educational notebook; python-requests)'}
# get response; the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, headers=headers, timeout=30)
# stop here on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
# parse the html content with BeautifulSoup; naming the stdlib parser explicitly
# keeps the parse tree independent of which optional parsers are installed
soup = BeautifulSoup(response.text, 'html.parser')

In [41]:
# the first <table> on the page holds the largest companies by revenue in 2024;
# only that one match is requested, then unpacked from the result list.
table = soup.find_all('table', limit=1)[0]

In [42]:
# collect every <th> (header cell) tag inside the table and show the raw tags
table_titles = table.find_all(name='th')
print(table_titles)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


In [43]:
# pull the visible text out of each header tag and trim surrounding whitespace
table_headers = [header_tag.get_text().strip() for header_tag in table_titles]
print(table_headers)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [44]:
# start with an empty dataframe whose columns are the scraped table headers
df = pd.DataFrame(data=None, columns=table_headers)

In [45]:
# display the dataframe (no rows yet) to confirm the column headers were set
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [46]:
# collect every <tr> (table row) tag; the first one is the header row
rows_html = table.find_all(name='tr')

In [47]:
# Extract the cell text of every data row (the header row at index 0 is skipped).
# Rows are gathered in a plain list and converted to a DataFrame in one step:
# growing the frame with df.loc[len(df)] inside the loop is slow and appends
# every row a second time whenever this cell is re-run.
records = []
for row_html in rows_html[1:]:
    row_entries = [cell.text.strip() for cell in row_html.find_all('td')]
    # rows without any <td> cells (e.g. extra header rows) carry no data
    if not row_entries:
        continue
    # a row that cannot be aligned with the headers means the page layout changed
    if len(row_entries) != len(table_headers):
        raise ValueError(
            f'expected {len(table_headers)} cells per row, got {len(row_entries)}: {row_entries}'
        )
    records.append(row_entries)
# build the dataframe once from all collected rows
df = pd.DataFrame(records, columns=table_headers)
# assess dataframe
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


### Data Cleaning

In [48]:
# keep only the first occurrence of each fully identical row
df = df[~df.duplicated()]
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


### Export Data

In [49]:
# destination of the exported CSV file
path = r"C:/Users/SNT/Documents/analyst/da-bootcamp/python_projects/Files/companies.csv"
# an existing file is never overwritten; the CSV is written only when nothing is at the path yet
if os.path.exists(path):
    print('File exists already')
else:
    df.to_csv(path, index=False)
    print('CSV file created successfully!')

File exists already


### File Sorting

In [50]:
# Execute the companion notebook Automatic_File_Sorter.ipynb through the %run magic.
# Presumably it moves the exported csv file into a csv folder; its logic is not
# visible from this notebook — TODO confirm.
# NOTE(review): the path is absolute and machine-specific, so this cell will not
# resolve on a machine without this directory layout.
%run C:\Users\SNT\Documents\analyst\da-bootcamp\python_projects\Automatic_File_Sorter.ipynb

All conditions were satisfied, '.csv' file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
Not all conditions were satisfied, no file moved!
