## Importing Necessary Libraries

In [2]:
from pathlib import Path  # Standard-library filesystem paths

import pandas as pd  # Library for handling tabular data
import requests  # Library for making HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML

## Fetching the Web Page

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)  # Sending a request to the webpage
# Fail loudly on a 4xx/5xx response instead of parsing an error page as if it
# were the article (which would only surface later as a missing table).
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')  # Parsing the HTML content

## Extracting the Relevant Table

In [4]:
# The page contains several <table> elements; the one used here is the second
# (index 1). Stop early if the page does not contain at least two tables.
tables = soup.find_all('table')
if len(tables) <= 1:
    raise ValueError("Expected table not found on the webpage.")
table = tables[1]

##### Alternative method: selecting the table by its CSS class (more robust than relying on the table's position on the page)
```python
wiki_tables = soup.find_all('table', class_='wikitable sortable')
if wiki_tables:
    table = wiki_tables[0]  # Selecting the first matching table

print(table)  # Printing table for verification (optional)
```

In [5]:
print(table)  # Optional check: print the selected table's HTML to confirm it is the intended one

<table class="wikitable sortable">
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD billions)
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Cargill" title="Cargill">Cargill</a>
</td>
<td>Food industry
</td>
<td style="text-align:center;">177
</td>
<td style="text-align:center;">160,000
</td>
<td><a href="/wiki/Minnetonka,_Minnesota" title="Minnetonka, Minnesota">Minnetonka, Minnesota</a>
</td></tr>
<tr>
<td>2
</td>
<td><a class="mw-redirect" href="/wiki/Koch_Industries" title="Koch Industries">Koch Industries</a>
</td>
<td>Conglomerate
</td>
<td style="text-align:center;">125
</td>
<td style="text-align:center;">120,000
</td>
<td><a href="/wiki/Wichita,_Kansas" title="Wichita, Kansas">Wichita, Kansas</a>
</td></tr>
<tr>
<td>3
</td>
<td><a class="mw-redirect" href="/wiki/Publix_Super_Markets" title="Publix Super Markets">Publix Super Markets</a>
</td>
<td>Retail
</td>
<td style="text-align:center;">54.5
</td>


## Extracting Table Headers

In [7]:
# Every <th> element inside the selected table is treated as a column header.
world_titles = table.find_all('th')
# Take each header's text and trim the surrounding whitespace/newlines.
world_table_titles = [header_cell.get_text().strip() for header_cell in world_titles]

## Creating a DataFrame

In [8]:
# Create an empty DataFrame whose columns are the header strings collected in
# world_table_titles. It defines the column layout for the rows extracted below.
df = pd.DataFrame(columns=world_table_titles)

## Extracting Table Data

In [9]:
# Collect every data row first, then build the DataFrame in a single step.
# Appending with df.loc[len(df)] duplicated all rows whenever this cell was
# re-run; rebuilding from the collected rows gives the same result every time.
column_data = table.find_all('tr')  # Finding all table rows
expected_columns = list(df.columns)  # Column layout defined when df was created
scraped_rows = []
for row in column_data[1:]:  # Skipping the header row
    # Text of each <td> cell with surrounding whitespace/newlines removed
    individual_row_data = [data.text.strip() for data in row.find_all('td')]
    if not individual_row_data:
        continue  # A row without any <td> cells carries no data to record
    if len(individual_row_data) != len(expected_columns):
        # Report the offending row instead of pandas' generic mismatch error
        raise ValueError(
            f"Row has {len(individual_row_data)} cells but the table has "
            f"{len(expected_columns)} columns: {individual_row_data}"
        )
    scraped_rows.append(individual_row_data)

df = pd.DataFrame(scraped_rows, columns=expected_columns)

In [10]:
df  # Display the assembled DataFrame as this cell's output

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters
0,1,Cargill,Food industry,177.0,160000,"Minnetonka, Minnesota"
1,2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
2,3,Publix Super Markets,Retail,54.5,250000,"Lakeland, Florida"
3,4,"Mars, Incorporated",Food industry,47.0,140000,"McLean, Virginia"
4,5,H-E-B,Retail,43.6,145000,"San Antonio, Texas"
5,6,Reyes Holdings,Wholesaling,40.0,36000,"Rosemont, Illinois"
6,7,Enterprise Holdings,Car rental,35.0,90000,"Clayton, Missouri"
7,8,C&S Wholesale Grocers,Wholesaling,34.7,15000,"Keene, New Hampshire"
8,9,Love's,Petroleum industry and Retail,26.5,40000,"Oklahoma City, Oklahoma"
9,10,Southern Glazer's Wine and Spirits,Food industry,26.0,24000,"Miramar, Florida"


## Saving Data to CSV File

In [11]:
# Write the CSV relative to the current working directory rather than to an
# absolute folder under one user's Windows profile, so the cell runs unchanged
# on any machine. Change this path to save the file somewhere else.
output_path = Path("data_scraping.csv")
df.to_csv(output_path, index=False)  # Saving without index column

# Print the resolved absolute location so it is clear where the file landed.
print("Data saved successfully at:", output_path.resolve())

Data saved successfully at: C:\Users\user\Desktop\data_scraped.csv
