In [2]:
"""
Web scraping with BeautifulSoup
David Oduola
Created: 16/02/2025
"""
    
from bs4 import BeautifulSoup #import the necessary libraries
import requests
import pandas as pd

In [3]:
# Download the Wikipedia article and locate the first "wikitable" on the page.
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'  # page to scrape

# Identify the client explicitly; Wikimedia's User-Agent policy asks for a
# descriptive value and may reject generic library defaults.
# NOTE(review): replace with a string that includes real contact details.
request_headers = {'User-Agent': 'movie-gross-scraper/1.0 (educational notebook)'}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, headers=request_headers, timeout=30)
# Fail here on 4xx/5xx instead of silently parsing an error page below.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')  # parse the HTML

# Find all tables with class "wikitable"
tables = soup.find_all("table", class_="wikitable")
if not tables:
    raise ValueError(f"No table with class 'wikitable' found at {url}")
table = tables[0]  # the first wikitable on the page is the one used below

header_row = table.find('tr')
if header_row is None:
    raise ValueError("The first 'wikitable' contains no rows")

# Column names come from the first six <th> cells of the first row.
headers = [th.text.strip() for th in header_row.find_all('th')[:6]]
headers


['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year', 'Ref']

In [4]:
# Build the DataFrame from the table body.
# Rows are collected in a plain list and the frame is constructed once at the
# end: appending with df.loc[len(df)] re-allocates the frame on every row.
rows = []
for row in table.find_all('tr')[1:]:  # skip the header row
    # Include <th> as well as <td>: some cells in a body row are row headers.
    cells = row.find_all(['th', 'td'])
    row_data = [cell.text.strip() for cell in cells]

    # Keep only rows whose cell count matches the header count; rows of any
    # other width are skipped.
    if len(row_data) == len(headers):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=headers)

In [14]:
# Reorder the columns and convert "Worldwide gross" from text to float.
# .copy() makes the selection an independent frame, so the assignment below
# does not trigger SettingWithCopyWarning.
df = df[['Title', 'Year', 'Worldwide gross', 'Rank', 'Peak']].copy()

# Keep only the text after the last "$": any marker placed before the dollar
# sign (for instance a label ending in a digit, as in "F8$1,238,764,765")
# would otherwise leak its digits into the number.
gross_text = df["Worldwide gross"].astype(str).str.rsplit("$", n=1).str[-1]

# Strip everything except digits and the decimal point. Preserving "." keeps
# this cell safe to re-run: an already-converted value such as 2923706026.0
# stringifies to "2923706026.0", and dropping the "." would inflate it tenfold.
# Unparseable cells become NaN instead of aborting the whole conversion.
df["Worldwide gross"] = pd.to_numeric(
    gross_text.str.replace(r"[^0-9.]", "", regex=True),
    errors="coerce",
).astype(float)



In [15]:
# Write the DataFrame to a CSV file, omitting the index column.
df.to_csv(path_or_buf='movie_data_scrape.csv', index=False)