Use BeautifulSoup to scrape the International Movies Database (IMDB) at imdb.com for top films released in year 2023 with the highest US box office. 

This is multi-page scraping
The scrape_movies function is responsible for this multi-page scraping.

(https://www.imdb.com/search/title/?release_date=2023&sort=boxoffice_gross_us,desc&start=1)
using 5 features
1. name - title of the movie,
2. year - release year of the movie,
3. imdb - IMDB score of the movie,
4. vote - number of votes.
5. rating - imdb rating

In [21]:
import bs4
import requests
import time
import random as ran
import pandas as pd
from nbconvert import PDFExporter
import nbformat

In [22]:
def scrape_mblock(movie_block):
    name_element = movie_block.find('a')
    year_element = movie_block.find('span', {'class': 'lister-item-year'})
    rating_element = movie_block.find('div', {'class': 'inline-block ratings-imdb-rating'})
    m_score_element = movie_block.find('span', {'class': 'metascore favorable'})
    votes_element = movie_block.find('span', {'name': 'nv'})
    return {
        'name': name_element.get_text() if name_element else None,
        'year': year_element.contents[0][1:-1] if year_element else None,
        'rating': float(rating_element.get('data-value')) if rating_element else None,
        'm_score': float(m_score_element.contents[0].strip()) if m_score_element else None,
        'votes': int(votes_element.get('data-value').replace(',', '')) if votes_element else None
    }

In [23]:
def scrape_page(movie_blocks):
    return [scrape_mblock(movie_block) for movie_block in movie_blocks]


In [24]:
def scrape_movies(link, target_count):
    movie_data = []
    page_number = 1
    
    while len(movie_data) < target_count:
        url = link + str(page_number)
        source = requests.get(url).text
        soup = bs4.BeautifulSoup(source, 'html.parser')
        movie_blocks = soup.findAll('div', {'class': 'lister-item-content'})
        
        if not movie_blocks:
            break
        
        movie_data.extend(scrape_page(movie_blocks))
        page_number += 50
        time.sleep(ran.randint(0, 10))
    
    return movie_data

In [26]:
def convert_notebook_to_pdf(notebook_file, output_pdf):
    with open(notebook_file, 'r') as f:
        nb = nbformat.read(f, as_version=4)
    pdf_exporter = PDFExporter()
    (body, resources) = pdf_exporter.from_notebook_node(nb)
    with open(output_pdf, 'wb') as f:
        f.write(body)

In [28]:
base_scraping_link = "https://www.imdb.com/search/title?release_date=2023-01-01,2023-12-31&sort=boxoffice_gross_us,desc&start="
top_movies = 150

films = scrape_movies(base_scraping_link, int(top_movies))

In [29]:
# Print and save results
print("\n\nList of top " + str(top_movies) + " movies:\n")
df = pd.DataFrame(films)
print(df)



List of top 150 movies:

                          name      year  rating  m_score     votes
0                         Jodi      2023     8.1      NaN     363.0
1               Shrek 2 Retold      2023     NaN      NaN  150659.0
2            Sit Down Stand Up      2023     NaN      NaN   88388.0
3                        R BnB      2023     9.1      NaN      11.0
4            King of the Block      2023     NaN      NaN    5497.0
..                         ...       ...     ...      ...       ...
145                 The Nun II      2023     NaN      NaN       NaN
146                    Ghosted  I) (2023     5.8      NaN   50629.0
147  Jagun Jagun (The Warrior)      2023     3.6      NaN     596.0
148                      FUBAR    2023–      6.5      NaN   31825.0
149                 Past Lives      2023     8.3     94.0   10097.0

[150 rows x 5 columns]


In [32]:
# Save DataFrame to CSV
csv_filename = "top_movies_2023.csv"
df.to_csv(csv_filename, index=False)
print(f"List of top {top_movies} movies saved as {csv_filename}")

List of top 150 movies saved as top_movies_2023.csv
