In [1]:
# Importing libraries
from bs4 import BeautifulSoup
import requests
import time

# Your user agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

In [2]:
# Send a GET request to the URL with the specified user agent
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
try:
    response = requests.get(url, headers=headers)
    response.raise_for_status() # Raise an HTTPError for bad responses

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
except requests.exceptions.RequestException as e:
        print(f'Failed to retrieve {url}. Error: {e}')

In [3]:
# Getting the links for all the movies
top_movie_urls = []
for link in soup.select('li a.ipc-title-link-wrapper'):
    top_movie_urls.append("https://www.imdb.com" + link.get('href'))

In [4]:
# Getting the items for each columns
movies_link = []
movies_title = []
movies_year = []
movies_duration = []
movies_description = []
ratings = []
imdb_ratings = []
movies_casts = []
movies_stars = []
directors = []
budgets = []
gross_international = []
countries = []

In [5]:
count = 0
failed_url = []

# Iterate through each URL and failed URL
for x in [top_movie_urls, failed_url]:
    # Iterate through each URL
    for url in x:
        try:
            # Send a GET request to the URL with the specified user agent
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')

            for sp in soup.select('span.hero__primary-text'):
                movies_title.append(sp.text)
            for sp in soup.select('div.jqlHBQ li:first-child'):
                movies_year.append(sp.text)
            for sp in soup.select('div.jqlHBQ li:last-child'):
                movies_duration.append(sp.text)
            for sp in soup.select('div.jqlHBQ li:nth-of-type(2) a'):
                ratings.append(sp.text)
            for sp in soup.select('span.chnFO'):
                movies_description.append(sp.text)
            for sp in soup.select('div.llNLpA span.cMEQkK'):
                imdb_ratings.append(sp.text)
            movies_link.append(url)

            casts = ""
            for sp in soup.select('div.gWwKlt a.gCQkeh'):
                casts += sp.text + ', '
            movies_casts.append(casts[:-2])

            director = ""
            for sp in soup.select("div.dIOekc ul li:first-child div a"):
                director += sp.text + ', '
            directors.append(director[:-2])

            star = ""
            for sp in soup.select('div.dIOekc li:nth-of-type(3) div a'):
                star += sp.text + ', '
            movies_stars.append(star[:-2])

            bx = soup.find(attrs={'data-testid': 'BoxOffice'})
            if bx:
                span_element = bx.find_all('span')
                budgets.append(span_element[3].text)
                gross_international.append(span_element[-1].text)
            else:
                budgets.append("")
                gross_international.append("")
            cty = soup.find(attrs={'data-testid': 'title-details-section'})
            countries.append(cty.find_all('a')[3].text)
            
            count += 1
            if count % 50 == 0:
                print(f'{count} successfully scraped')
                
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            failed_url.append(url)
            print(f'Failed to retrieve {url}')
            continue  # Skip to the next URL
print("All scraped")

50 successfully scraped
Failed to retrieve https://www.imdb.com/title/tt1187043/?ref_=chttp_t_87
100 successfully scraped
150 successfully scraped
200 successfully scraped
250 successfully scraped
All scraped


In [6]:
# Creating dataset dict
movie_dict = {
    "Title": movies_title,
    "Directors": directors,
    "Casts": movies_casts,
    "Star": movies_stars,
    "Description": movies_description,
    "Duration": movies_duration,
    "Country": countries,
    "Release_Year": movies_year,
    "Ratings": ratings,
    "IMDB Ratings": imdb_ratings,
    "Budgets": budgets,
    "Gross International": gross_international,
    "Movie Links": movies_link,
}

In [7]:
# Checking length of dicts
for i, v in movie_dict.items():
    print(len(v))

250
250
250
250
250
250
250
250
250
250
250
250
250


In [8]:
# importing and creating dataframe
import pandas as pd
df = pd.DataFrame(movie_dict)

In [9]:
# loading dataset
df.tail()

Unnamed: 0,Title,Directors,Casts,Star,Description,Duration,Country,Release_Year,Ratings,IMDB Ratings,Budgets,Gross International,Movie Links
245,It Happened One Night,Frank Capra,"Clark Gable, Claudette Colbert, Walter Connoll...","Clark Gable, Claudette Colbert, Walter Connolly",A renegade reporter trailing a young runaway h...,1h 45m,United States,1934,PG,8.1,"$325,000 (estimated)","$14,212",https://www.imdb.com/title/tt0025316/?ref_=cht...
246,The 400 Blows,François Truffaut,"Jean-Pierre Léaud, Albert Rémy, Claire Maurier...","Jean-Pierre Léaud, Albert Rémy, Claire Maurier","A young boy, left without attention, delves in...",1h 39m,France,1959,Not Rated,8.1,$509,"$171,268",https://www.imdb.com/title/tt0053198/?ref_=cht...
247,Gangs of Wasseypur,Anurag Kashyap,"Manoj Bajpayee, Nawazuddin Siddiqui, Tigmanshu...","Manoj Bajpayee, Nawazuddin Siddiqui, Tigmanshu...",A clash between Sultan and Shahid Khan leads t...,5h 21m,India,2012,14A,8.2,"₹184,000,000 (estimated)","$4,384,642",https://www.imdb.com/title/tt1954470/?ref_=cht...
248,Aladdin,"Ron Clements, John Musker","Scott Weinger, Robin Williams, Linda Larkin, J...","Scott Weinger, Robin Williams, Linda Larkin",A kind-hearted street urchin and a power-hungr...,1h 30m,United States,1992,G,8.0,"$28,000,000 (estimated)","$504,050,219",https://www.imdb.com/title/tt0103639/?ref_=cht...
249,3 Idiots,Rajkumar Hirani,"Aamir Khan, Madhavan, Mona Singh, Sharman Josh...","Aamir Khan, Madhavan, Mona Singh",Two friends are searching for their long lost ...,2h 50m,India,2009,PG,8.4,"₹550,000,000 (estimated)","$60,262,836",https://www.imdb.com/title/tt1187043/?ref_=cht...


In [10]:
# Creating csv file from dataframe
df.to_csv('imdb_top_250_movies.csv')