In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

BASE_URL = "https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,asc"
headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9"
}

title_list = []
year_list = []
duration_list = []
certificate_list = []
rating_list = []
votes_list = []
metascore_list = []
desc_list = []

seen_titles = set() 
params = {
    "groups": "top_1000",
    "count": 25,
    "sort": "user_rating,asc",
    "start": start
}
response = requests.get(BASE_URL, headers=headers, params=params)
soup = BeautifulSoup(response.text, "lxml")
movies = soup.find_all("div", class_="sc-fc35a1ef-1 lmHCrT dli-parent")

if not movies:
    # fallback to legacy markup for classic results
    movies = soup.find_all("div", class_="lister-item mode-advanced")
print(f"Page {page} starting at {start}: {len(movies)} movies.")

for item in movies:
    # Title extraction for new IMDb markup
    h3 = item.find("h3", class_="ipc-title__text ipc-title__text--reduced")
    title = h3.text.strip() if h3 else ""

    # SKIP THIS MOVIE IF ALREADY SEEN
    if not title or title in seen_titles:
        continue
    seen_titles.add(title)

    title_list.append(title)

    # All time/cert/date spans (new IMDb card: grab all as list)
    meta_spans = item.find_all("span", class_="sc-caa65599-7 eeMIpC dli-title-metadata-item")
    year, duration, certificate = "", "", ""
    for ms in meta_spans:
        text = ms.text.strip()
        if not year and re.search(r"\d{4}", text):
            year = re.search(r"\d{4}", text).group()
        elif not duration and ("h" in text or "m" in text):
            duration = text
        elif not certificate and len(text) <= 8:
            certificate = text
    year_list.append(year)
    duration_list.append(duration)
    certificate_list.append(certificate)

     # IMDb Rating
    rating_tag = item.find("span", class_="ipc-rating-star--rating")
    imdb_rating = rating_tag.text.strip() if rating_tag else ""
    rating_list.append(imdb_rating)

    # Number of Ratings
    votes_tag = item.find("span", class_="ipc-rating-star--voteCount")
    votes = votes_tag.text.replace(",", "") if votes_tag else ""
    votes_list.append(votes)

    # Metascore
    metascore_tag = item.find("span", class_="sc-9fe7b0ef-0 hDuMnh metacritic-score-box")
    metascore = metascore_tag.text.strip() if metascore_tag else ""
    metascore_list.append(metascore)

    # Description
    desc_tag = item.find("div", class_="ipc-html-content-inner")
    if desc_tag:
        description = desc_tag.text.strip()
    else:
        desc_tag = item.find("div", class_="ipc-html-content-inner-div")
        description = desc_tag.text.strip() if desc_tag else ""
    desc_list.append(description)

    # stop at 700 unique rows
    if len(title_list) >= 1000:
        break

time.sleep(1)

df = pd.DataFrame({
    "Title": title_list,
    "Year": year_list,
    "Duration": duration_list,
    "Certificate": certificate_list,
    "IMDb Rating": rating_list,
    "Number of Ratings": votes_list,
    "Meta score": metascore_list,
    "Description": desc_list
})
print(df.head())
print("Total unique rows:", len(df))
df.to_csv("imdb_top1000_700_unique.csv", index=False)


Page 40 starting at 976: 25 movies.
                                 Title  Year Duration Certificate IMDb Rating  \
0                        1. The Others  2001   1h 44m       PG-13         7.6   
1  2. Once Upon a Time... in Hollywood  2019   2h 41m           R         7.6   
2                               3. Saw  2004   1h 43m           R         7.6   
3                   4. Minority Report  2002   2h 25m       PG-13         7.6   
4                 5. The Fifth Element  1997    2h 6m       PG-13         7.6   

  Number of Ratings Meta score  \
0            (419K)         74   
1            (936K)         84   
2            (500K)         46   
3            (612K)         80   
4            (535K)         52   

                                         Description  
0  In 1945, immediately following the end of Seco...  
1  As Hollywood's Golden Age is winding down duri...  
2  Two men awaken to find themselves on the oppos...  
3  John works with the PreCrime police which stop...