In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:


BASE_URL = "https://www.imdb.com/search/title/?title_type=feature&sort=num_votes,desc&start={}&ref_=adv_nxt"

movies_data = []

def scrape_movies(pages=5):  # pages = number of 50-item pages to scrape
    for page in range(1, pages * 50, 50):
        print(f"Scraping page starting at movie {page}...")
        url = BASE_URL.format(page)
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        movie_containers = soup.find_all("div", class_="lister-item mode-advanced")

        for movie in movie_containers:
            title = movie.h3.a.text if movie.h3.a else None
            year = movie.h3.find("span", class_="lister-item-year").text if movie.h3.find("span", class_="lister-item-year") else None
            genre = movie.find("span", class_="genre").text.strip() if movie.find("span", class_="genre") else None
            rating = movie.find("div", class_="inline-block ratings-imdb-rating").strong.text if movie.find("div", class_="inline-block ratings-imdb-rating") and movie.find("div", class_="inline-block ratings-imdb-rating").strong else None
            summary = movie.find_all("p", class_="text-muted")
            description = summary[1].text.strip() if len(summary) > 1 else None

            movies_data.append({
                "Title": title,
                "Year": year,
                "Genre": genre,
                "Rating": rating,
                "Description": description
            })

        time.sleep(1)  # be polite and avoid overloading the server

scrape_movies(pages=20)  # 20 pages → 1,000 movies

# Save to CSV
df = pd.DataFrame(movies_data)
df.to_csv("../scraped_data/imdb_movies.csv", index=False, encoding="utf-8")

print(f"Scraped {len(movies_data)} movies and saved to imdb_movies.csv")


Scraping page starting at movie 1...
Scraping page starting at movie 51...
Scraping page starting at movie 101...
Scraping page starting at movie 151...
Scraping page starting at movie 201...
Scraping page starting at movie 251...
Scraping page starting at movie 301...
Scraping page starting at movie 351...
Scraping page starting at movie 401...
Scraping page starting at movie 451...
Scraping page starting at movie 501...
Scraping page starting at movie 551...
Scraping page starting at movie 601...
Scraping page starting at movie 651...
Scraping page starting at movie 701...
Scraping page starting at movie 751...
Scraping page starting at movie 801...
Scraping page starting at movie 851...
Scraping page starting at movie 901...
Scraping page starting at movie 951...
Scraped 0 movies and saved to imdb_movies.csv


In [None]:
# enrichment script for IMDb Top 250 movies
# adding more details like summary, director, runtime, and cast

BASE_CHART_URL = "https://www.imdb.com/chart/top/"
BASE_TITLE_URL = "https://www.imdb.com"

headers = {"User-Agent": "Mozilla/5.0"}

def parse_movie_detail(url):
    """Fetch and parse details from a movie's detail page."""
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    # Extract summary
    summary = soup.find("span", {"data-testid": "plot-l"}).text.strip() \
        if soup.find("span", {"data-testid": "plot-l"}) else None
    
    # Extract director(s)
    director = soup.find("a", {"data-testid": "title-pc-principal-credit"}).text.strip() \
        if soup.find("a", {"data-testid": "title-pc-principal-credit"}) else None
    
    # Extract runtime
    runtime = soup.find("li", {"data-testid": "title-techspec_runtime"})
    runtime = runtime.text.strip() if runtime else None
    
    # Extract cast (first 3)
    cast_list = [a.text for a in soup.select("a[data-testid*='title-cast-item__actor']")][:3]
    cast = ", ".join(cast_list)
    
    return summary, director, runtime, cast

def scrape_top_250():
    print("Fetching Top 250 list...")
    resp = requests.get(BASE_CHART_URL, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    rows = soup.select("tbody.lister-list tr")
    movies = []
    
    for row in rows:
        title = row.find("td", class_="titleColumn").a.text
        year = row.find("td", class_="titleColumn").span.text.strip("()")
        rating = row.find("td", class_="ratingColumn imdbRating").strong.text
        
        link = BASE_TITLE_URL + row.find("td", class_="titleColumn").a["href"].split('?')[0]
        summary, director, runtime, cast = parse_movie_detail(link)
        print(f"Scraped: {title} ({year})")
        
        movies.append({
            "title": title,
            "year": year,
            "rating": rating,
            "summary": summary,
            "director": director,
            "runtime": runtime,
            "cast": cast
        })
        time.sleep(1)  # polite delay
    
    return movies

if __name__ == "__main__":
    data = scrape_top_250()
    df = pd.DataFrame(data)
    df.to_csv("../scraped_data/imdb_top250_enriched.csv", index=False)
    print("Data saved to imdb_top250_enriched.csv")


Fetching Top 250 list...
Data saved to imdb_top250_enriched.csv
