Notes:

1. If you have not already done so, install selenium using `pip install selenium webdriver-manager`
2. If you have not already done so, install tqdm using `pip install tqdm`
3. Instead of storing strings & numbers, or generic objects, it is wiser to store typed objects; two good choices are:
    1. __[namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple)__
    2. __[dataclass](https://docs.python.org/3/library/dataclasses.html)__

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
from typing import Optional, List
import re
import time
import pandas as pd

# IMDb title search: TV movies and features, released during 2024,
# with Spain (ES) as country of origin.
url = (
    'https://www.imdb.com/search/title/'
    '?title_type=tv_movie,feature'
    '&release_date=2024-01-01,2024-12-31'
    '&country_of_origin=ES'
)

@dataclass
class Director:
    """A director credit: the person's name and the URL of their link."""
    name: str
    url: str

@dataclass
class Thespian:
    """A cast credit: the performer's name and the URL of their link."""
    name: str
    url: str
@dataclass
class MovieInfo:
    """One movie from the search results, plus its credits.

    ``title`` and ``url`` are required; everything else is optional and
    defaults to "unknown" (``None``) or "no credits yet" (empty list).

    NOTE(review): the scraper in this file currently passes the raw element
    text for ``imdbRating``, ``imdbVotes`` and ``metascore`` rather than
    numbers - confirm which of the two is intended.
    """
    title: str
    url: str
    imdbRating: Optional[float] = None
    imdbVotes: Optional[int] = None
    metascore: Optional[int] = None
    # These hold Director / Thespian objects, not plain strings. The names are
    # quoted so the annotations stay valid regardless of definition order.
    directors: List["Director"] = field(default_factory=list)
    thespians: List["Thespian"] = field(default_factory=list)

def scrapeIMDbMoviesWithSlidingWindow(someURL):
    """Collect every movie listed on an IMDb search-results page.

    The page is driven through Selenium/Chrome. Each pass parses the list
    items currently in the DOM, deletes them so the page does not keep
    growing, and clicks the "see more" button to load the next batch. The
    loop ends when that button can no longer be found, scrolled to or
    clicked, or when the next batch fails to appear.

    Args:
        someURL: URL of the search page to open.

    Returns:
        A list of MovieInfo objects in document order, each with empty
        ``directors`` and ``thespians`` lists.

    Raises:
        selenium.common.exceptions.TimeoutException: if no list items are
            present within 10 seconds at the start of a pass.
    """
    driver = webdriver.Chrome()
    moviesList = []

    # try/finally guarantees the browser is closed even when loading the page
    # or the initial wait raises.
    try:
        driver.get(someURL)

        while True:
            # Wait until the current batch is present in the DOM.
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "ipc-metadata-list-summary-item"))
            )
            time.sleep(0.5)  # slight buffer to ensure render

            # Parse whatever is currently in the list.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            movieItems = soup.select("ul.ipc-metadata-list > li")

            for li in movieItems:
                try:
                    titleBlock = li.select_one("div.dli-parent h3")
                    if not titleBlock:
                        continue
                    title = titleBlock.text.strip()
                    movieURL = "https://www.imdb.com" + li.select_one("a")["href"]
                    movieURL = movieURL.split('?')[0]  # drop tracking query string

                    imdbRating = li.select_one("span.ipc-rating-star--rating")
                    voteCount = li.select_one("span.ipc-rating-star--voteCount")
                    metascore = li.select_one("span.metacritic-score-box")

                    # NOTE(review): the three scores are stored as the raw
                    # element text, not converted to numbers.
                    moviesList.append(MovieInfo(
                        title=title,
                        url=movieURL,
                        imdbRating=imdbRating.text if imdbRating else None,
                        imdbVotes=voteCount.text if voteCount else None,
                        metascore=metascore.text if metascore else None,
                    ))
                except Exception as e:
                    # One malformed item must not abort the whole batch.
                    print(f"Parse error: {e}")

            # Remove as many <li> elements as were just parsed, so a batch
            # larger than 50 is not parsed again on the next pass.
            driver.execute_script("""
                const ul = document.querySelector("ul.ipc-metadata-list");
                if (!ul) { return; }
                const lis = ul.querySelectorAll("li");
                for (let i = 0; i < arguments[0] && i < lis.length; i++) {
                    lis[i].remove();
                }
            """, len(movieItems))

            time.sleep(0.5)  # Allow DOM to settle after deletion

            # Locate the "more" button; its label carries the next batch size.
            try:
                button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button"))
                )
                buttonText = driver.execute_script("return arguments[0].innerText;", button)
            except Exception as e:
                # The recorded run of this notebook ended here with a
                # TimeoutException once no button was left.
                print("🛑 button retrieval failure")
                print(repr(e))
                break

            try:
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            except Exception as e:
                print("🛑 scroll failure")
                print(repr(e))
                break

            time.sleep(0.5)

            try:
                # Click through JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", button)
            except Exception as e:
                print("🛑 Clickety failure")
                print(repr(e))
                break

            # First number in the button label, or 50 when there is none.
            match = re.search(r"(\d+)", buttonText or "")
            pageBatchSize = int(match.group(1)) if match else 50

            # Wait for the announced number of new items to load.
            try:
                WebDriverWait(driver, 10).until(
                    lambda d: len(d.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")) >= pageBatchSize
                )
            except Exception as e:
                print("🛑 Lambda failure in WebDriverWait")
                print(repr(e))
                break
    finally:
        driver.quit()

    return moviesList

# Run the scraper on the search URL; this starts a Chrome session.
movies = scrapeIMDbMoviesWithSlidingWindow(url)

🛑 button retrieval failure
TimeoutException()


In [9]:
def scrapeMovieCredits(movieURL):
    """Fetch a movie's full-credits page and extract directors and top cast.

    Args:
        movieURL: URL of the movie's title page, with or without a trailing
            slash.

    Returns:
        A ``(directors, thespians)`` tuple: a list of Director objects and a
        list of at most five Thespian objects. Either list is empty when the
        corresponding section is not found in the page.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an error status.
    """
    # Tolerate a missing or present trailing slash on the title URL.
    fullCreditsURL = movieURL.rstrip('/') + "/fullcredits/"

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(fullCreditsURL, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # === Get all directors ===
    directors = []
    try:
        directorHeader = soup.find('h4', string=re.compile('Directed by'))
        directorTable = directorHeader.find_next_sibling('table') if directorHeader else None
        if directorTable:
            for a in directorTable.find_all('a'):
                directorName = a.text.strip()
                href = a.get('href')
                # Skip anchors without text or target instead of failing.
                if not directorName or not href:
                    continue
                directorURL = ("https://www.imdb.com" + href).split('?')[0]
                directors.append(Director(directorName, directorURL))
    except Exception as e:
        print(f"Error extracting directors for {movieURL}: {e}")

    # === Get cast (top 5 billed actors) ===
    thespians = []
    try:
        castTable = soup.find('table', class_='cast_list')
        if castTable:
            rows = castTable.find_all('tr', class_=lambda x: x != 'castlist_label')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 2:
                    actorName = cols[1].text.strip()
                    link = cols[1].find('a')
                    href = link.get('href') if link else None
                    # A row without a usable link is skipped, so it no longer
                    # aborts the rest of the cast.
                    if actorName and href:
                        actorURL = ("https://www.imdb.com" + href).split('?')[0]
                        thespians.append(Thespian(actorName, actorURL))
                if len(thespians) >= 5:
                    break
    except Exception as e:
        print(f"Error extracting cast for {movieURL}:")
        print(repr(e))

    return directors, thespians

In [10]:
from tqdm import tqdm
import requests

# Attach credits to every scraped movie, one HTTP request per movie,
# with a progress bar. An exception from scrapeMovieCredits stops the loop.
for movie in tqdm(movies):
    directors, thespians = scrapeMovieCredits(movie.url)
    movie.directors = directors
    movie.thespians = thespians
    time.sleep(0.5)  # Respect IMDb

100%|██████████| 473/473 [14:07<00:00,  1.79s/it]


In [11]:
# Number of movies collected by the scraper.
print(len(movies))

473


In [12]:
# Dump the full list of MovieInfo objects (dataclass reprs).
print(movies)



In [49]:
# One row per MovieInfo, one column per field; the nested credit objects
# appear as dicts in the displayed table.
movies_df = pd.DataFrame(movies)

In [50]:
# Row count of the DataFrame.
len(movies_df)

473

In [51]:
# Display the DataFrame as the cell output.
movies_df

Unnamed: 0,title,url,imdbRating,imdbVotes,metascore,directors,thespians
0,1. The Penguin Lessons,https://www.imdb.com/title/tt26677014/,73,(655),58,[],[]
1,2. La fiebre de los ricos,https://www.imdb.com/title/tt17677434/,55,"(5,4 mil)",,[],[]
2,3. El juego del asesino,https://www.imdb.com/title/tt0327785/,58,(17 mil),36,"[{'name': 'J.J. Perry', 'url': 'https://www.im...","[{'name': 'Dave Bautista', 'url': 'https://www..."
3,4. La habitación de al lado,https://www.imdb.com/title/tt29439114/,68,(19 mil),70,[],[]
4,5. Hechizados,https://www.imdb.com/title/tt7215232/,56,"(7,1 mil)",54,"[{'name': 'Vicky Jenson', 'url': 'https://www....","[{'name': 'Rachel Zegler', 'url': 'https://www..."
...,...,...,...,...,...,...,...
468,469. Juan Espino: El mejor luchador de todos l...,https://www.imdb.com/title/tt32991422/,,,,[],[]
469,470. Athletes to Watch - Paris 2024,https://www.imdb.com/title/tt32992679/,,,,[],[]
470,471. Película Nº1,https://www.imdb.com/title/tt31124592/,,,,[],[]
471,472. El eco de otras voces,https://www.imdb.com/title/tt36386095/,,,,"[{'name': 'Adriana Domínguez', 'url': 'https:/...","[{'name': 'Adolfo Domínguez', 'url': 'https://..."


In [52]:
# Write the table to movies.xlsx, leaving out the index column.
movies_df.to_excel("movies.xlsx", index=False)