In [3]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Destination folder for the per-genre CSV files; created if missing
output_dir = "scraping/genre_csvs"
os.makedirs(output_dir, exist_ok=True)

# Launch the Chrome session that Selenium will drive
driver = webdriver.Chrome()

# Genres whose search-result pages will be scraped
genres = [
    'action',
    'adventure',
    'animation',
    'biography',
]

# IMDb title-search URL template (feature titles, release dates within 2024);
# the trailing "{}" placeholder is filled with a genre name via str.format
base_url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=2024-01-01,2024-12-31"
    "&genres={}"
)

# Click 'Load More' once; the caller loops until this reports the button is unavailable
def click_load_more(pause_seconds=3):
    """Click the results page's 'Load More' button once.

    Uses the module-level ``driver`` to locate the button, scroll it into
    view and click it.

    Args:
        pause_seconds: Seconds to sleep after a successful click so the next
            batch of results has time to render. Defaults to 3.

    Returns:
        True if the button was found and clicked; False if Selenium could not
        locate or interact with it, which the caller treats as "nothing left
        to load".
    """
    try:
        load_more_button = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button/span/span')
        # Hover first so the button is scrolled into the viewport before the click
        ActionChains(driver).move_to_element(load_more_button).perform()
        load_more_button.click()
    except WebDriverException:
        # Only Selenium failures (element missing, stale, click intercepted, ...)
        # mean "button unavailable". A bare `except:` would also swallow
        # KeyboardInterrupt and programming errors, silently ending pagination.
        return False
    time.sleep(pause_seconds)
    return True

# Loop through genres and scrape data
# Column order of every per-genre CSV
columns = ['Title', 'Rating', 'Votes', 'Duration', 'Genre']

try:
    for genre in genres:
        print(f"\n🎬 Starting genre: {genre}")
        url = base_url.format(genre)
        driver.get(url)
        time.sleep(4)  # give the initial result list time to render

        # Keep clicking Load More until it's unavailable
        while click_load_more():
            print("🔁 Clicked 'Load More'...")

        print("✅ Finished loading all movies for", genre)

        # Locate all movie items
        movie_items = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li')

        # One dict per successfully scraped movie
        records = []
        for movie_item in movie_items:
            try:
                title = movie_item.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[1]/a/h3').text
                rating = movie_item.find_element(By.XPATH, './div/div/div/div[1]/div[2]/span/div/span/span[1]').text
                voting = movie_item.find_element(By.XPATH, './div/div/div/div[1]/div[2]/span/div/span/span[2]').text
                duration = movie_item.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[2]/span[2]').text
            except Exception as e:
                # Best effort: an item missing any of the four fields is
                # reported and skipped, so the CSV only holds complete rows.
                print(f"⚠️ Error extracting a movie item: {e}")
                continue

            records.append({
                'Title': title,
                'Rating': rating,
                'Votes': voting,
                'Duration': duration,
                'Genre': genre.capitalize(),
            })

        # Explicit columns keep the CSV header even when no rows were scraped
        df = pd.DataFrame(records, columns=columns)

        # Save the DataFrame to CSV
        csv_path = os.path.join(output_dir, f"{genre}_movies_2024.csv")
        df.to_csv(csv_path, index=False)
        print(f"💾 Saved {len(df)} records to {csv_path}")
finally:
    # Always close the browser, even if a genre fails part-way through;
    # otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

print("🚀 Scraping complete for all genres!")



🎬 Starting genre: action
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
🔁 Clicked 'Load More'...
✅ Finished loading all movies for action
⚠️ Error extracting a movie item: Message: no such element: Unable to locate element: {"method":"xpath","selector":"./div/div/div/div[1]/div[2]/div[2]/span[2]"}
  (Session info: chrome=137.0.7151.104); For documentation on this error, please visit: https://w

In [4]:
import pandas as pd
import glob

# Path to the folder containing all genre CSVs
csv_files = glob.glob("scraping/genre_csvs/*.csv")

# Read and concatenate all CSV files
merged_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Save the combined dataset
merged_df.to_csv("scraping/imdb_2024_all_movies.csv", index=False)
print(f"📦 Combined dataset saved with {len(merged_df)} records.")


📦 Combined dataset saved with 1781 records.
