In [19]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchWindowException, WebDriverException
from bs4 import BeautifulSoup
import time
import csv

# Studio label -> URL of the IMDb list that is scraped for that studio.
# The label is also used as the prefix of the per-studio CSV file name.
IMDB_studios = dict([
    ("marvel", "https://www.imdb.com/list/ls026690821/"),
    ("disney", "https://www.imdb.com/list/ls076436131/"),
    ("hbo", "https://www.imdb.com/list/ls008263268/"),
    ("universal", "https://www.imdb.com/list/ls029443456/"),
    ("sony", "https://www.imdb.com/list/ls051501563/"),
    ("20th Century Fox", "https://www.imdb.com/list/ls059930506/"),
    ("warner bros", "https://www.imdb.com/list/ls099821951/"),
    ("dreamworks", "https://www.imdb.com/list/ls068935612/"),
])

def extract_year_time_audience(div):
    """Pull the year, running time and audience rating out of a metadata div.

    Args:
        div: An element exposing ``find_all`` (a BeautifulSoup tag in this
            notebook) that wraps the per-movie metadata spans.

    Returns:
        A ``(year, time_str, audience_type)`` tuple of stripped strings taken
        from the first three matching spans, or ``('N/A', 'N/A', 'N/A')`` when
        fewer than three such spans are present.
    """
    metadata_spans = div.find_all('span', class_='sc-b189961a-8 hCbzGp dli-title-metadata-item')

    # Guard clause: with fewer than three spans no field is reported at all.
    if len(metadata_spans) < 3:
        return 'N/A', 'N/A', 'N/A'

    year, time_str, audience_type = [span.text.strip() for span in metadata_spans[:3]]
    return year, time_str, audience_type

# Scrape each studio's IMDb list page and write the parsed rows to
# '<studio>_imdb_movies.csv'.
#
# `driver` is bound before the `try` so that the `finally` clause can tell
# whether Chrome ever started; otherwise a failed start would surface as a
# NameError from `driver.quit()` and hide the real WebDriver error.
driver = None
try:
    driver = webdriver.Chrome()  # Initialize WebDriver

    for studio, url in IMDB_studios.items():
        driver.get(url)  # Navigate to the studio's list page

        # Fixed pause to give the page time to load; adjust as needed.
        time.sleep(5)

        # Parse whatever the browser has rendered so far.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # One flat list per field, collected across the whole page.
        # NOTE(review): the sc-* class names look auto-generated and may stop
        # matching when the site changes — confirm against the live page.
        movies_title = soup.find_all('h3', class_='ipc-title__text')
        movie_links = soup.find_all('a', class_='ipc-title-link-wrapper')
        movies_image_url = soup.find_all('img', class_='ipc-image')
        movies_year_time_audiType_divs = soup.find_all(class_='sc-b189961a-7 btCcOY dli-title-metadata')
        movies_rating = soup.find_all(class_='ipc-rating-star--rating')
        movies_voteCount = soup.find_all(class_='ipc-rating-star--voteCount')
        movies_metacritic_score = soup.find_all(class_='sc-b0901df4-0 bXIOoL metacritic-score-box')
        movies_content = soup.find_all(class_='ipc-html-content-inner-div')
        movies_director = soup.find_all(class_='ipc-link ipc-link--base dli-director-item')
        movies_stars = soup.find_all(class_='sc-74bf520e-5 eesgaX')

        # Prepare the data for CSV.
        # NOTE(review): zip() pairs the lists purely by position and stops at
        # the shortest one, so if any movie lacks one of these elements the
        # following rows are shifted and trailing movies are dropped.
        data = []
        for title, link, image_url, year_time_audiType_div, rating, voteCount, metacritic_score, content, director, stars in zip(
                movies_title, movie_links, movies_image_url, movies_year_time_audiType_divs, movies_rating,
                movies_voteCount, movies_metacritic_score, movies_content, movies_director, movies_stars):

            # Extract year, time, and audience type
            year, time_str, audience_type = extract_year_time_audience(year_time_audiType_div)

            # Only prefix the site root when the anchor really has an href;
            # otherwise the link column would read 'https://www.imdb.comN/A'.
            if 'href' in link.attrs:
                movie_link = f"https://www.imdb.com{link['href']}"
            else:
                movie_link = 'N/A'

            data.append([
                title.text.strip(),
                image_url['src'] if 'src' in image_url.attrs else 'N/A',
                year,
                time_str,
                audience_type,
                rating.text.strip(),
                voteCount.text.strip(),
                metacritic_score.text.strip() if metacritic_score else 'N/A',
                content.text.strip(),
                director.text.strip(),
                stars.text.strip(),
                movie_link
            ])

        # Write the data to CSV (one file per studio)
        csv_file_path = f'{studio}_imdb_movies.csv'

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Title', 'Image URL', 'Year', 'Time', 'Audience Type', 'Rating', 'Vote Count', 'Metacritic Score', 'Content', 'Director', 'Stars', 'Movie Link'])
            writer.writerows(data)

except NoSuchWindowException as e:
    print("Error: No such window. The window may have been closed.")
    print(e)

except WebDriverException as e:
    print("WebDriverException: An error occurred with the WebDriver.")
    print(e)

finally:
    # Close the WebDriver session, but only if one was actually created.
    if driver is not None:
        driver.quit()


Complete data in a single CSV

In [20]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchWindowException, WebDriverException
from bs4 import BeautifulSoup
import time
import csv

# Maps a studio label to the IMDb list URL scraped for it; the label is also
# interpolated into the per-studio CSV file name further down in this cell.
IMDB_studios = dict((
    ("marvel", "https://www.imdb.com/list/ls026690821/"),
    ("disney", "https://www.imdb.com/list/ls076436131/"),
    ("hbo", "https://www.imdb.com/list/ls008263268/"),
    ("universal", "https://www.imdb.com/list/ls029443456/"),
    ("sony", "https://www.imdb.com/list/ls051501563/"),
    ("20th Century Fox", "https://www.imdb.com/list/ls059930506/"),
    ("warner bros", "https://www.imdb.com/list/ls099821951/"),
    ("dreamworks", "https://www.imdb.com/list/ls068935612/"),
))

def extract_year_time_audience(div):
    """Return ``(year, time_str, audience_type)`` read from a metadata div.

    The first three spans matching the metadata-item class supply the three
    values, each stripped of surrounding whitespace. If fewer than three
    spans match, every field is reported as ``'N/A'``.
    """
    items = div.find_all('span', class_='sc-b189961a-8 hCbzGp dli-title-metadata-item')
    enough_items = len(items) >= 3

    # All-or-nothing: partial metadata is not returned.
    return (
        (items[0].text.strip(), items[1].text.strip(), items[2].text.strip())
        if enough_items
        else ('N/A', 'N/A', 'N/A')
    )

# Scrape each studio's IMDb list page, write one CSV per studio containing
# only that studio's rows, and finally write every row to one combined CSV.

# Column order shared by the per-studio files and the combined file.
csv_header = ['Title', 'Image URL', 'Year', 'Time', 'Audience Type', 'Rating', 'Vote Count', 'Metacritic Score', 'Content', 'Director', 'Stars', 'Movie Link']

# Bound before the `try` so `finally` can tell whether Chrome ever started;
# otherwise a failed start would end in a NameError from `driver.quit()`.
driver = None
try:
    driver = webdriver.Chrome()  # Initialize WebDriver

    all_data = []  # To accumulate data from all studios

    for studio, url in IMDB_studios.items():
        driver.get(url)  # Navigate to the studio's list page

        # Fixed pause to give the page time to load; adjust as needed.
        time.sleep(5)

        # Parse whatever the browser has rendered so far.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # One flat list per field, collected across the whole page.
        # NOTE(review): the sc-* class names look auto-generated and may stop
        # matching when the site changes — confirm against the live page.
        movies_title = soup.find_all('h3', class_='ipc-title__text')
        movie_links = soup.find_all('a', class_='ipc-title-link-wrapper')
        movies_image_url = soup.find_all('img', class_='ipc-image')
        movies_year_time_audiType_divs = soup.find_all(class_='sc-b189961a-7 btCcOY dli-title-metadata')
        movies_rating = soup.find_all(class_='ipc-rating-star--rating')
        movies_voteCount = soup.find_all(class_='ipc-rating-star--voteCount')
        movies_metacritic_score = soup.find_all(class_='sc-b0901df4-0 bXIOoL metacritic-score-box')
        movies_content = soup.find_all(class_='ipc-html-content-inner-div')
        movies_director = soup.find_all(class_='ipc-link ipc-link--base dli-director-item')
        movies_stars = soup.find_all(class_='sc-74bf520e-5 eesgaX')

        # Rows for THIS studio only. Previously the cumulative `all_data` list
        # was written to every per-studio file, so each file also contained
        # the rows of all studios scraped before it.
        # NOTE(review): zip() pairs the lists purely by position and stops at
        # the shortest one, so if any movie lacks one of these elements the
        # following rows are shifted and trailing movies are dropped.
        studio_data = []
        for title, link, image_url, year_time_audiType_div, rating, voteCount, metacritic_score, content, director, stars in zip(
                movies_title, movie_links, movies_image_url, movies_year_time_audiType_divs, movies_rating,
                movies_voteCount, movies_metacritic_score, movies_content, movies_director, movies_stars):

            # Extract year, time, and audience type
            year, time_str, audience_type = extract_year_time_audience(year_time_audiType_div)

            # Only prefix the site root when the anchor really has an href;
            # otherwise the link column would read 'https://www.imdb.comN/A'.
            if 'href' in link.attrs:
                movie_link = f"https://www.imdb.com{link['href']}"
            else:
                movie_link = 'N/A'

            studio_data.append([
                title.text.strip(),
                image_url['src'] if 'src' in image_url.attrs else 'N/A',
                year,
                time_str,
                audience_type,
                rating.text.strip(),
                voteCount.text.strip(),
                metacritic_score.text.strip() if metacritic_score else 'N/A',
                content.text.strip(),
                director.text.strip(),
                stars.text.strip(),
                movie_link
            ])

        # Write the data to CSV for each studio
        csv_file_path = f'{studio}_imdb_movies.csv'

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(csv_header)
            writer.writerows(studio_data)

        # Only now fold this studio's rows into the combined data set.
        all_data.extend(studio_data)

    # Write all accumulated data to a single CSV file
    complete_csv_file_path = 'all_imdb_movies.csv'

    with open(complete_csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(csv_header)
        writer.writerows(all_data)

except NoSuchWindowException as e:
    print("Error: No such window. The window may have been closed.")
    print(e)

except WebDriverException as e:
    print("WebDriverException: An error occurred with the WebDriver.")
    print(e)

finally:
    # Close the WebDriver session, but only if one was actually created.
    if driver is not None:
        driver.quit()


Box office data

In [41]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv
import time

# Setup WebDriver
# One Chrome session created at cell level: get_box_office() below reads this
# global, and the __main__ block at the end of the cell quits it.
driver = webdriver.Chrome()

def get_box_office(movie_url):
    """Return the worldwide gross shown on a movie page, or 'N/A'.

    Loads ``movie_url`` in the module-level ``driver``, waits two seconds,
    then looks for the value next to the "Gross worldwide" label. If that
    lookup raises ``NoSuchElementException``, a positional CSS selector is
    tried instead.

    Args:
        movie_url: URL of the movie page to open.

    Returns:
        The stripped text of the located element, or ``'N/A'`` if anything
        in the process raises (the error is printed, not re-raised).
    """
    label_xpath = '//span[text()="Gross worldwide"]/following-sibling::div//span'
    positional_css = "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.llaHBP > div > section > div > div.sc-978e9339-1.cfupNW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(49) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(4) > div > ul > li > span"

    try:
        driver.get(movie_url)
        time.sleep(2)  # Wait for the page to load

        try:
            # Preferred lookup: anchored on the visible label text.
            gross_element = driver.find_element(By.XPATH, label_xpath)
        except NoSuchElementException:
            # Fallback lookup: fixed position in the page layout.
            gross_element = driver.find_element(By.CSS_SELECTOR, positional_css)

        gross_text = gross_element.text.strip()
    except Exception as e:
        print(f"Error while scraping {movie_url}: {e}")
        gross_text = 'N/A'

    return gross_text

def create_box_office_csv(input_csv, output_csv):
    """Write a two-column CSV of movie titles and worldwide box office values.

    Every row of ``input_csv`` must provide ``'Title'`` and ``'Movie Link'``
    columns. The link is passed to ``get_box_office`` and the result is stored
    next to the title. All rows are collected first; ``output_csv`` is written
    only after the whole input has been processed.

    Args:
        input_csv: Path of the CSV to read (UTF-8, with a header row).
        output_csv: Path of the CSV to create or overwrite (UTF-8).
    """
    fieldnames = ['Title', 'Box Office Worldwide']  # Only title and box office
    collected = []

    with open(input_csv, mode='r', encoding='utf-8') as infile:
        for record in csv.DictReader(infile):
            gross = get_box_office(record['Movie Link'])
            # Title is copied through unchanged from the input file.
            collected.append(dict(zip(fieldnames, (record['Title'], gross))))

    with open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        out = csv.DictWriter(outfile, fieldnames=fieldnames)
        out.writeheader()
        out.writerows(collected)

if __name__ == "__main__":
    # Input: the combined movie list; output: title + worldwide gross.
    input_csv, output_csv = 'all_imdb_movies.csv', 'box_office_data.csv'

    try:
        create_box_office_csv(input_csv, output_csv)
    except WebDriverException as e:
        print(f"WebDriverException: An error occurred with the WebDriver.\n{e}")
    finally:
        # The browser session is closed whether or not the scrape succeeded.
        driver.quit()


Error while scraping https://www.imdb.com/title/tt0036326/?ref_=ls_t_6: Message: no such element: Unable to locate element: {"method":"css selector","selector":"#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.llaHBP > div > section > div > div.sc-978e9339-1.cfupNW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(49) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(4) > div > ul > li > span"}
  (Session info: chrome=127.0.6533.120); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100a41024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000100a39700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x000000010064882c cxxbridge1$string$len + 88524
3   chromedriver                        0x000000010068c834 cxxbridge1$string$len +

Merge the data

In [45]:
import pandas as pd

def read_csv_safe(file_path):
    """Load a comma-separated file into a DataFrame without raising.

    Malformed lines are skipped by pandas (``on_bad_lines='skip'``). Any
    exception raised while reading is printed and an empty DataFrame is
    returned instead, so callers should check ``.empty``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame, or an empty DataFrame on failure.
    """
    try:
        frame = pd.read_csv(file_path, delimiter=',', quotechar='"', on_bad_lines='skip')
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        frame = pd.DataFrame()  # Empty frame signals the failure to the caller
    return frame

# df1: the scraped movie list; df2: title + worldwide gross.
df1 = read_csv_safe('all_imdb_movies.csv')
df2 = read_csv_safe('box_office_data.csv')

if df1.empty or df2.empty:
    # read_csv_safe returns an empty frame on failure, so bail out here.
    print("One or both DataFrames are empty. Check the CSV files.")
else:
    # Right join: one output row per matching pair, driven by df2's titles.
    # NOTE(review): 'Title' may not be unique across the lists; duplicate
    # titles would multiply rows in the merge — confirm against the data.
    merged_df = df1.merge(df2, how='right', on='Title')
    merged_df.to_csv('all_imdb_movies_box_office_data.csv', index=False)


Add Studio Field

In [3]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv
import time

# Setup WebDriver
# One Chrome session created at cell level: get_box_office() and
# get_studio_name() below read this global, and the __main__ block quits it.
driver = webdriver.Chrome()

def get_box_office(movie_url):
    """Fetch the "Gross worldwide" figure from a movie page.

    The page is opened in the module-level ``driver`` and given two seconds
    to load. The value is located via the "Gross worldwide" label first and,
    if that raises ``NoSuchElementException``, via a positional CSS selector.

    Args:
        movie_url: URL of the movie page to open.

    Returns:
        The element text with surrounding whitespace removed, or ``'N/A'``
        when any step raises; the error is printed rather than propagated.
    """
    by_label = '//span[text()="Gross worldwide"]/following-sibling::div//span'
    by_position = "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.llaHBP > div > section > div > div.sc-978e9339-1.cfupNW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(49) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(4) > div > ul > li > span"

    try:
        driver.get(movie_url)
        time.sleep(2)  # Wait for the page to load

        try:
            found = driver.find_element(By.XPATH, by_label)
        except NoSuchElementException:
            # Label-based lookup failed; fall back to the fixed page position.
            found = driver.find_element(By.CSS_SELECTOR, by_position)

        result = found.text.strip()
    except Exception as e:
        print(f"Error while scraping {movie_url}: {e}")
        result = 'N/A'

    return result

def get_studio_name(movie_url):
    """Return the text of the studio entry on a movie page, or 'N/A'.

    Opens ``movie_url`` in the module-level ``driver``, waits two seconds,
    and reads an element addressed purely by its position in the page: an
    absolute XPath first, then a CSS selector if the XPath lookup raises
    ``NoSuchElementException``.

    Args:
        movie_url: URL of the movie page to open.

    Returns:
        The stripped element text, or ``'N/A'`` when any step raises; the
        error is printed rather than propagated.
    """
    # NOTE(review): both selectors depend on fixed section/list indexes, so
    # they presumably only match pages with exactly this layout — confirm.
    positional_xpath = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[11]/div[2]/ul/li[7]/div/ul'
    positional_css = "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.llaHBP > div > section > div > div.sc-978e9339-1.cfupNW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(43) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(7) > div > ul"

    try:
        driver.get(movie_url)
        time.sleep(2)  # Wait for the page to load

        try:
            studio_node = driver.find_element(By.XPATH, positional_xpath)
        except NoSuchElementException:
            studio_node = driver.find_element(By.CSS_SELECTOR, positional_css)

        studio_text = studio_node.text.strip()
    except Exception as e:
        print(f"Error while scraping {movie_url}: {e}")
        studio_text = 'N/A'

    return studio_text

def create_movie_data_csv(input_csv, output_csv):
    """Write a CSV of title, worldwide box office and studio name per movie.

    Every row of ``input_csv`` must provide ``'Title'`` and ``'Movie Link'``
    columns. For each row the link is passed to ``get_box_office`` and then
    to ``get_studio_name``. All rows are collected first; ``output_csv`` is
    written only after the whole input has been processed.

    Args:
        input_csv: Path of the CSV to read (UTF-8, with a header row).
        output_csv: Path of the CSV to create or overwrite (UTF-8).
    """
    fieldnames = ['Title', 'Box Office Worldwide', 'Studio Name']  # Added studio name
    collected = []

    with open(input_csv, mode='r', encoding='utf-8') as infile:
        for record in csv.DictReader(infile):
            link = record['Movie Link']
            # Box office is looked up before the studio name, each via its own call.
            gross = get_box_office(link)
            studio = get_studio_name(link)
            collected.append(dict(zip(fieldnames, (record['Title'], gross, studio))))

    with open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        out = csv.DictWriter(outfile, fieldnames=fieldnames)
        out.writeheader()
        out.writerows(collected)

if __name__ == "__main__":
    # Input: the combined movie list; output: title, gross and studio name.
    input_csv, output_csv = 'all_imdb_movies.csv', 'movie_data.csv'

    try:
        create_movie_data_csv(input_csv, output_csv)
    except WebDriverException as e:
        print(f"WebDriverException: An error occurred with the WebDriver.\n{e}")
    finally:
        # The browser session is closed whether or not the scrape succeeded.
        driver.quit()


Error while scraping https://www.imdb.com/title/tt0800080/?ref_=ls_t_2: Message: no such element: Unable to locate element: {"method":"css selector","selector":"#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.llaHBP > div > section > div > div.sc-978e9339-1.cfupNW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(43) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(7) > div > ul"}
  (Session info: chrome=127.0.6533.120); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x000000010100d024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000101005700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x0000000100c1482c cxxbridge1$string$len + 88524
3   chromedriver                        0x0000000100c58834 cxxbridge1$string$len + 367060
4   