<a href="https://colab.research.google.com/github/bnaveensagar1997/Guvi_mini_projects/blob/main/IMDB_Movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install selenium
!pip install webdriver-manager

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [11]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re

In [16]:
def setup_driver():
    """Create a Chrome WebDriver configured through command-line flags.

    The browser is started with ``--headless`` (no visible window),
    ``--no-sandbox`` and ``--disable-dev-shm-usage``.

    Returns:
        A ``selenium.webdriver.Chrome`` instance; the caller is responsible
        for calling ``quit()`` on it.
    """
    launch_flags = (
        "--headless",               # Run in headless mode
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    browser_options = Options()
    for flag in launch_flags:
        browser_options.add_argument(flag)

    # Hand the assembled options straight to the Chrome WebDriver
    return webdriver.Chrome(options=browser_options)


def _extract_storyline(driver, timeout=10):
    """Return the storyline text of the movie page currently loaded in ``driver``.

    Waits up to ``timeout`` seconds for either the "Storyline" section or the
    ``plot-summaries-content`` element to be present, then reads the text from
    whichever one can be found (the "Storyline" section is preferred).

    Args:
        driver: WebDriver already navigated to a movie page.
        timeout: Maximum number of seconds to wait for either section.

    Returns:
        The raw storyline text, or ``"No storyline available"`` when neither
        section can be located.
    """
    storyline_locator = (By.XPATH, "//h2[contains(text(), 'Storyline')]/following-sibling::div")
    summary_locator = (By.ID, "plot-summaries-content")

    try:
        # EC.any_of is required here: joining two condition objects with the
        # Python `or` operator just yields the first one (it is always truthy),
        # so the second locator would never be polled.
        WebDriverWait(driver, timeout).until(
            EC.any_of(
                EC.presence_of_element_located(storyline_locator),
                EC.presence_of_element_located(summary_locator),
            )
        )
        # Attempt to find the storyline in the main page
        return driver.find_element(*storyline_locator).text
    except (TimeoutException, NoSuchElementException):
        pass

    # If not found, try to find it in the plot summary section
    try:
        summary_container = driver.find_element(*summary_locator)
        return summary_container.find_element(By.CSS_SELECTOR, ".ipc-html-content-inner-div").text
    except (TimeoutException, NoSuchElementException):
        return "No storyline available"


def scrape_imdb_2024_movies(max_movies=100):
    """
    Scrape IMDB for 2024 movies including name and storyline.

    Loads the IMDB search page for 2024 feature films, collects up to
    ``max_movies`` (title, link) pairs from it, then visits each movie page to
    read its storyline, genres and release-year text. The collected rows are
    written to ``imdb_movies_2024_raw.csv`` in the working directory.

    Args:
        max_movies: Maximum number of movies to scrape

    Returns:
        DataFrame with columns ``title``, ``storyline``, ``genres``,
        ``release_year`` and ``imdb_link``. An empty DataFrame (no columns, no
        CSV written) is returned when the search page does not load in time.
    """
    print("Starting IMDB scraper for 2024 movies...")
    driver = setup_driver()
    movies_data = []

    # Everything that touches the browser lives inside try/finally so the
    # Chrome process is always shut down, even if a navigation call raises.
    try:
        # Navigate to IMDB 2024 movie list
        url = "https://www.imdb.com/search/title/?year=2024&title_type=feature&sort=moviemeter,asc"
        driver.get(url)

        # Wait (up to 30 seconds) for the results list to be present.
        # NOTE(review): the recorded run of this notebook timed out at this
        # step, which suggests ".lister-list" may no longer match IMDB's
        # markup - confirm the selector against the live page.
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".lister-list"))
            )
        except TimeoutException:
            print("Timeout while waiting for page to load. Check your internet connection or the website structure.")
            return pd.DataFrame()  # Empty DataFrame signals failure to the caller

        # First, gather all movie links from the search page. This must finish
        # before navigating away, because the elements belong to this page.
        movie_links = []
        movie_elements = driver.find_elements(By.CLASS_NAME, "lister-item-content")
        for element in movie_elements[:max_movies]:
            try:
                title_element = element.find_element(By.TAG_NAME, "a")
            except NoSuchElementException:
                continue  # Entry without a link: nothing to visit
            movie_links.append((title_element.text, title_element.get_attribute("href")))

        # Now visit each movie page to get the storyline
        for title, link in movie_links:
            try:
                driver.get(link)
                print(f"Scraping {title} from: {link}")

                storyline = _extract_storyline(driver)

                # find_elements returns an empty list when nothing matches, so
                # no exception handling is needed for the genre lookup.
                genre_elements = driver.find_elements(By.CSS_SELECTOR, 'a[href*="genres="]')
                genres = [genre.text for genre in genre_elements]

                # Release-year text is optional; keep "" when the link is absent.
                try:
                    release_year = driver.find_element(By.CSS_SELECTOR, 'a[href*="releaseinfo"]').text
                except NoSuchElementException:
                    release_year = ""

                # Collapse runs of whitespace in the storyline to single spaces
                storyline = re.sub(r'\s+', ' ', storyline).strip()

                movies_data.append({
                    'title': title,
                    'storyline': storyline,
                    'genres': ', '.join(genres),
                    'release_year': release_year,
                    'imdb_link': link
                })

                print(f"Scraped: {title}")

                # Be nice to IMDB's servers
                time.sleep(1)

            except Exception as e:
                # Best effort: report the failure and carry on with the next movie.
                print(f"Error scraping {title}: {str(e)}")
    finally:
        driver.quit()

    # Convert to DataFrame
    df = pd.DataFrame(movies_data)

    # Save raw data to CSV
    df.to_csv('imdb_movies_2024_raw.csv', index=False)
    print(f"Scraping completed. Found {len(df)} movies.")

    return df

if __name__ == "__main__":
    # Run the scraper (capped at 50 movies) and keep the result at module
    # level so later cells can use it.
    movies_df = scrape_imdb_2024_movies(max_movies=50)  # Adjust as needed

    # Only index into the columns when there is data: an empty frame has no
    # 'title' / 'storyline' columns to select.
    if not movies_df.empty:
        print("\nSample of scraped data:")
        print(movies_df[['title', 'storyline']].head())
    else:
        print("The DataFrame is empty. No data was scraped.")


# Show the first rows of 'movies_df' as this cell's output:
movies_df.head()

Starting IMDB scraper for 2024 movies...
Timeout while waiting for page to load. Check your internet connection or the website structure.
The DataFrame is empty. No data was scraped.


In [18]:
# Keep the first five rows of the scraped frame under `df` and echo them as text.
df = movies_df.head(5)
print(df.head(5))

Empty DataFrame
Columns: []
Index: []


In [None]:
from selenium import webdriver