In [16]:
# PSSSSS! READ ME! 
# purpose of this code: given a movie list (provided in csv format), this program extracts detailed information about
# the listed movies from two websites using Selenium. The output of the program is a new csv file with the movie details.
# websites scraped:
    # https://www.imdb.com/
    # https://www.rottentomatoes.com/
# output --> csv file with the following info:
    # IMDB id (this must be provided as an input)
    # IMDB score
    # Rotten Tomatoes score 
    # Director name
    # Screenwriter name
    # Plot
    # Duration (in minutes)
    # Name of the film as it is in IMDB website

# ⚠️⚠️ important for the code execution
    # about the csv inputs:
        # must contain the following columns:
            # id_imdb (this is mandatory for the program to work)
            # type
            # name
            # release_year
            # release_month
            # genre
        # must be located in this path (relative to the folder the notebook runs from): data/api
        # file name format must be: 'list_{genre}_range_{start_page}_{end_page}_API.csv'
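    # BEFORE RUNNING THE CODE:
        # set `genre` and the page range in the CODE EXECUTION cell so they match the input file names
        # make sure the output folder data/selenium_movies exists (the result csv files are written there)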


# code structure:
    # IMPORTS
    # FUNCTION DEFINITION
        # process input file
        # open url
        # accept cookies
        # search movies
        # retrieve movie details
    # CODE EXECUTION

In [8]:
# IMPORTS

# import libraries for data processing
# -----------------------------------------------------------------------
import pandas as pd

# Import libraries for web browser automation with Selenium
# -----------------------------------------------------------------------
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager # ChromeDriverManager manages the installation of the Chrome driver
from selenium.webdriver.common.keys import Keys # to simulate keyboard events in Selenium
from selenium.webdriver.support.ui import Select # to interact with <select> elements on web pages

# import libraries to pause execution
# -----------------------------------------------------------------------
from time import sleep  # Sleep is used to pause the execution of the program for a number of seconds

# settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None)  # Sets a Pandas option to display all columns of a DataFrame


In [13]:
# FUNCTIONS DEFINITION

# process input file > transform csv into a list of tuples
def process_input(genre, start_page, end_page, data_dir="data/api"):
    """Load one movie-list csv and return its rows as a list of plain tuples.

    The file read is
    ``{data_dir}/list_{genre}_range_{start_page}_{end_page}_API.csv``.
    Its first column becomes the DataFrame index, and that index value is
    emitted as element 0 of every tuple, followed by the remaining columns in
    file order.

    Args:
        genre: genre fragment of the file name.
        start_page: first page number encoded in the file name.
        end_page: last page number encoded in the file name.
        data_dir: folder holding the input csv files. Defaults to
            ``"data/api"``, the location that used to be hard-coded.

    Returns:
        list[tuple]: one tuple per csv row, ``(index_value, col_1, col_2, ...)``.

    Raises:
        FileNotFoundError: raised by pandas when the csv file does not exist.
    """
    csv_path = f"{data_dir}/list_{genre}_range_{start_page}_{end_page}_API.csv"

    # open csv file with pandas, using the first column as the index
    df = pd.read_csv(csv_path, index_col=0)

    # index=True keeps the index value as the first element of each tuple;
    # name=None yields regular tuples instead of namedtuples
    return list(df.itertuples(index=True, name=None))


# open url function
def open_url(url, driver, nap_time=3):
    """Maximize the browser window, load ``url`` and wait for the page to settle.

    Args:
        url: address to navigate to.
        driver: browser driver object; only ``maximize_window()`` and
            ``get(url)`` are called on it.
        nap_time: seconds to sleep after navigating. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        None
    """
    # maximizing is best-effort: a failure is reported but never stops the run.
    # `Exception` (not a bare except) so Ctrl+C / kernel interrupt still works.
    try:
        driver.maximize_window()
        print("\nMaximizing window 🪟...")
    except Exception:
        print("\nNot able to maximize window 🪟...")

    # announce each blocking step before it starts so the log reflects
    # what the program is currently doing
    print(f"\nNavigating to ⛵ {url}...")
    driver.get(url)

    # pause to let the website load
    print(f"\nTaking a quick nap 😴... Only {nap_time} seconds though!")
    sleep(nap_time)


# accept cookies function
def accept_cookies(locator, driver=None):
    """Click the cookie-consent button identified by ``locator``, if present.

    This is best-effort: when the button cannot be found or clicked the
    function reports it and returns normally.

    Args:
        locator: two-item sequence ``[strategy, value]`` passed to
            ``find_element`` (e.g. ``["css selector", "#accept"]``).
        driver: browser driver to use. When omitted, the notebook-level
            ``driver`` variable is used, which keeps existing one-argument
            calls working.

    Returns:
        None
    """
    # explicit argument wins; otherwise fall back to the global the notebook creates
    browser = driver if driver is not None else globals().get("driver")

    try:
        browser.find_element(locator[0], locator[1]).click()
        print("\nCookies accepted 🍪...")
    except Exception:
        # `Exception` instead of a bare except so a kernel interrupt is not swallowed
        print("\nCookies not found ❌...")


# search movies function
def search_movies_imdb (movies_list, search_bar_locator):
    """Search every movie on IMDb by id and collect its scraped details.

    Uses the notebook-level ``driver`` and calls ``retrive_movie_data`` for
    each movie whose id could be typed into the search bar.

    Args:
        movies_list: sequence of movie tuples; element 0 is read as the IMDb id
            and element 2 as the movie name.
        search_bar_locator: two-item sequence ``[strategy, value]`` locating
            the IMDb search input.

    Returns:
        list[list]: exactly one row per entry of ``movies_list``, in the same
        order. When the search bar could not be used for a movie, its row is
        ``[movie_id, "none", "none", "none", "none", "none"]`` so that callers
        can still pair rows with ``movies_list`` by position.
    """
    all_movies_data = [] # one row of collected info per movie

    for i, movie in enumerate(movies_list):
        movie_id = movie[0]
        movie_name = movie[2]
        print (f"\n<< Iteration {i} >>\nSearching for movie 🔍 '{movie_name}', id '{movie_id}'...")

        try:
            driver.find_element(search_bar_locator[0],search_bar_locator[1]).send_keys(movie_id, Keys.ENTER)
            print(f"'{movie_name}' found! ✅")
        except Exception:
            try:
                # this retry fixes the cases where a previous individual movie page threw a 404 Error.
                # In these cases the search bar is not reachable. Navigating back solves the issue
                driver.back()
                driver.find_element(search_bar_locator[0],search_bar_locator[1]).send_keys(movie_id, Keys.ENTER)
                print(f"'{movie_name}' found! ✅")
            except Exception:
                print (f"Couldn't find movie '{movie_name}', id '{movie_id}' ❌")
                # keep a placeholder instead of dropping the movie: the row layout
                # mirrors what retrive_movie_data returns (id + five fields)
                all_movies_data.append([movie_id] + ["none"] * 5)
                continue

        # NOTE(review): there is no wait between submitting the search and scraping;
        # confirm the title page is loaded by the time retrive_movie_data runs.
        all_movies_data.append(retrive_movie_data(movie_id))

    print("Seach completed ✅")
    return all_movies_data


# retrieve movie details function
def retrive_movie_data(movie_id, driver=None):
    """Scrape the IMDb title page currently open in the browser.

    Every field is best-effort: a field that cannot be located is stored as
    the string ``"none"``.

    Args:
        movie_id: IMDb id; copied unchanged into the first position of the result.
        driver: browser driver to use. When omitted, the notebook-level
            ``driver`` variable is used, which keeps existing one-argument
            calls working.

    Returns:
        list: ``[movie_id, imdb_score, director, screenwriter,
        duration_in_minutes, imdb_name]``. ``imdb_score`` is the text shown on
        the page; ``duration_in_minutes`` is an ``int``, or ``"none"`` when the
        duration is missing or cannot be parsed.
    """
    # explicit argument wins; otherwise fall back to the global the notebook creates
    browser = driver if driver is not None else globals().get("driver")

    movie_data = [movie_id] # the movie_id (imdb_id) is always the first element
    print(f"\nmovie_data list after appending 'movie_id' {movie_data}\n")
    print ("\nExtracting data👽...")

    # NOTE(review): these locators embed generated-looking class names
    # (e.g. 'sc-491663c0-4.gEsAEH'); presumably they break whenever the site
    # markup changes - the run recorded in this notebook found none of the fields.
    imdb_score_locator = ["xpath", '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]/span[1]']
    director_locator = ["css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(1) > div > ul > li > a"]
    screenwriters_locator = ["css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(2) > div > ul > li:nth-child(1) > a"]
    duration_locator = ["css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > ul > li:nth-child(2)"]
    imdb_name_locator = ["css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > h1 > span"]

    # imdb score
    imdb_score = _element_text_or_none(browser, imdb_score_locator, " - IMDB rating ⭐ found✅", " - IMDB rating not found❌")
    movie_data.append(imdb_score)
    print(f"\nmovie_data list after appending 'imdb_score' {movie_data}\n")

    # director
    director = _element_text_or_none(browser, director_locator, " - Director 🎬 found✅", " - Director not found ❌")
    movie_data.append(director)
    print(f"\nmovie_data list after appending 'director' {movie_data}\n")

    # screenwriters (only the first listed writer is read)
    screenwriters = _element_text_or_none(browser, screenwriters_locator, " - Screenwriters ✍️ found✅", " - Screenwriters not found ❌")
    movie_data.append(screenwriters)
    print(f"\nmovie_data list after appending 'screenwriters' {movie_data}\n")

    # duration: scraped as text (e.g. '1h 44min'), stored as an int number of minutes
    duration = _element_text_or_none(browser, duration_locator, " - Duration ⏱️ found✅", " - Duration not found ❌")
    duration_in_minutes = "none"
    if duration != "none":
        parsed_minutes = _duration_to_minutes(duration)
        if parsed_minutes is None:
            # the located element held something other than a runtime
            print (f"  -> Could not read a duration from '{duration}' ❌")
        else:
            duration_in_minutes = parsed_minutes
            print (f"  -> Movie duration transformed from {duration} into {duration_in_minutes}, now we have duration in {type(duration_in_minutes)}! 🎉")

    movie_data.append(duration_in_minutes)
    print(f"\nmovie_data list after appending 'duration' {movie_data}\n")

    # imdb_name
    imdb_name = _element_text_or_none(browser, imdb_name_locator, " - IMDB name 🅰️ found✅", " - IMDB name not found❌")
    movie_data.append (imdb_name)
    print(f"\nmovie_data list after appending 'imdb_name' {movie_data}\n")

    return movie_data


def _element_text_or_none(browser, locator, found_message, missing_message):
    """Return the text of the element at ``locator``, or "none" if it cannot be read."""
    try:
        text = browser.find_element(locator[0], locator[1]).text
    except Exception:
        # `Exception` instead of a bare except so a kernel interrupt is not swallowed
        print (missing_message)
        return "none"
    print (found_message)
    return text


def _duration_to_minutes(duration_text):
    """Convert a runtime label such as '1h 44min', '2h' or '95min' to minutes; None if unparseable."""
    import re  # local import: the notebook's import cell does not provide `re`

    hours_match = re.search(r"(\d+)\s*h", duration_text, re.IGNORECASE)
    # matches both the 'min' and the shorter 'm' suffix, with any number of digits
    minutes_match = re.search(r"(\d+)\s*m", duration_text, re.IGNORECASE)

    if hours_match is None and minutes_match is None:
        return None

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0
    return hours * 60 + minutes



In [14]:
# CODE EXECUTION ⬇️⬇️⬇️ 

# Summary:
    # 1) open browser
    # 2) process input file
    # 3) open url (imdb website)
    # 4) accept cookies 
    # 5) search movies and retrieve movies details
    # 6) open url (rotten tomatoes)
    # 7) accept cookies
    # 8) search movies and retrieve additional movies details
    # 9) process output


# helper for step 8, defined first so the page loop below can call it
def retrieve_rt_data(movie_name, driver):
    """Look up ``movie_name`` on Rotten Tomatoes and return ``(score, plot)``.

    Expects the browser to already be on a Rotten Tomatoes page with the
    header search bar available. Types the name into the search bar, filters
    the results to movies, and opens the first of the top five results whose
    lower-cased title equals the lower-cased ``movie_name``.

    Args:
        movie_name: movie title to search for.
        driver: browser driver used for every page interaction.

    Returns:
        tuple: ``(rt_score_as_int, rt_plot)``. The score is an ``int``
        (tomatometer without the '%'), the plot is text. Either value is the
        string ``"none"`` when it could not be scraped, and both are
        ``"none"`` when the movie was not found. A pair is ALWAYS returned so
        the caller can insert both values and keep every row the same length.
    """
    not_available = ("none", "none")

    # add inputs on search bar
    search_bar_locator_1 = ["css selector", "#header-main > search-results-nav > search-results-controls > input[type=text]"]
    try:
        driver.find_element(search_bar_locator_1[0], search_bar_locator_1[1]).send_keys(movie_name, Keys.ENTER)
    except Exception:
        try:
            # this retry fixes the cases where a previous individual movie page threw a 404 Error.
            # In these cases the search bar is not reachable. Navigating back solves the issue
            driver.back()
            driver.find_element(search_bar_locator_1[0], search_bar_locator_1[1]).send_keys(movie_name, Keys.ENTER)
        except Exception:
            print ("Error related to finding or sending inputs thorugh the search bar⚠️")
            return not_available

    # navigate the results
    # click on "Movies" to filter results to show Movies only
    movies_filter_locator = ["css selector", "#search-results > nav > ul > li:nth-child(2) > span"]
    try:
        driver.find_element(movies_filter_locator[0], movies_filter_locator[1]).click()
        print ("\nFiltering results by 'movie'...\n")
    except Exception:
        print ("No matching results found or another error after entering the input in the search bar")
        return not_available

    # iterate over the first five search results
    for x in range (1,6):
        # get the movie name as shown on Rotten Tomatoes and compare it with the name from the input list
        result_name_locator = ["css selector", f"#search-results > search-page-result:nth-child(2) > ul > search-page-media-row:nth-child({x}) > a:nth-child(2)"]
        try:
            rt_name = driver.find_element(result_name_locator[0], result_name_locator[1]).text.lower()
        except Exception:
            print ("Error accesing the movie name in the result page ⚠️")
            break
        print (f">> {x}/5 Comparing result '{rt_name}'")
        if rt_name != movie_name.lower():
            continue

        print(f"'{movie_name}' found! ✅")
        # the result name matches: click on it to access the movie's individual page
        try:
            driver.find_element(result_name_locator[0], result_name_locator[1]).click()
        except Exception:
            print ("Error when clicking movie name ⚠️")
            break

        try:
            rt_score_locator = ["css selector","#modules-wrap > div.media-scorecard.no-border > media-scorecard > rt-button:nth-child(3) > rt-text"]
            rt_score = driver.find_element (rt_score_locator[0], rt_score_locator[1]).text
            # transform tomatometer into INT
            rt_score_as_int = int(rt_score.replace("%",""))
            print (" - Score 🍅 found✅")
        except Exception:
            rt_score_as_int = "none"
            print (" - Score not found ❌")

        try:
            plot_locator = ["css selector", "#modules-wrap > div.media-scorecard.no-border > media-scorecard > drawer-more > rt-text"]
            rt_plot = driver.find_element (plot_locator[0], plot_locator[1]).text
            print (" - Plot 📰 found✅")
        except Exception:
            rt_plot = "none"
            print (" - Plot not found ❌")

        return rt_score_as_int, rt_plot

    # no result matched, or the result list could not be read / clicked
    print ("Movie not found❌")
    return not_available


# 1) open browser

driver = webdriver.Chrome()

print("\nOpening Chrome 🧭...")


# 2) process input file

genre = "action"            # ⚠️IMPORTANT: change the genre for selecting different files

try:
    # iterate over the existing input files. By changing the page range, the program accesses a different file on each iteration.
    for page in range (2,10):   # ⚠️IMPORTANT: change the range for selecting different files
        start_page = page
        end_page = page + 1
        # call process_input function
        list_of_movies_from_api = process_input (genre, start_page, end_page)

        print (f"-----------------PAGE ITERATION #{page}----------------------------")

        # 3) open url (imdb)

        url = "https://www.imdb.com/"

        open_url (url, driver)

        # 4) accept_cookies

        imdb_cookies_locator = ["css selector", "#__next > div > div > div.sc-jrcTuL.bPmWiM > div > button.icb-btn.sc-bcXHqe.sc-dkrFOg.sc-iBYQkv.dcvrLS.ddtuHe.dRCGjd"]

        accept_cookies(imdb_cookies_locator)

        # 5) search movies on imdb website and retrieve movies data

        search_bar_locator = ["css selector", "#suggestion-search"]

        all_movies_data = search_movies_imdb(list_of_movies_from_api, search_bar_locator)

        # print list for testing
        print(all_movies_data)

        # 6) open url (rotten tomatoes)

        url = "https://www.rottentomatoes.com/"

        open_url (url, driver)

        # 7) accept cookies

        rt_cookies_locator = ["css selector", "#onetrust-accept-btn-handler"]

        accept_cookies(rt_cookies_locator)

        # 8) search movies and retrieve additional movies details

        # Rows are paired with their movie by IMDb id (element 0 of both the
        # input tuple and the scraped row), not by list position: the two lists
        # can differ in length when an IMDb search was skipped.
        names_by_id = {movie[0]: movie[2] for movie in list_of_movies_from_api}

        for i, movie_data in enumerate(all_movies_data):
            movie_name = names_by_id[movie_data[0]]
            print (f"\n<< Iteration {i} >>\nSearching for movie 🔍 '{movie_name}'...")

            rt_score_as_int, rt_plot = retrieve_rt_data(movie_name, driver)

            # always insert both values (possibly "none") so every row ends up
            # with the 8 fields expected by the output columns below
            movie_data.insert(2,rt_score_as_int)
            movie_data.insert(5,rt_plot)
            print(f"\nmovie_data list after appending 'rt_score' and 'plot' {movie_data}\n")

        # 9) process output

        # transform output list of lists into a list of tuples
        all_movies_data = [tuple(movie_data) for movie_data in all_movies_data]

        # print for testing
        print ("\nPrinting the final list of tuples before transforming into DataFrame")
        print (all_movies_data)

        # transform into a DataFrame
        # NOTE(review): 'screenriters_imdb' looks like a typo for 'screenwriters_imdb';
        # kept unchanged because anything reading the csv may depend on this header.
        columns =  ['id_imdb','score_imdb', 'score_rt', 'director_imdb', 'screenriters_imdb', 'plot_rt', 'duration_imdb', 'title_imdb']
        df_movies_details = pd.DataFrame(all_movies_data, columns = columns).set_index('id_imdb')
        print ("\nPrinting the final DataFrame before saving as csv")
        print (df_movies_details)

        # save into a .csv file (the data/selenium_movies folder must already exist)
        output_path = f"data/selenium_movies/list_{genre}_{start_page}_{end_page}_sel_movies.csv"
        df_movies_details.to_csv(output_path)
        print (f"\n{output_path} CREATED✅")

finally:
    # always close the browser, even when a page fails, so re-running this
    # cell does not leave orphaned Chrome windows behind
    driver.quit()
    print("\nClosing Chrome 🧭...")


Opening Chrome 🧭...
-----------------PAGE ITERATION #2----------------------------

Maximizing window 🪟...

Navigating to ⛵ https://www.imdb.com/...

Taking a quick nap 😴... Only 3 seconds though!

Cookies accepted 🍪...

<< Iteration 0 >>
Searching for movie 🔍 'Danger', id 'tt27625837'...
'Danger' found! ✅

movie_data list after appending 'movie_id' ['tt27625837']


Extracting data👽...
 - IMDB rating not found❌

movie_data list after appending 'imdb_score' ['tt27625837', 'none']

 - Director not found ❌

movie_data list after appending 'director' ['tt27625837', 'none', 'none']

 - Screenwriters not found ❌

movie_data list after appending 'screenwriters' ['tt27625837', 'none', 'none', 'none']

 - Duration not found ❌

movie_data list after appending 'duration' ['tt27625837', 'none', 'none', 'none', 'none']

 - IMDB name not found❌

movie_data list after appending 'imdb_name' ['tt27625837', 'none', 'none', 'none', 'none', 'none']


<< Iteration 1 >>
Searching for movie 🔍 'Kamen Rider 5