In [40]:
# psssss! READ ME! 
# purpose of these code: given a movie list, extract information about the movies from two websites, using selenium. 

# websites:
    # https://www.imdb.com/
    # https://www.rottentomatoes.com/

# about the info to extract:
    # IMDB score (if available)
    # Rotten Tomatoes score (Tomatometer)
    # Direction (director(s) of each film), will be retrived from IMDB website
    # Screenwriters (of each film), will be retrived from IMDB website
    # Plot, will be retrived from RottenTomatoes website
    # Duration (in minutes), will be retrived from IMDB website
    # Name of the film, will be retrived from IMDB website
    # Movie id (this exist in the entry list)

# output: information must be in a list of tuples (one tuple for each movie).
# example: [(imdb_score, rt_score, director, screenwriters, plot, duration, name, movie_id)...]
# example with real data: [(7.7, 77,  "Richard Donner", ["Chris ColumbusSteven", "Spielberg"], "Los Goonies son un grupo de amigos que viven en Goon Docks, Astoria, pero sus casas han sido compradas y van a ser demolidas. Sin embargo, vivirán su última aventura en busca de un tesoro que pueda salvar el barrio.", "1h 54min", "Los Gonnies", "tt31014713"),  ...]
# code structure:
    # to be added


# movie list, add here all the movies to be analized
lista_general = [('Movie', '2 Fingers Honey 2', 2024, 2, 'tt31014713', 'genre'),
 ('Movie', 'The Odd Room Mates', 2024, 3, 'tt30990201','genre'),
 ('Movie', 'Kallanmaarude Veedu', 2024, 1, 'tt30974578','genre'),
 ('Movie', 'Slasher House 3', 2024, 3, 'tt11078340'),
 ('Movie', 'Mr Patrick Wahala in America', 2024, 1, 'tt30970938','genre'),
 ('Movie', 'Furiosa: A Mad Max Saga', 2024, 5, 'tt12037194','genre'),
 ('Movie', 'The Heirloom', 2024, 1, 'tt30955919','genre'),
 ('Movie', 'Adrienne', 2024, None, 'tt13143064','genre'),
 ('Movie', 'Jaque Mate', 2024, 1, 'tt30850554','genre'),
 ('Movie', 'Mnemonix', 2024, 12, 'tt13590374','genre'),]

testing_list = [('Movie', '2 Fingers Honey 2', 2024, 2, 'tt31014713','genre'), # to be used for testing the program with less results
 ('Movie', 'The Odd Room Mates', 2024, 3, 'tt30990201','genre')]

# import libraries for data processing
# -----------------------------------------------------------------------
import pandas as pd

# Import libraries for web browser automation with Selenium
# -----------------------------------------------------------------------
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager # ChromeDriverManager manages the installation of the Chrome driver
from selenium.webdriver.common.keys import Keys # to simulate keyboard events in Selenium
from selenium.webdriver.support.ui import Select # to interact with <select> elements on web pages

# import libraries to pause execution
# -----------------------------------------------------------------------
from time import sleep  # Sleep is used to pause the execution of the program for a number of seconds

# settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None)  # Sets a Pandas option to display all columns of a DataFrame



In [54]:
# open browser
driver = webdriver.Chrome()
print("\nOpening Chrome 🧭...")

#open url
def open_url(url, driver):
    # maximize window
    driver.maximize_window()
    print("\nMaximizing window 🪟...")

    # navigate to website
    driver.get (url)
    print(f"\nNavigating to ⛵ {url}...")

    # pause to let the website load
    nap_time = 3
    sleep(nap_time)
    print(f"\nTaking a quick nap 😴... Only {nap_time} seconds though!")


# accept cookies
def accept_cookies(locator):

    try:
        driver.find_element(locator[0], locator[1]).click()
        print("\nCookies accepted 🍪...")
    except:
        print("\nCookies not found ❌...")




Opening Chrome 🧭...


In [55]:
# open imbd page and accept cookies calling the functions defined

url = "https://www.imdb.com/"
imdb_cookies_locator = ["css selector", "#__next > div > div > div.sc-jrcTuL.bPmWiM > div > button.icb-btn.sc-bcXHqe.sc-dkrFOg.sc-iBYQkv.dcvrLS.ddtuHe.dRCGjd"]

open_url (url, driver)
accept_cookies(imdb_cookies_locator)


Maximizing window 🪟...

Navigating to ⛵ https://www.imdb.com/...

Taking a quick nap 😴... Only 3 seconds though!

Cookies accepted 🍪...


In [57]:
# define functions for searching movies and retriving its data 

def search_movies_imdb (movies_list, search_bar_locator):

    all_movies_data = [] # create a list to store the info to be collected of each movie
    
    for i in range (len(movies_list)):
        movie_id = movies_list [i][4]
        movie_name = movies_list [i][1]
        print (f"\n<< Iteration {i} >>\nSearching for movie 🔍 '{movie_name}', id '{movie_id}'...")
        #search_bar_locator = ["css selector", "#suggestion-search"]
        try:
            driver.find_element(search_bar_locator[0],search_bar_locator[1]).send_keys(movie_id, Keys.ENTER)
            print(f"'{movie_name}' found! ✅")
        except:
            try: 
                driver.back() # this try fixis the cases where a previous individual movie page threw a 404 Error. In these cases the search bar is not reachable. Navigating back solves the issue 
                driver.find_element(search_bar_locator[0],search_bar_locator[1]).send_keys(movie_id, Keys.ENTER)
                print(f"'{movie_name}' found! ✅")
            except:
                print (f"Couldn't find movie '{movie_name}', id '{movie_id}' ❌")
                continue
        
        # call function "retrive_movie_data" to get the data related to the movie
        movie_data = retrive_movie_data(movie_id)
        
        # append movie info into final list 
        all_movies_data.append(movie_data)
        
    print("Seach completed ✅")
    return all_movies_data
 

def retrive_movie_data(movie_id):
        
        movie_data = [] # create an empty list to store info on each iteration. It's a temporary list that will be overwritten on each iteration

        print ("\nExtracting data👽...")

        imdb_score_locator = ["xpath", '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]/span[1]']
        director_locator = ["css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(1) > div > ul > li > a"]
        screenwriters_locator = ["css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(2) > div > ul > li:nth-child(1) > a"]
        duration_locator = ["css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > ul > li:nth-child(2)"]
        imdb_name_locator = ["css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > h1 > span"]

        # imdb score
        try:
            imdb_score = driver.find_element(imdb_score_locator[0], imdb_score_locator[1]).text
            print (" - IMDB rating ⭐ found")
        except:
            imdb_score = "none"
            print (" - IMDB rating not found❌")
        
        movie_data.append(imdb_score)

        # director
        try:
            director = driver.find_element(director_locator[0], director_locator[1]).text
            print (" - Director 🎬 found")
        except:
            director = "none"
            print (" - Director not found ❌")

        movie_data.append(director)

        # screenwriters
        try:
            screenwriters = driver.find_element(screenwriters_locator[0], screenwriters_locator[1]).text
            print (" - Screenwriters ✍️ found")
        except:
            screenwriters = "none"
            print (" - Screenwriters not found ❌")

        movie_data.append(screenwriters)

        # duration
        try:
            duration = driver.find_element(duration_locator[0], duration_locator[1]).text
            print (" - Duration ⏱️ found")
        except:
            duration = "none"
            print (" - Duration not found ❌")
        
        if duration != "none":
        #transform duration format from 1h 44min into INT (minutes)
        # get the number of hours
            if "h" in duration:
                h_position = duration.index("h")
                hours = int(duration[:h_position])
            else:
                hours = 0

            # get the number of minutes
            if "min" in duration:
                min_position = duration.index("min")
                minutes = int(duration [min_position -2 : min_position])
            else:
                minutes = 0
            
            duration_in_minutes = hours * 60 + minutes

            print (f"  -> Movie duration transformed from {duration} into {duration_in_minutes}, now we have duration in {type(duration_in_minutes)}! 🎉")
        
        else:
            duration_in_minutes = duration
        
        movie_data.append(duration_in_minutes)

        # imdb_name
        try:
            imdb_name = driver.find_element(imdb_name_locator[0], imdb_name_locator[1]).text
            print (" - IMDB name 🅰️ found")
        except:
            imdb_name = "none"
            print (" - IMDB name not found ❌")

        movie_data.append (imdb_name)

        movie_data.append (movie_id)

        #movies_full_info.append (movie_info)

        return movie_data


In [58]:
# search and retrieve movies data calling the functions defined

search_bar_locator = ["css selector", "#suggestion-search"]

all_movies_data = search_movies_imdb(lista_general, search_bar_locator)


<< Iteration 0 >>
Searching for movie 🔍 '2 Fingers Honey 2', id 'tt31014713'...
'2 Fingers Honey 2' found! ✅

Extracting data👽...
 - IMDB rating ⭐ found
 - Director 🎬 found
 - Screenwriters ✍️ found
 - Duration ⏱️ found
  -> Movie duration transformed from 1h 46min into 106, now we have duration in <class 'int'>! 🎉
 - IMDB name 🅰️ found

<< Iteration 1 >>
Searching for movie 🔍 'The Odd Room Mates', id 'tt30990201'...
'The Odd Room Mates' found! ✅

Extracting data👽...
 - IMDB rating not found❌
 - Director 🎬 found
 - Screenwriters ✍️ found
 - Duration not found ❌
 - IMDB name 🅰️ found

<< Iteration 2 >>
Searching for movie 🔍 'Kallanmaarude Veedu', id 'tt30974578'...
'Kallanmaarude Veedu' found! ✅

Extracting data👽...
 - IMDB rating not found❌
 - Director 🎬 found
 - Screenwriters ✍️ found
 - Duration not found ❌
 - IMDB name 🅰️ found

<< Iteration 3 >>
Searching for movie 🔍 'Slasher House 3', id 'tt11078340'...
'Slasher House 3' found! ✅

Extracting data👽...
 - IMDB rating ⭐ found
 - Dir

In [45]:
all_movies_data

[['8,4',
  'Ermal Mamaqi',
  'Klaudia Brahimaj',
  106,
  'Dy gisht mjaltë 2',
  'tt13590374'],
 ['none',
  'Ibukunoluwa Rolat-Abiola',
  'Bryan Walker',
  'none',
  'The Odd Room Mates',
  'tt13590374'],
 ['none',
  'Hussain Aroni',
  'Hussain Aroni',
  'none',
  'Kallanmaarude Veedu',
  'tt13590374'],
 ['6,4', 'Mj Dixon', 'Mj Dixon', 'none', 'Slasher House 3', 'tt13590374'],
 ['none',
  'Christopher Nkem Okafor',
  'Roberta Uti Okpako',
  90,
  'Mr Patrick Wahala in America',
  'tt13590374'],
 ['7,7', 'none', 'none', 0, 'Furiosa: De la saga Mad Max', 'tt13590374'],
 ['7,4', 'Ben Petrie', 'Ben Petrie', 87, 'The Heirloom', 'tt13590374'],
 ['none', 'none', 'none', 'none', 'none', 'tt13590374'],
 ['4,2', 'Jorge Nisco', 'Luis Bernardez', 104, 'Jaque Mate', 'tt13590374'],
 ['none', 'Steve Deering', 'Christopher Lambert', 0, 'Mnemonix', 'tt13590374']]

In [59]:
# open imbd page and accept cookies calling the functions defined
url = "https://www.rottentomatoes.com/"
rt_cookies_locator = ["css selector", "#onetrust-accept-btn-handler"]

open_url (url, driver)
accept_cookies(rt_cookies_locator)


Maximizing window 🪟...

Navigating to ⛵ https://www.rottentomatoes.com/...

Taking a quick nap 😴... Only 3 seconds though!

Cookies accepted 🍪...


In [60]:
# PENDING or Improvement for next phase: create function to search in Rotten Tomatoes page
 
for i in range (len(lista_general)):

    # find the movie
    movie_name = lista_general [i][1]
    print (f"\n<< Iteration {i} >>\nSearching for movie 🔍 '{movie_name}'...")

    # add inputs on search bar
    search_bar_locator_1 = ["css selector", "#header-main > search-results-nav > search-results-controls > input[type=text]"]
    try:
        driver.find_element(search_bar_locator_1[0], search_bar_locator_1[1]).send_keys(movie_name, Keys.ENTER)
    except:
        try: # this try fixis the cases where a previous individual movie page threw a 404 Error. In these cases the search bar is not reachable. Navigating back solves the issue
                driver.back()
                driver.find_element(search_bar_locator_1[0], search_bar_locator_1[1]).send_keys(movie_name, Keys.ENTER)
        except:
            print ("Error related to finding or sending inputs thorugh the search bar⚠️")
            continue
    
    # navigate the results
    # click on "Movies" to filter results to show Movies only
    movies_filter_locator = ["css selector", "#search-results > nav > ul > li:nth-child(2) > span"]
    driver.find_element(movies_filter_locator[0], movies_filter_locator[1]).click()
    print ("\nFiltering results by 'movie'...\n")

    # iterate over the first five search results 
    movie_found = False
    for x in range (1,6):
        # get movie name as in Rotten Tomatoes website and compare the name with the movie name from lista_general
        try: 
            result_name_locator = ["css selector", f"#search-results > search-page-result:nth-child(2) > ul > search-page-media-row:nth-child({x}) > a:nth-child(2)"]
            rt_name = driver.find_element(result_name_locator[0], result_name_locator[1]).text.lower()
        except:
            print ("Error accesing the movie name in the result page ⚠️")
            break
        print (f">> {x}/5 Comparing result '{rt_name}'")                                                   
        if rt_name == movie_name.lower():
            print(f"'{movie_name}' found! ✅")
            # if result name matches movie name, click on result name to access movie individual page
            try: 
                driver.find_element(result_name_locator[0], result_name_locator[1]).click()
            except:
                print ("Error when clicking movie name ⚠️")
                break
            try:
                rt_score_locator = ["css selector","#modules-wrap > div.media-scorecard.no-border > media-scorecard > rt-button:nth-child(3) > rt-text"]
                rt_score = driver.find_element (rt_score_locator[0], rt_score_locator[1]).text
                # transform tomatometer into INT
                rt_score_as_int = int(rt_score.replace("%",""))
                print (" - Score 🍅 found")
            except:
                rt_score_as_int = "none"
                print (" - Score not found ❌")
            try:
                plot_locator = ["css selector", "#modules-wrap > div.media-scorecard.no-border > media-scorecard > drawer-more > rt-text"]
                rt_plot = driver.find_element (plot_locator[0], plot_locator[1]).text
                print (" - Plot 📰 found")
            except:
                rt_plot = "none"
                print (" - Plot not found ❌")
            movie_found = True
            break

    if not movie_found: 
        print ("movie not found")
        rt_score_as_int = "none"
        rt_plot = "none"

    all_movies_data[i].insert(1,rt_score_as_int)
    all_movies_data[i].insert(4,rt_plot)


<< Iteration 0 >>
Searching for movie 🔍 '2 Fingers Honey 2'...

Filtering results by 'movie'...

>> 1/5 Comparing result '2 fingers honey'
>> 2/5 Comparing result 'honey 2'
>> 3/5 Comparing result 'winnie the pooh: blood and honey ii'
>> 4/5 Comparing result 'lots of ice and a little bit of water'
>> 5/5 Comparing result 'honey bee 2: celebrations'
movie not found

<< Iteration 1 >>
Searching for movie 🔍 'The Odd Room Mates'...

Filtering results by 'movie'...

>> 1/5 Comparing result 'the odd room mates'
'The Odd Room Mates' found! ✅
 - Score not found ❌
 - Plot 📰 found

<< Iteration 2 >>
Searching for movie 🔍 'Kallanmaarude Veedu'...

Filtering results by 'movie'...

>> 1/5 Comparing result '8am number veedu'
>> 2/5 Comparing result 'chinna veedu'
>> 3/5 Comparing result 'anandhapurathu veedu'
>> 4/5 Comparing result 'ente veedu appuvinteyum'
>> 5/5 Comparing result '13 no veedu'
movie not found

<< Iteration 3 >>
Searching for movie 🔍 'Slasher House 3'...

Filtering results by 'mov

In [61]:
# transformation to tuples

for i in range (len(all_movies_data)):
    all_movies_data[i] = tuple (all_movies_data[i])

In [62]:
all_movies_data

[('8,4',
  'none',
  'Ermal Mamaqi',
  'Klaudia Brahimaj',
  'none',
  106,
  'Dy gisht mjaltë 2',
  'tt31014713'),
 ('none',
  'none',
  'Ibukunoluwa Rolat-Abiola',
  'Bryan Walker',
  'A Nigerian woman and an African American man end up as roommates due to unforeseen circumstances.',
  'none',
  'The Odd Room Mates',
  'tt30990201'),
 ('none',
  'none',
  'Hussain Aroni',
  'Hussain Aroni',
  'none',
  'none',
  'Kallanmaarude Veedu',
  'tt30974578'),
 ('6,4',
  'none',
  'Mj Dixon',
  'Mj Dixon',
  'none',
  'none',
  'Slasher House 3',
  'tt11078340'),
 ('none',
  'none',
  'Christopher Nkem Okafor',
  'Roberta Uti Okpako',
  'none',
  90,
  'Mr Patrick Wahala in America',
  'tt30970938'),
 ('7,7',
  90,
  'none',
  'none',
  'Snatched from the Green Place of Many Mothers, young Furiosa falls into the hands of a great biker horde led by the warlord Dementus. Sweeping through the Wasteland, they come across the Citadel, presided over by the Immortan Joe. As the two tyrants fight for

In [66]:
# transform into a DataFrame
columns =  ['score_imdb', 'score_rt', 'director_imdb', 'screenriters_imdb', 'plot_rt', 'duration_imdb', 'title_imdb', 'id_imdb']
df_movies_details = pd.DataFrame(all_movies_data, columns = columns)
df_movies_details.set_index('id_imdb', inplace=True)
df_movies_details



Unnamed: 0_level_0,score_imdb,tomatometer,director_imdb,screenriters_imdb,plot_rt,duration_imdb,imdb_title
id_imdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt31014713,84,none,Ermal Mamaqi,Klaudia Brahimaj,none,106,Dy gisht mjaltë 2
tt30990201,none,none,Ibukunoluwa Rolat-Abiola,Bryan Walker,A Nigerian woman and an African American man e...,none,The Odd Room Mates
tt30974578,none,none,Hussain Aroni,Hussain Aroni,none,none,Kallanmaarude Veedu
tt11078340,64,none,Mj Dixon,Mj Dixon,none,none,Slasher House 3
tt30970938,none,none,Christopher Nkem Okafor,Roberta Uti Okpako,none,90,Mr Patrick Wahala in America
tt12037194,77,90,none,none,"Snatched from the Green Place of Many Mothers,...",0,Furiosa: De la saga Mad Max
tt30955919,74,none,Ben Petrie,Ben Petrie,James Yang (Jason Chang) is a promising young ...,87,The Heirloom
tt13143064,none,none,none,none,none,none,none
tt30850554,42,none,Jorge Nisco,Luis Bernardez,,104,Jaque Mate
tt13590374,none,none,Steve Deering,Christopher Lambert,none,0,Mnemonix


In [51]:
# save into a .csv file
df_movies_details.to_csv("movies_selenium_data.csv")