## 1. Challenge, prep and imports

In [4]:
# to improve: 
    # I can't find "plot" in IMDB website
    # la forma en que se llama a la función find_and_save puede mejorarse (agregar un iterador por ejemplo) 

# purpose of these code: given a movie list, extract information about the movies from two websites, using selenium. 
# store the info in tuples. One tuple for each movie. Example:
    # [(7.7, 77,  "Richard Donner", ["Chris ColumbusSteven", "Spielberg"], "Los Goonies son un grupo de amigos que viven en Goon Docks, Astoria, pero sus casas han sido compradas y van a ser demolidas. Sin embargo, vivirán su última aventura en busca de un tesoro que pueda salvar el barrio.", "Aventura", "1h 54min", "Los Gonnies"),  ...]

# info to extract:
    # IMDB score (if available)
    # Rotten Tomatoes score (Tomatometer)
    # Direction (director(s) of each film)
    # Screenwriters (of each film)
    # Plot
    # Duration (in minutes)
    # Name of the film

# websites:
    # https://www.imdb.com/
    # https://www.rottentomatoes.com/

# movie list
lista_general = [('Movie', '2 Fingers Honey 2', 2024, 2, 'tt31014713'),
 ('Movie', 'The Odd Room Mates', 2024, 3, 'tt30990201'),
 ('Movie', 'Kallanmaarude Veedu', 2024, 1, 'tt30974578'),
 ('Movie', 'Slasher House 3', 2024, 3, 'tt11078340'),
 ('Movie', 'Mr Patrick Wahala in America', 2024, 1, 'tt30970938'),
 ('Movie', 'Furiosa: A Mad Max Saga', 2024, 5, 'tt12037194'),
 ('Movie', 'The Heirloom', 2024, 1, 'tt30955919'),
 ('Movie', 'Adrienne', 2024, None, 'tt13143064'),
 ('Movie', 'Jaque Mate', 2024, 1, 'tt30850554'),
 ('Movie', 'Mnemonix', 2024, 12, 'tt13590374'),]

testing_list = [('Movie', '2 Fingers Honey 2', 2024, 2, 'tt31014713'),
 ('Movie', 'The Odd Room Mates', 2024, 3, 'tt30990201')]

# import libraries for data processing
# -----------------------------------------------------------------------
import pandas as pd

# Import libraries for web browser automation with Selenium
# -----------------------------------------------------------------------
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager # ChromeDriverManager manages the installation of the Chrome driver
from selenium.webdriver.common.keys import Keys # to simulate keyboard events in Selenium
from selenium.webdriver.support.ui import Select # to interact with <select> elements on web pages

# import libraries to pause execution
# -----------------------------------------------------------------------
from time import sleep  # Sleep is used to pause the execution of the program for a number of seconds

# settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None)  # Sets a Pandas option to display all columns of a DataFrame



In [3]:
# open browser
driver = webdriver.Chrome()
print("\nOpening Chrome 🧭...")

# maximize window
driver.maximize_window()
print("\nMaximizing window 🪟...")

# navigate to website
url = "https://www.imdb.com/"
driver.get (url)
print(f"\nNavigating to ⛵ {url}...")

# pause to let the website load 
nap_time = 3
sleep(nap_time)
print(f"\nTaking a quick nap 😴... Only {nap_time} seconds though!")

# accept cookies
css_selector_imbd = "#__next > div > div > div.sc-jrcTuL.bPmWiM > div > button.icb-btn.sc-bcXHqe.sc-dkrFOg.sc-iBYQkv.dcvrLS.ddtuHe.dRCGjd"
driver.find_element("css selector", css_selector_imbd).click()
print("\nAccepting cookies 🍪...")

# info to extract:
    # IMDB score (if available) --> IMDB website
    # Rotten Tomatoes score (Tomatometer) --> Rotten Tomatoes website
    # Direction (director(s) of each film) --> Exists in both websites
    # Screenwriters (of each film) --> IMDB website
    # Plot --> IMDB website
    # Duration (in minutes) --> IMDB website
    # Name of the film --> This is available already

# create a list to store the info to be collected from each movie
all_movies_info = [] 

# implement a for loop for iterating each movie on the list
#for i in range (len(lista_general)):
for i in range (len(testing_list)):
   
    movie_id = lista_general [i][4]
    movie_name = lista_general [i][1]
    print (f"\n<<{i}>>\nSearching for movie 🔍 '{movie_name}', id '{movie_id}'...")

    movie_list = [] # empty list to store info on each iteration. It's a temporary list

    # implement a try exept for managing errors (if not found for example)
    try: 
        # find the movie > enter the id into the search bar 
        driver.find_element("css selector", "#suggestion-search").send_keys(movie_id, Keys.ENTER)
        print(f"'{movie_name}' found! 🎉")
        print ("\nExtracting data👽...")
    except:
        # add none to the list
        print (f"Couldn't find movie '{movie_name}', id '{movie_id}' ❌")
        driver.back()
        continue
    
    def find_and_save (data, emoji, locator_type, expression):
        """
        Finds an element in a website page, extracts the text within the element and appends its value in a list.

        Parameters:
        - data (string): data to be found; e.g. "IMDB rating" or "Director"
        - locator_type (string): the strategy to be used within the find_element method to find the element; e.g. "css selector" or "xpath"
        - expression (string): expression from the website to be used within the find_element method to find the element; e.g. css selector or xpath expression
        - emoji (string): the icon that will be print in the success message, must be representative of the data to be found

        Returns:
        - list: a list containings all value found or "none" if nothing found.
        """
        try:
            data = data
            value = driver.find_element(locator_type, expression).text
            movie_list.append(value)
            print (f" - {data}: {emoji}{value}")
        except:
            imbd_rating = "none"
            movie_list.append(imbd_rating)
            print (f" - No {data} found❌")
        return movie_list

    movie_list = find_and_save ("IMDB_rating", "⭐", "xpath", '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]/span[1]')
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)

    movie_list = find_and_save ("Director","🎬","css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(1) > div > ul > li > a")
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)

    movie_list = find_and_save ("Screenwriters", "✍️", "css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-4.gEsAEH > div.sc-491663c0-6.eQRCDK > div.sc-491663c0-10.emoxHI > section > div.sc-1f50b7c-3.ZYFjc > div > ul > li:nth-child(2) > div > ul > li:nth-child(1) > a")
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)
    
    movie_list = find_and_save ("Plot", "📰", "css selector", '#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > div > section > div > div.sc-978e9339-1.ihWZgK.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(18) > div.sc-20579f43-0.kJbyJL > div > div > div > div')
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)

    movie_list = find_and_save ("Duration", "⏱️", "css selector","#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > ul > li:nth-child(2)")
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)

    movie_list = find_and_save ("Film name IMDB", "🅰️", "css selector", "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-c41b9732-0.NeSef > section > div:nth-child(5) > section > section > div.sc-491663c0-3.bdjVSf > div.sc-1f50b7c-0.PUxFE > h1 > span")
    movie_tuple = tuple(movie_list)
    all_movies_info.append(movie_tuple)


print(f"\nIMDB web scrapping COMPLETED! ✅")
print (all_movies_info)



Opening Chrome 🧭...

Maximizing window 🪟...

Navigating to ⛵ https://www.imdb.com/...

Taking a quick nap 😴... Only 3 seconds though!

Accepting cookies 🍪...

<<0>>
Searching for movie 🔍 '2 Fingers Honey 2', id 'tt31014713'...
'2 Fingers Honey 2' found! 🎉

Extracting data👽...
 - IMDB_rating: ⭐8,5
 - Director: 🎬Ermal Mamaqi
 - Screenwriters: ✍️Klaudia Brahimaj
 - No Plot found❌
 - Duration: ⏱️1h 46min
 - Film name IMDB: 🅰️Dy gisht mjaltë 2

<<1>>
Searching for movie 🔍 'The Odd Room Mates', id 'tt30990201'...
'The Odd Room Mates' found! 🎉

Extracting data👽...
 - No IMDB_rating found❌
 - Director: 🎬Ibukunoluwa Rolat-Abiola
 - Screenwriters: ✍️Bryan Walker
 - No Plot found❌
 - No Duration found❌
 - Film name IMDB: 🅰️The Odd Room Mates

IMDB web scrapping COMPLETED! ✅
[('8,5',), ('8,5', 'Ermal Mamaqi'), ('8,5', 'Ermal Mamaqi', 'Klaudia Brahimaj'), ('8,5', 'Ermal Mamaqi', 'Klaudia Brahimaj', 'none'), ('8,5', 'Ermal Mamaqi', 'Klaudia Brahimaj', 'none', '1h 46min'), ('8,5', 'Ermal Mamaqi', 'K

In [14]:
# open browser
driver = webdriver.Chrome()
print("\nOpening Chrome 🧭...")

# maximize window
driver.maximize_window()
print("\nMaximizing window 🪟...")

# open website
url = "https://www.rottentomatoes.com/"
driver.get (url)
print(f"\nNavigating to ⛵ {url}...")

# pause to let the website load 
nap_time = 3
sleep(nap_time)
print(f"\nTaking a quick nap 😴... Only {nap_time} seconds though!")

# accept cookies
css_selector = "#onetrust-accept-btn-handler" 
driver.find_element("css selector", css_selector).click()
print("\nAccepting cookies 🍪...")

for i in range (len(lista_general)):
    movie_name = lista_general [i][1]
    
    print (f"\n<<{i}>>\nSearching for movie 🔍 '{movie_name}'...")
    # search with movie name
    driver.find_element("css selector", "#header-main > search-results-nav > search-results-controls > input[type=text]").send_keys(movie_name, Keys.ENTER)
    # click on "Movies" to filter results to show Movies only
    driver.find_element("css selector", "#search-results > nav > ul > li:nth-child(2) > span").click()
    print ("\nFiltering results by 'movie'...")
    # add a for loop to iterate on diferent results
    for i in range (1,6):
        try:
            # go to the result and get movie name as in Rotten Tomatoes website
            rt_movie_name = driver.find_element("css selector", f"#search-results > search-page-result:nth-child(2) > ul > search-page-media-row:nth-child({i}) > a:nth-child(2)").text.lower()
            # print (f"\nComparing result {i}: '{rt_movie_name}'")
            # compare the name with the movie name
            if rt_movie_name == movie_name.lower():
                    # access movie page
                    print ("Movie found 🎉")
                    driver.find_element("css selector", f"#search-results > search-page-result:nth-child(2) > ul > search-page-media-row:nth-child({i}) > a:nth-child(2)").click()
                    find_and_save ("Rotten Tomatoes score","🍅", "css selector","#modules-wrap > div.media-scorecard.no-border > media-scorecard > rt-button:nth-child(3) > rt-text")
                    find_and_save ("Plot", "", "css selector", "#modules-wrap > div.media-scorecard.no-border > media-scorecard > drawer-more > rt-text")
            else:
                # print (f" > Result {i}'s name doesn't match 😒")
                # go to the second result and repeat 4.A
                continue
        except:
            print ("not found - Error")
            #print a message "not found"
            #append "none"
            pass
    print ("\nMovie not found ❌ in Rotten Tomatoes")    
        
        
        # 1. go to the first result
        # 2. go to the first result
        # 3. compare the name with the movie name
        # 4.A if it matches, access the movie page
            # get the Tomatometer and save it in a list
            # get the plot
        # 4.B if it doesn't match go back
            # go to the second result and repeat 4.A

# add a for loop to iterate on diferent results

    







Opening Chrome 🧭...

Maximizing window 🪟...

Navigating to ⛵ https://www.rottentomatoes.com/...

Taking a quick nap 😴... Only 3 seconds though!

Accepting cookies 🍪...

<<0>>
Searching for movie 🔍 '2 Fingers Honey 2'...

Filtering results by 'movie'...

Movie not found ❌ in Rotten Tomatoes

<<1>>
Searching for movie 🔍 'The Odd Room Mates'...

Filtering results by 'movie'...
Movie found 🎉
 - No Rotten Tomatoes score found❌
 - Plot: A Nigerian woman and an African American man end up as roommates due to unforeseen circumstances.
not found - Error
not found - Error
not found - Error
not found - Error

Movie not found ❌ in Rotten Tomatoes

<<2>>
Searching for movie 🔍 'Kallanmaarude Veedu'...

Filtering results by 'movie'...

Movie not found ❌ in Rotten Tomatoes

<<3>>
Searching for movie 🔍 'Slasher House 3'...

Filtering results by 'movie'...

Movie not found ❌ in Rotten Tomatoes

<<4>>
Searching for movie 🔍 'Mr Patrick Wahala in America'...

Filtering results by 'movie'...

Movie not fou

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=126.0.6478.127)
Stacktrace:
	GetHandleVerifier [0x00007FF66038EEA2+31554]
	(No symbol) [0x00007FF660307ED9]
	(No symbol) [0x00007FF6601C8559]
	(No symbol) [0x00007FF660211D73]
	(No symbol) [0x00007FF66020FEDB]
	(No symbol) [0x00007FF66023D02A]
	(No symbol) [0x00007FF66020BA76]
	(No symbol) [0x00007FF66023D240]
	(No symbol) [0x00007FF66025C977]
	(No symbol) [0x00007FF66023CDD3]
	(No symbol) [0x00007FF66020A33B]
	(No symbol) [0x00007FF66020AED1]
	GetHandleVerifier [0x00007FF660698B1D+3217341]
	GetHandleVerifier [0x00007FF6606E5AE3+3532675]
	GetHandleVerifier [0x00007FF6606DB0E0+3489152]
	GetHandleVerifier [0x00007FF66043E776+750614]
	(No symbol) [0x00007FF66031375F]
	(No symbol) [0x00007FF66030EB14]
	(No symbol) [0x00007FF66030ECA2]
	(No symbol) [0x00007FF6602FE16F]
	BaseThreadInitThunk [0x00007FFA222A257D+29]
	RtlUserThreadStart [0x00007FFA243CAF28+40]
