In [1]:
import re

In [2]:
from IPython.display import clear_output

In [3]:
import requests

In [4]:
import bs4

In [5]:
from time import sleep

Movie attributes:

- length ("Runtime")
- date released
- genre      
- RT score (critic/audience/bool=cert fresh)
- original language
- platforms to watch on (free? subscription?)
- URL to the movie on RT


In [121]:
class Movie:
        
    def __init__(self, title, genre, scores, og_lang, release_date, length, platforms, url):
        self.genre = genre
        self.title = title
        self.scores = scores
        self.og_lang = og_lang
        self.release_date = release_date
        self.length = length
        self.platforms = platforms
        self.url = url
        
    def __str__(self):
#         Print a divider line to separate movies
        string = '-'*85+"\n"*3
        
        string += f"{self.title} ({self.release_date}),  Genre(s): {self.genre}\n"
        
        yn = ''
        if self.scores[2]:
            yn = 'yes'
        else:
            yn = 'no'
            
        score_string = f"Critic score: {self.scores[0]}, Audience score: {self.scores[1]}, Certified fresh: {yn}" + "\n"
        
        string += score_string
        
        string += f"Length: {self.length}, Original language: {self.og_lang}"
        
        platforms_string = ''
        if len(self.platforms) == 0:
            platforms_string = 'none'
        else:
            platforms_string = "Available platforms: "
            for key,value in self.platforms.items():
                platforms_string += f"{key}: {value}, "
                
#             Cut off the last comma
            platforms_string = platforms_string[:-2]
        
        string += "\n"+platforms_string
        
        string += "\n"+self.url
        
        return string

In [122]:
# This class has not yet been implemented
class search_query:
    
    def __init__(self, selected_genres, selected_scores, selected_og_lang, selected_release_date, selected_length, selected_pay, selected_platforms):
        self.selected_genres = selected_genres
        self.selected_scores = selected_scores
        self.selected_og_lang = selected_og_lang
        self.selected_release_date = selected_release_date
        self.selected_length = selected_length
        self.selected_pay = selected_pay
        self.selected_platforms = selected_platforms
    
    def __str__(self):
        string = "Your previous search consisted of movies that were "

HOURS_TO_MINUTES

In [125]:
# Input looks like one of the following Strings:  "1h 23m"   "1h 7m"   "45m"
# Outputs runtime (int) in minutes

def hours_to_minutes(hour_string):
    num = 0
    if len(hour_string) == 6:
        num = 60*int(hour_string[0]) + 10*int(hour_string[3]) + int(hour_string[4])
    if len(hour_string) == 5:
        num = 60*int(hour_string[0]) + int(hour_string[3])
    if len(hour_string) == 3:
        num = int(hour_string[0:2])
    if len(hour_string) == 2:
        num = int(hour_string[0])
    return num

SELECT_SEARCH_METHOD     (not yet implemented)

In [126]:
# This method has not been implemented yet
def select_search_method(new_search=False):
    y_or_n = input("Would you like to modify a previous search query? Enter Y or N: ")
    if y_or_n.lower() == 'y':
        clear_output()
        print(previous_search)
        print()
        print("This is your previous search. To change a search entry, enter its corresponding number: ")
    else:
        pass
    

SELECT_GENRES

In [127]:
# Prompts user for list of genres,  returns corresponding sorted list: e.g. [1,4,18]
# The numbers in this list are used for creating the RT URL corresponding to the desired genres

def select_genres():
    print("Select from the genres below: \n")
    print("Action"+' '*10+"Animation"+' '*7+"Art & Foreign"+' '*7+"Classics"+' '*8+"\nComedy"+' '*10+
          "Documentary"+' '*5+"Drama"+' '*15+"Horror"+' '*10+"\nKids & Family"+' '*3+"Mystery"+' '*9+"Sci-fi & Fantasy"+' '*4
          +"Romance")
    print('\ne.g. Enter something like "Action, Drama, Mystery"  (but exclude the quotation marks)')
    
    genres = input()
    splitted = genres.split(',')
    
#     Remove spaces using re (regular expressions)
    pattern = r"( *)([a-zA-Z& -]+)( *)"
    for i in range(len(splitted)):
        match = re.search(pattern,splitted[i])
        splitted[i] = genre_dict[match.group(2).lower()]
        
    splitted.sort()
        
    return (splitted, genres)
    

SELECT_SCORE

In [10]:
# Prompts user for score lower-bound preferences and certified fresh:   e.g.  returns [80,90,True]
def select_score():
    crit_higher_than = int(input("...that have a critic score higher than:  \n"))
    aud_higher_than = int(input("...that have an audience score higher than:  \n"))
    fresh_bool = input('...that are "Certified Fresh". Enter Y or N:  ')
    if fresh_bool.lower() == 'y':
        fresh_bool = True
    else:
        fresh_bool = False
        
    return [crit_higher_than, aud_higher_than, fresh_bool]

SELECT_LANG

In [11]:
# Prompts user for whether they want movies in English only.   Returns 'English' or 'any'
def select_lang():
    lang = input("...that are in English. Enter Y or N:  ")
    if lang.lower() == 'n':
        return 'any'
    else:
        return 'English'

SELECT_DATE

In [12]:
# Prompts user for lower and upper bound for movies.     Returns something like [1980,2000]
def select_date():
    lower = int(input("...that were made between the year "))
    upper = int(input("and the year "))
    if upper<lower:
        temp = upper
        upper = lower
        lower = temp
    return [lower,upper]

SELECT_LENGTH

In [13]:
# Prompts user for lower and upper bound movie length.     Returns something like [90,150]
def select_length():
    lower = int(input("that are longer than (minutes): "))
    upper = int(input("but shorter than (minutes): "))
    if upper<lower:
        temp = upper
        upper = lower
        lower = temp
    return [lower,upper]
    

CREATE_RELEASE_URL

In [14]:
def create_release_url(genres,scores):
    genre_list = str(genres[0])
    for i in range(1,len(genres)):
        genre_list += ";"+str(genres[i])
    
    certified = str(scores[2]).lower()
    
    url = f'https://www.rottentomatoes.com/browse/dvd-streaming-all?minTomato={scores[0]}&maxTomato=100&certified={certified}&services=amazon;hbo_go;itunes;netflix_iw;vudu;amazon_prime;fandango_now&genres={genre_list}&sortBy=release'
    
    return url

CREATE_TOMATO_URL

In [15]:
def create_tomato_url(genres,scores):
    genre_list = str(genres[0])
    for i in range(1,len(genres)):
        genre_list += ";"+str(genres[i])
    
    certified = str(scores[2]).lower()
    
    url = f'https://www.rottentomatoes.com/browse/dvd-streaming-all?minTomato={scores[0]}&maxTomato=99&certified={certified}&services=amazon;hbo_go;itunes;netflix_iw;vudu;amazon_prime;fandango_now&genres={genre_list}&sortBy=tomato'
    
    return url

CREATE_TOMATO_URL_100

In [16]:
def create_tomato_url_100(genres,scores):
    genre_list = str(genres[0])
    for i in range(1,len(genres)):
        genre_list += ";"+str(genres[i])
    
    certified = str(scores[2]).lower()
    
    url = f'https://www.rottentomatoes.com/browse/dvd-streaming-all?minTomato={scores[0]}&maxTomato=100&certified={certified}&services=amazon;hbo_go;itunes;netflix_iw;vudu;amazon_prime;fandango_now&genres={genre_list}&sortBy=tomato'
    
    return url

NUM_RECOMMENDATIONS

In [17]:
# Prompts user for number of movie recommendations       returns an int
def num_recommendations():
    num = int(input("How many movie recommendations do you want to see?  "))
    return num

SELECT_PLATFORMS

In [18]:
# Returns list of BeautifulSoup-formatted platforms
def select_platforms():
    print("I would like to be shown movies that are available for free from (or with a subscription to) the platforms below: \n")
    print("Amazon Prime"+' '*3+"Hulu"+' '*11+"Netflix"+' '*8+"Disney+"+' '*5+"ESPN Plus"+' '*6+
          "\nHBO Go"+' '*9+"HBO Max"+' '*8+"HBO Now"+' '*8+"Peacock"+' '*5+"Vudu"+' '*11+"\nApple TV"+' '*7
          +"iTunes"+' '*9+"XFINITY"+' '*8+"E! Online"+' '*6)
    print('\ne.g. Enter something like "Netflix, Amazon Prime, Vudu"  (but exclude the quotation marks)')
    
    platforms = input()
    splitted = platforms.split(',')
    
#     Remove spaces
    pattern = r"( *)([a-zA-Z&!-]+) *([a-zA-Z&!-]+)( *)"
    for i in range(len(splitted)):
        match = re.search(pattern,splitted[i])
        to_dict = match.group(2)+match.group(3)
        splitted[i] = platform_dict_selected_to_soup[to_dict.lower()]
        
# Now splitted is a list of soup-language platforms

    print(splitted)
    return splitted

SELECT_PAYMENT

In [19]:
# Returns 'y' if the user is okay with spending money, 'n' otherwise
def select_payment():
    yes_or_no = input("Are you okay with spending money on a movie? Enter Y or N: ")
    if yes_or_no.lower() == 'y':
        return 'y'
    else:
        return 'n'
        

## Useful dictionaries

In [129]:
# platforms: vudu, peacock, netflix, hulu, amazon-prime-video-us, disney-plus-us, espn-plus, apple-tv-plus-us,
# hbo-now, hbo-go, hbo-max, itunes, xfinity, e-online


# Here I define some useful dictionaries.


# Takes in a user-input-formatted platform and outputs a BeautifulSoup-formatted platform 
platform_dict_selected_to_soup = {'vudu':'vudu', 'peacock':'peacock', 'netflix':'netflix', 'hulu':'hulu',
                                  'amazonprime':'amazon-prime-video-us', 'amazon':'amazon-prime-video-us',
                                  'prime':'amazon-prime-video-us', 'disney+':'disney-plus-us',
                                  'disneyplus':'disney-plus-us', 'disney':'disney-plus-us', 'espnplus':'espn-plus',
                                  'espn':'espn-plus', 'appletvplus':'apple-tv-plus-us', 'appletv':'apple-tv-plus-us',
                                  'apple':'apple-tv-plus-us', 'hbonow':'hbo-now', 'hbogo':'hbo-go', 'hbomax':'hbo-max',
                                  'itunes':'itunes', 'xfinity':'xfinity', 'e!online':'e-online', 'e-online':'e-online', 'eonline':'e-online'
                                 }

# Turns soup language into a printable form
platform_dict_soup_to_printable = {'vudu':'Vudu',
                                   'peacock':'Peacock',
                                   'netflix':'Netflix',
                                   'hulu':'Hulu',
                                   'amazon-prime-video-us':'Amazon Prime',
                                   'disney-plus-us':'Disney+',
                                   'espn-plus':'ESPN Plus',
                                   'apple-tv-plus-us':'Apple TV',
                                   'hbo-now':'HBO Now',
                                   'hbo-go':'HBO Go',
                                   'hbo-max':'HBO Max',
                                   'itunes':'iTunes',
                                   'xfinity':'XFINITY',
                                   'e-online':'E! Online'}


genre_dict = {'action':1, 'animation':2, 'art & foreign':4, 'art':4, 'foreign':4, 'art&foreign':4, 'classics':5,'comedy':6, 'documentary':8, 'drama':9,
              'horror':10, 'kids & family':11, 'kids':11, 'mystery':13, 'sci-fi & fantasy':14, 'sci-fi':14, 'scifi':14, 'sci-fy':14, 'scify':14, 'fantasy':14, 'scifi & fantasy':14, 'romance':18}


inverse_genre_dict = {1: 'Action', 2: 'Animation', 4: 'Art & Foreign', 5: 'Classics', 6: 'Comedy', 8: 'Documentary', 9: 'Drama', 10: 'Horror', 11: 'Kids & Family', 13: 'Mystery', 14: 'Sci-fi & Fantasy', 18: 'Romance'}


## Enhanced search method

In [345]:
from selenium import webdriver
from random import randint
from selenium.webdriver.chrome.options import Options

In [362]:
#  Not yet implemented
saved_soup = ''

In [347]:
def enhanced_search():
    
    global saved_soup

    print("Rotten Tomatoes enhanced search"+'\n'*2)
    
    genre_list,genre_string = select_genres()
    sleep(2)
    clear_output()
    
    selected_genres = inverse_genre_dict[genre_list[0]]
    for i in range(1,len(genre_list)):
        selected_genres += f", {inverse_genre_dict[genre_list[i]]}"
    print(f"Selected genres: {selected_genres}\n")
    
    print("From the genres I selected, I would only like to be shown movies..."+'\n')
    selected_scores = select_score()
    print()
    selected_og_lang = select_lang()
    print()
    selected_release_date = select_date()
    print()
    selected_length = select_length()
    print()
#     either 'y' or 'n':
    selected_pay = select_payment()
    if selected_pay == 'n':
        sleep(1)
        clear_output()
        selected_platforms = select_platforms()
    print()
    
    print("Search criteria complete.")
    print()
    sleep(3)
    clear_output()
    print("Search criteria complete.")
    
    selected_recommendations = num_recommendations()
    
    sleep(2)
    clear_output()
    print(f"You want to see {selected_recommendations} movies")
    print("Searching for movies...")
    
    
# Create two URLs. We will combine their search pools into one.
    
#     Create URL which sorts movies by release date (release date on a streaming platform, not original release date)
    initial_url = create_release_url(genre_list,selected_scores)
    
#     Create URL which sorts movies by Tomatometer
    secondary_url = create_tomato_url(genre_list,selected_scores)
    
    
#     NOTE: YOU WILL HAVE TO CONFIGURE THE Selenium Web Driver

    my_options = Options()
#     This allows the Chrome driver to operate "headless" (a Chrome window is not opened)
    my_options.add_argument("--headless")
    
#     Create the driver
    driver = webdriver.Chrome('/Users/bean/Desktop/Jupyter Notebooks/chromedriver',options=my_options)

#     Optional option argument for window size:
    # my_options.add_argument('window-size=1200x600')
    
    
#     Launch initial_url in driver
    driver.get(initial_url)
    sleep(3)
    
    
#     Click the list-view button. This way we can scrape the runtime (movie length), which is unreachable when the
#     movies are in grid-view.
    list_item_element = driver.find_element_by_css_selector('.fontelloIcon.icon-list')
    try:
        list_item_element.click()
    except:
        list_item_element.click()

    # Click the "Show More" button exactly 5 times
    for n in range(5):
        show_more_element = driver.find_element_by_css_selector('.btn.btn-secondary-rt.mb-load-btn')
        try:
            show_more_element.click()
        except:
            show_more_element.click()
#         Look less like a bot:
        s = randint(3, 5)  
        sleep(s)
        
        
    page = driver.page_source

    
#     This soup object contains all the data we need for the first search pool.
    initial_soup = bs4.BeautifulSoup(page, "lxml")
    


#     Load second page
    driver.get(secondary_url)
    sleep(3)
    
#     Click the list-view button. Attempt to click it again if it throws an exception the first tiem.
    list_item_element = driver.find_element_by_css_selector('.fontelloIcon.icon-list')
    try:
        list_item_element.click()
    except:
        list_item_element.click()
    
    # Click the "Show More" button exactly 5 times
    for n in range(5):
        show_more_element = driver.find_element_by_css_selector('.btn.btn-secondary-rt.mb-load-btn')
        try:
            show_more_element.click()
        except:
            show_more_element.click()
#         Look less like a bot:
        
    
    page = driver.page_source

    secondary_soup = bs4.BeautifulSoup(page, "lxml")
    
#     Quit the driver, we are done with it. We have all our data stored in our soup objects!
    driver.quit()
    
    movies_added = 0
    movie_list = []
    
    url_list = set({})

    pattern_url_tomatoIcon_tomatoScore_runtime = r'"url":"(/m/.+?)","tomatoIcon":"(.+?)","tomatoScore":(\d+),(.+?)"thumborId"'

    runtime_pattern = r'"runtime":"(.+?)"'

    soups = (initial_soup, secondary_soup)
    
    for soup in soups:

        for tag in soup.find_all("div", class_="mb-movie"):

            url = tag.div.next_sibling.a.get('href')

            try:
                other_info_string_with_spaces = str(tag.select(".other_info")[0])
                other_info_string_no_spaces = other_info_string_with_spaces.replace(' ','')
                length_pattern = r"(\d)hr\.(\d+)min\."

                length_match = re.search(length_pattern, other_info_string_no_spaces)

                if length_match:
                    hours = length_match.group(1)
                    minutes = length_match.group(2)
                    runtime_string = hours+"h "+minutes+"m"
                    runtime = hours_to_minutes(runtime_string)
                    
#                 If no exception occurred in the first line, and there's no match, then the runtime must be
#                 under an hour, i.e. like this: "23min"
                else:
                    runtime = int(other_info_string_no_spaces[:-3])

            except:
                runtime = -1

            
            if runtime != -1:
                if runtime < selected_length[0]:
                    continue
                if runtime > selected_length[1]:
                    continue


                
                
            is_cert_fresh = False
            if len(tag.select(".icon.tiny.certified_fresh")) != 0:
                is_cert_fresh = True

            if selected_scores[2] and not is_cert_fresh:
                continue

            tomato_score = int(tag.select(".tMeterScore")[0].text[:-1])
            if tomato_score < selected_scores[0]:
                continue

            
#             Add the movie URL to our list if we haven't skipped it by now.
#             i.e. it hasn't failed the search criteria yet, judging from data on the search page
            url_list.add(url)
    
    print(f"Search pool (url list) contains {len(url_list)} movies.")
    
        

#     Now, we scrape the URL of each added movie using Requests and BeautifulSoup to do the bulk of our filtering

    for url in url_list:
               
        if movies_added == selected_recommendations:
            break

        movie_url = "https://www.rottentomatoes.com"+url
        result = requests.get(movie_url)
        soup = bs4.BeautifulSoup(result.text,'lxml')

        print(movie_url)
        

        aud_cri_soup = soup.select("score-board")
        
        audience_score = ''
        critic_score = ''
        
        try:
            audience_score = int(aud_cri_soup[0]["audiencescore"])
        except:
#             I chose to skip movies that don't have an audience score
            continue
        try:
            critic_score = int(aud_cri_soup[0]["tomatometerscore"])
        except:
#             I chose to skip movies that don't have a critic score
            continue


        certified_fresh = False
        certified_soup = soup.select("#mps-page-integration")
        certified_pattern = r'certified_fresh]":"(\d)"'
        certified_match = re.search(certified_pattern,str(certified_soup[0]))

        if certified_match.group(1) == '1':
            certified_fresh = True
            
        scores = [critic_score,audience_score,certified_fresh]
            
        if selected_scores[1] > scores[1]:
            continue
            
#         ------------------------------------------------------------------------------------------------------------------------------------------------------
        
#         Check if the movie is available for free or subscription on any selected_platforms
        
        platforms = {}
        has_a_selected_platform = False
        
        platforms_soup = soup.select(".affiliate__link")

        for match in platforms_soup:
            key = match["data-affiliate"]
            
#             Skip this movie if it's currently in theatres
            if key == "showtimes":
                continue
            
            value = match.p.string.lower()
            
            
            if selected_pay == 'n' and key in selected_platforms and (value == 'free' or value == 'subscription'):  # have to 1) create sel_plat and 2) create dict
                has_a_selected_platform = True
                platforms.update({platform_dict_soup_to_printable[key]:value})
                
            elif selected_pay == 'y':
                has_a_selected_platform = True
                platforms.update({platform_dict_soup_to_printable[key]:value})
                
            else:
                continue
                
            
#         Skip it if none of the available platforms is a user-selected platform 
        if not has_a_selected_platform:
            continue
            
            
            
#         ------------------------------------------------------------------------------------------------------------------------------------------------------

#         Scrape release date (rel), genre (gen), and length (len)

        rel_gen_len_soup = soup.select(".scoreboard__info")
        rel_gen_len_pattern = r"(\d{4}), *? (.*?),*? (\d*h* *\d+?m)"
        rel_gen_len_match = re.search(rel_gen_len_pattern,rel_gen_len_soup[0].text)

        release_date = int(rel_gen_len_match.group(1))
        
        if selected_release_date[0] > release_date or selected_release_date[1] < release_date:
#             if selected_release_date[0] > release_date:
#                 print("wrong critic score")
#             else:
#                 print("wrong aud score")
            continue

        genre = rel_gen_len_match.group(2)

        length = rel_gen_len_match.group(3)
        
        length_int = hours_to_minutes(length)
        
        if length_int < selected_length[0] or length_int > selected_length[1]:
            continue
            
            
            
#         ------------------------------------------------------------------------------------------------------------------------------------------------------

#         Filter out selected language
#         This section could probably be improved using things like .classname.p


        og_lang = ''
        count = 0
        ready = False
        for match in soup.select(".meta-row"):
            for child in match.children:
                if not ready:
                    try:
        #                 print(child.text)
                        if child.text == "Original Language:":
                            ready = True
                    except:
                        pass
                else:
                    count += 1
                    if count == 2:
                        try:
                            og_lang = child.text
                        except:
                            pass
                        
        if og_lang == '':
            og_lang = "not listed"
        
        if selected_og_lang == 'English':
            if og_lang == "not listed":
                pass
            elif og_lang[0:7] != 'English':
                continue
            else:
                pass

            
        
# We've made it this far, so the movie is definitely being added to the search results.
#         Scrape title
        title_soup = soup.select('title')
        title_pattern = r"(.+) - Rotten Tomatoes"
        title_match = re.search(title_pattern,title_soup[0].text)

        title = title_match.group(1)
        

#         Create movie object
        movie = Movie(title, genre, scores, og_lang, release_date, length, platforms, movie_url)
    
#         Add it to movie list
        movie_list.append(movie)
        movies_added += 1
        print("+1")
        
        
    clear_output()

    if movies_added < selected_recommendations:
        print(f"Sorry, we were unable to find {selected_recommendations} movies.\n")
        
    if movies_added == 1:
        print(f"Here is {movies_added} movie that matches your search criteria!"+"\n"*2)
    else:
        print(f"Here are {movies_added} movies that match your search criteria!"+"\n"*2)
    
    for mov in movie_list:
        print(mov)
        print("\n")
        
        
    

In [361]:
enhanced_search()

I would like to be shown movies that are available for free from (or with a subscription to) the platforms below: 

Amazon Prime   Hulu           Netflix        Disney+     ESPN Plus      
HBO Go         HBO Max        HBO Now        Peacock     Vudu           
Apple TV       iTunes         XFINITY        E! Online      

e.g. Enter something like "Netflix, Amazon Prime, Vudu"  (but exclude the quotation marks)



AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# For copy-paste purposes:

amazon prime, hulu, netflix, disney+, hbo max, peacock, vudu

## Notes for improvement:   keep track of minimum movie score scanned for each 