In [3]:
import pandas as pd
import numpy as np
import sklearn as sk
import string
import omdb
from omdb import OMDBClient
from bs4 import BeautifulSoup

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import r2_score,mean_squared_error

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO 

import urllib.parse
import requests
import json
import re
import time
import datetime
from collections import Counter
from reader import feed



In [3]:
client = omdb.OMDBClient(apikey='9b6f6a00')

First I import the IMDB dataset that I downloaded here: https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset?select=IMDb+movies.csv

Thank you stefanoleone992!

In [4]:
movies = pd.read_csv("movies_with_awards.csv")

In [5]:
movies = movies[movies["year"].astype(int) >= 1999]

In [6]:
movies.shape

(47988, 319)

As you can see here, the entire dataset consists of over 85,000 movies. However, I am going to just look at films from the past twenty years, as they are likely to be more indicative of how to predict oscar winners in the present and will have more Rotten Tomatoes reviews--which will be a crucial data point in my logistic regression. The data set has a variety of data on each film; including language, genres, directors, actors, and more. You may notice, however, there are some important data points missing--whether or not each film won an oscar and it's rotten tomatoes results. This gives me the opportunity to scrape that data myself.

To append the award data to this dataset, I am going to use the omdb API. Essentially, it's an unofficial IMDB api, but it works well. Since I have the IMDB id for each film, I will simply query the website with the id, and grab the "award" field of the data. I will clean it up using regular expressions.

In [5]:
def get_award(imdbId):
    try:
        return client.imdbid(imdbId)["awards"]
    except:
        return "Not found"

Here I actually run the get_award function to create an 'awards' column

In [None]:
movies["awards"] = movies.iloc[:,0].apply(lambda x : get_award(x))

And now I use a regular expression to extract the actual number of Oscars it won. Then I fill all other non-winners with a 0 and convert every value in this column to an integer since I was previously working with strings and regular expressions.

In [None]:
movies["oscars_won"] = movies[movies["awards"].str.contains(r'Won [1-9]* Oscar', regex=True)]["awards"].str.extract('(\d+)')
movies["oscars_won"] = movies["oscars_won"].fillna(0)
movies["oscars_won"] = movies["oscars_won"].astype(int)

I will now remove the 'awards' column since I have extracted the data I need from it.

In [None]:
movies = movies.drop("awards", axis =1)

I would also like to pull all Rotten Tomatoes scores for the films as well as how many rotten tomatoes reviews they received. I will also build a function to scrape these review. But first, in order to make sure that the review data is not affected by whether or not a film was nominated for an oscar or awarded an oscar; I will need to import a very small dataset that notates when the oscar nominations were announced for each year.

In [8]:
nominee_announcements = pd.read_csv("oscars_nominee_announcements.csv")

We'll convert the date strings in the 'Date' column of this small dataset into datetime objects.

In [9]:
nominee_announcements['Date'] = pd.to_datetime(nominee_announcements['Date'])

We'll actually subtract the year by one so that it can be easily matched with films. For example, the 2020 Oscars were actually awarding movies from 2019, so although the Oscars were in 2020, we want to easily be able to match this date with movies released in 2019.

In [11]:
nominee_announcements["Year"] = nominee_announcements["Year"] - 1

Quickly rename the columns to be more congruent with the column names of the movies dataset.

In [12]:
nominee_announcements = nominee_announcements.rename(columns={'Year' : 'year', 'Date' : 'nom_date'})

Here we'll merge the nomination dataset with the movies dataset, so that the nomination date is easily accessible for the Rotten Tomatoes scraping we'll be doing.

In [13]:
movies = movies.merge(nominee_announcements, how='left', left_on = 'year', right_on = 'year')

Create some helper columns that will also make the Rotten Tomatoes scraping easier. We will create a column for a search term that combines the film title and director name. This will allow us to find the film we want with a fair amount of accuracy since adding the director name will prevent us accidentally searching and retrieving any films with the same title as the one we are searching. As a part of creating this search term, I've used the urllib library to encoding the spaces and other non-url-friendly characters to url encoding. However, the exception here is the space which Rotten Tomatoes search function handles as the characters '%20' and urllib encodes as '+' so I've replaced any occurence of '+' in the resulting encoded string with '%20' using the pandas string method 'replace'. 

In [15]:
movies['tomatoes_search'] = movies['title'] + " " + movies['director']
movies['tomatoes_search'] = movies['tomatoes_search'].apply(lambda x : urllib.parse.quote_plus(str(x)))
movies['tomatoes_search'] = movies['tomatoes_search'].str.replace("\+", "%20")


And then just to making things even more simple when it comes time to apply a lambda function, I am zipping the nom_date and tomatoes_search columns into the same helper column.

In [None]:
movies['review_helper'] = list(zip(movies['tomatoes_search'], movies['nom_date']))

In [16]:
# This function will retrieve the reviews from rotten tomatoes as a list consisting of the words fresh and rotten. 
# The number of instances of each word will indicate how many fresh and rotten reviews there are respectively.
# This function takes the parameters 'reviews_soup', the BeautifulSoup object of the reviews page, which it will 
# receive from the fresh_list function that actually handles the initial scraping of the page. It also takes the parameter
# nom_date so it can pull only reviews that happened before the nominations.

def get_review_list(reviews_soup, nom_date):
    
    # Creates the list where the words 'rotten' and 'fresh' will be stored
    
    rating_list = []

    # Uses BeautifulSoup to find all of the elements on the page that constitute the review summaries.
    
    reviews = reviews_soup.find_all(class_='row review_table_row')
    
    # It cycles through all of these review summaries and retrieves the rating of each review only if it was released before
    # the film was nominated for an Oscar.

    for review_num in range(0, len(reviews)):
        review_row = reviews_soup.find_all(class_='row review_table_row')[review_num]
        
        # I use the get_review_date function here to get the date of the review.
        
        review_date = get_review_date(reviews_soup, review_row)
        
        # Here, the function determines whether the review date was before the nomination date. If it was, then it pulls the
        # rating "fresh" or "rotten" from that review. Otherwise, the function simply passes and skips the row.
        
        if review_date < nom_date:
            rating_list.append(get_review_rating(reviews_soup, review_row))
        else:
            pass
    
    return rating_list

In [17]:
# Returns the date from the review by using regex to extract the date from the BeautifulSoup object and converting it to a 
# datetime object

def get_review_date(page_soup, review_row): 
    date = review_row.find(class_="review-date subtle small").text
    date_pat = r'[A-Z][a-z]{1,9}[ ][0-9]{1,2}[,][ ][0-9]{4}'
    date = re.findall(date_pat, date)[0]
    date = datetime.datetime.strptime(date, '%B %d, %Y')
    return date

# Returns the rating of a review as a string by finding the icon title for the review and just returning the part of the name
# of the icon that matters--'fresh' or 'rotten'.
    
def get_review_rating(page_soup, review_row):

    rating = review_row.find(class_=lambda class_: class_ and class_.startswith("review_icon icon small"))
    rating = rating["class"][3]
    return rating

In [9]:
# This is the overall handling function that initializes the web scrape, finds the appropriate film, and then pulls the
# html as a BeautifulSoup object to feed into the other functions that will actually pull the reviews.

def fresh_list(search_uri, nom_date):
    
    # Here I use an error handling exception to make sure I don't timeout because Rotten Tomatoes wants me to slow down with my
    # scraping. I have found that this keeps the crawler moving with just a few exceptions cropping up throughout the crawl
    # First I simply try accessing the url.
    
    try:
        search_url = requests.get('https://www.rottentomatoes.com/search?search=' + search_uri)
    
    # If there's some kind of error, the program just waits 15 seconds and tries again. If it still doesn't wait, it waits 20
    # more seconds and tries one last time. At this point, if it fails, I should just stop because Rotten Tomatoes clearly wants
    # that of me; but with this handling, I didn't have any problems with that.
    
    except:
        try:
            print("Too many requests.")
            time.sleep(15)
            search_url = requests.get('https://www.rottentomatoes.com/search?search=' + search_uri)
        except:
            print("Try again. Too many requests.")
            time.sleep(20)
            search_url = requests.get('https://www.rottentomatoes.com/search?search=' + search_uri)
                    
    
    # I'll implement another error handling exception just so that--if for any reason the html is not suitable for handling
    # (e.g. it is essentially empty of the necessary elements or has no reviews) then it will be skipped and return NaN
    
    try:
        
        # I use the BeautifulSoup library in order to easily parse the html from the webpage

        soup = BeautifulSoup(search_url.content, 'html.parser')
        
        # I specify that I want the list of search results from the page

        html = json.loads(soup.find(id='movies-json').text)

        # I retrieve the url of the first search result, which I can hope is the film I was searching for and create a url
        # to take us to the reviews page based on the Rotten Tomatoes format
        
        reviewpage = requests.get(html['items'][0]['url'] + "/reviews?type=&sort=&page=1")
        
        # I then create a BeautifulSoup object for the reviews page itself

        soup = BeautifulSoup(reviewpage.content, 'html.parser')
        
        # I grab the text "Page 1 of x" fron this page so that I will be able to extract how many pages I will need to cycle
        # through
        
        page_number = soup.find(class_='pageInfo')
        
        # I create an empty list variable for the storing of the rotten/fresh reviews
        
        fresh_rotten_list = []

        # This if statement just checks to make sure there is more than one page based on the page_number 
        
        if page_number is not None:
            # Uses regex to extract the second number aka the total number of pages. Also converts it to an integer.
            page_number = page_number.text
            page_pat = '(?<=of )[0-9]{1,2}'
            final_page = re.findall(page_pat, page_number)[0]
            final_page = int(final_page)
            
            # This for loop just iterates through the pages using the 'final_page' variable and grabs the rotten/fresh scores
            # from each page using the get_review_list function, appending each resulting list to the overall fresh_rotten_list

            for page in range(1, final_page+1):
                reviewpage = requests.get(html['items'][0]['url'] + "/reviews?type=&sort=&page=" + str(page))
                soup = BeautifulSoup(reviewpage.content, 'html.parser')
                fresh_rotten_list.append(get_review_list(soup, nom_date))
        
        # If there is no page indicator, this means either there are no reviews or only one page of reviews. In this case,
        # I execute a error handling exception that tries to use the get_review_list function to return the list of review
        # results from the one page, but if the review table is inaccessible (i.e. there are no reviews) then it simply returns
        # a NaN value rather than a review list.
        
        else:
            try:
                fresh_rotten_list.append(get_review_list(soup, nom_date))
            except:
                fresh_rotten_list.append(None)
    except:
        return None
    
    # If all went well, the function will return a list with a certain number of 'fresh' and 'rotten' strings to be counted
    # later via vectorized functions on the dataframe
    
    return fresh_rotten_list

Now at long last, I will pull the Rotten Tomatoes results, passing in the search uri and the nomination date which I have already added via helper columns to my dataframe. This naturally takes quite a long time as there are quite a lot of films to pull the reviews for, and it has to search for each one individually and then parse the reviews individually to make sure they were posted before the nomination date. This took around 60 hours for me all said.

In [32]:
movies['rt_list'] = movies['review_helper'].apply(lambda x : fresh_list(x[0], x[1]))

7367.9953337
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
Too many requests. Gotta rest for about five seconds!
Just 10 more seconds coach--sorry about this!
157124.9187782


Any fresh/rotten list with more than one page will look something like [['fresh', 'rotten', 'rotten', 'fresh], ['fresh', 'fresh']], my point being that they are lists within lists, so first I will collapse that using a list comprehension. The list comprehension includes an if statement to make sure that the item in question is in fact a list and not a NaN value.

In [44]:
movies['rt_list'] = movies['rt_list'].apply(lambda x : [item for sublist in x for item in sublist] if type(x) is list else None)

Now finally I can actually count how many fresh ratings there are and how many are rotten using a Counter function within a lambda function. This will simply count them up and return a dictionary (e.g. {'fresh': 23, 'rotten' : 5})

In [53]:
movies['rt_list'] = movies['rt_list'].apply(lambda x : Counter(x))

And then I will create a separate column that simply divides the number of fresh over the total number of reviews to get the Rotten Tomatoes score. Note at this point, that I could not have just grabbed the fresh rating directly off each page because the rating would have included reviews that may have been written after the Oscars were already issued which would have sullied the independence of the variable from the variable I am trying to predict.

In [64]:
movies['rt_score'] = movies['rt_list'].apply(lambda x : x['fresh'] / (x['fresh'] + x['rotten']) if len(x) >= 1 else None)

TypeError: object of type 'float' has no len()

I would like to build a random forest regression to predict how many oscars a given film will win based on a variety of factors that I think could be good indicators of an award-winning movie. Genre is an aspect that I think would be important in this prediction. For instance, anecdotally, I am aware that horror films are taken less seriously and are less likely to win awards.

However, the data--as it stands--is not properly formatted. Genre exists as list for each film. I am going to use binary encoding to give the regression model a way to quantify this categorical data.

First, I am going to use the strip function to remove the white space on either side of the genres. 

In [10]:
cleaned_genre = movies.set_index(['imdb_title_id'])["genre"].str.split(',',expand=True).stack()

In [13]:
cleaned_genre = cleaned_genre.apply(lambda x : x.strip())

Now I will actually begin the binary encoding process, using the get_dummies function to turn these lists into categorical data. This creates a new dataframe just containing each film by its id and binary indications of whether or not it is in a certain language.

I will now use the get_dummies method in order to binary encode the genres, using the prefix 'g' to note that each column represents a genre. I then merge it with the full dataframe.

In [16]:
genre_encoded = pd.get_dummies(cleaned_genre, prefix='g').groupby(level=0).sum().reset_index()

In [17]:
movies = movies.merge(genre_encoded, left_on=["imdb_title_id"], right_on=["imdb_title_id"])

Now I will take a look at the awards column. The simple webscraper I executed just pulled all of the awards that each film won, usually in sentence structure format (e.g. "Won 2 Oscars. Another 112 wins & 103 nominations.") Rather than slow down the webscraper by extracting the number of Oscars as it went, I opted to wait until now to use a regular expression to extract the number representing how many Oscars the film won. I put this in separate column "oscars_won"

Obviously this will yield NaN values for any films that did not contain the regular expression (and thus did not win Oscars), so I will go ahead and fill those NaN values with 0.

There is one movie that is oddly labeled with "TV Movie 2019" rather than an actual date published. Not only will this interfere with the conversion of dates to datetime objects, I'm also not particularly interested in having a TV movie mixed in with this dataset. So I will go ahead and just drop that row.

In [314]:
tvMovieIndex = movies[movies["date_published"]=="TV Movie 2019"].index

In [315]:
movies = movies.drop([tvMovieIndex[0], tvMovieIndex[0]])

Now I will convert the date published column--currently a string--to datetime objects. I would also like to use the month a movie came out rather than the date in order to categorize the films more neatly, so I will create a "months_published" column from the converted "date_published" column.

In [24]:
movies["date_published"] = pd.to_datetime(movies["date_published"])

In [None]:
I would like to investigate whether a film is American as a possible indicator of whether or not it will win an Oscar.

In [26]:
movies["country"] = movies["country"].fillna('None')

In [29]:
movies["American"] = movies["country"].str.contains("USA")

In [30]:
movies["American"] = movies["American"].fillna(0)

Below I will reset the index in order to prevent any errors

In [32]:
movies.reset_index(drop=True)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,g_Romance,g_Sci-Fi,g_Sport,g_Thriller,g_War,g_Western,oscars_won,month_published,director_cat,American
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,1,0,0,0,0,0,0,10,1129,True
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,0,0,0,0,0,0,0,12,5115,False
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,0,0,0,0,0,0,0,11,5068,True
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,0,0,0,0,0,0,0,3,9814,False
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913-01-01,"Biography, Drama",60,USA,English,Sidney Olcott,...,0,0,0,0,0,0,0,1,29464,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85016,tt9908390,Le lion,Le lion,2020,2020-01-29,Comedy,95,"France, Belgium",French,Ludovic Colbeau-Justin,...,0,0,0,0,0,0,0,1,19400,False
85017,tt9911196,De Beentjes van Sint-Hildegard,De Beentjes van Sint-Hildegard,2020,2020-02-13,"Comedy, Drama",103,Netherlands,"German, Dutch",Johan Nijenhuis,...,0,0,0,0,0,0,0,2,15372,False
85018,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,0,0,0,0,0,0,0,3,32817,False
85019,tt9914286,Sokagin Çocuklari,Sokagin Çocuklari,2019,2019-03-15,"Drama, Family",98,Turkey,Turkish,Ahmet Faik Akinci,...,0,0,0,0,0,0,0,3,493,False


### This chunk will be taken out for publishing ###

In [524]:
movies_twenty_years = pd.read_csv("movies_after_rt-try.csv", encoding='latin-1')

In [525]:
movies_twenty_years["nom"] = movies_twenty_years["nom"].fillna(0)

In [526]:
movies_twenty_years = movies_twenty_years.iloc[0:, 0:55]

In [527]:
movies_twenty_years['title'] = movies_twenty_years['original_title']

In [528]:
characters_to_remove = ["[", "]", ",", '\'']
new_string = movies_twenty_years['rt_list'].iloc[0]
for character in characters_to_remove:
    new_string = new_string.replace(character, "")

In [529]:
list_string = movies_twenty_years['rt_list'].iloc[0]

In [530]:
list_string = list_string.translate({ord('['):None, ord(']'):None, ord('\''):None, ord(','):None})
list_list = list_string.split(" ")

In [531]:
def string_translate(list_string):
    list_string = list_string.translate({ord('['):None, ord(']'):None, ord('\''):None, ord(','):None})
    list_list = list_string.split(" ")
    return list_list

In [532]:
movies_twenty_years['rt_list'] = movies_twenty_years['rt_list'].apply(lambda x : string_translate(x) if isinstance(x, str) else None)

### END ###

In [534]:
movies_twenty_years['rt_num'] = movies_twenty_years['rt_list'].apply(lambda x : Counter(x)['fresh'] + Counter(x)['rotten'])

In [535]:
movies_twenty_years['rt_score'] = movies_twenty_years['rt_list'].apply(lambda x : Counter(x)['fresh'] / (Counter(x)['fresh'] + Counter(x)['rotten']) if (Counter(x)['fresh'] + Counter(x)['rotten']) > 0 else None)

In [536]:
movies_twenty_years['rt_list'].apply(lambda x : [item for sublist in x for item in sublist] if x is list else None)

0        None
1        None
2        None
3        None
4        None
         ... 
47983    None
47984    None
47985    None
47986    None
47987    None
Name: rt_list, Length: 47988, dtype: object

In [537]:
movies_with_rt = movies_twenty_years[movies_twenty_years['rt_score'].notna()]

In [538]:
movies_with_rt = movies_with_rt.sort_values(["rt_num", "rt_score"], ascending=(False, False))
movies_with_rt = movies_with_rt.groupby('year').head(100)

In [539]:
movies_with_rt['budget_float'] = movies_with_rt['budget'].str.extract(r'([0-9]{1,})')

In [540]:
movies_with_rt['budget_float'] = movies_with_rt['budget_float'].astype(float)

In [541]:
movies_with_rt[(movies_with_rt['year']==2019) & (movies_with_rt['oscars_won'] > 0)]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,oscars_won,American,nom_date,tomatoesSearch,review_helper,rt_list,nom,rt_num,rt_score,budget_float
44440,tt7286456,Joker,Joker,2019,2019-10-03,"Crime, Drama, Thriller",122,"USA, Canada",English,Todd Phillips,...,2,True,2020-01-13,Joker%20Todd%20Phillips,"('Joker%20Todd%20Phillips', Timestamp('2020-01...","[, , fresh, fresh, fresh, fresh, fresh, fresh,...",1.0,534,0.689139,55000000.0
44100,tt7131622,Once Upon a Time... in Hollywood,Once Upon a Time... in Hollywood,2019,2019-09-18,"Comedy, Drama",161,"USA, UK, China","English, Italian, Spanish, German",Quentin Tarantino,...,2,True,2020-01-13,C%27era%20una%20volta%20a...%20Hollywood%20Que...,('C%27era%20una%20volta%20a...%20Hollywood%20Q...,"[, rotten, fresh, fresh, fresh, fresh, fresh, ...",1.0,524,0.853053,90000000.0
26216,tt1979376,Toy Story 4,Toy Story 4,2019,2019-06-26,"Animation, Adventure, Comedy",100,USA,English,Josh Cooley,...,1,True,2020-01-13,Toy%20Story%204%20Josh%20Cooley,"('Toy%20Story%204%20Josh%20Cooley', Timestamp(...","[, fresh, fresh, fresh, fresh, fresh, fresh, f...",0.0,423,0.971631,200000000.0
26788,tt2066051,Rocketman,Rocketman,2019,2019-05-29,"Biography, Drama, Music",121,"UK, USA, Canada",English,Dexter Fletcher,...,1,True,2020-01-13,Rocketman%20Dexter%20Fletcher,"('Rocketman%20Dexter%20Fletcher', Timestamp('2...","[, fresh, fresh, fresh, fresh, fresh, rotten, ...",0.0,358,0.891061,40000000.0
30483,tt2584384,Jojo Rabbit,Jojo Rabbit,2019,2020-01-16,"Comedy, Drama, War",108,"USA, New Zealand, Czech Republic","English, German",Taika Waititi,...,1,True,2020-01-13,Jojo%20Rabbit%20Taika%20Waititi,"('Jojo%20Rabbit%20Taika%20Waititi', Timestamp(...","[, , , fresh, fresh, fresh, fresh, fresh, fres...",1.0,353,0.796034,14000000.0
43375,tt6751668,Gisaengchung,Gisaengchung,2019,2019-11-07,"Comedy, Drama, Thriller",132,South Korea,"Korean, English",Bong Joon Ho,...,4,False,2020-01-13,Parasite%20Bong%20Joon%20Ho,"('Parasite%20Bong%20Joon%20Ho', Timestamp('202...","[, , , , , fresh, fresh, fresh, fresh, fresh, ...",1.0,350,0.991429,11400000.0
45172,tt7653254,Marriage Story,Marriage Story,2019,2019-12-06,"Comedy, Drama, Romance",137,"UK, USA","English, Spanish",Noah Baumbach,...,1,True,2020-01-13,Storia%20di%20un%20matrimonio%20Noah%20Baumbach,('Storia%20di%20un%20matrimonio%20Noah%20Baumb...,"[, , fresh, fresh, rotten, fresh, fresh, fresh...",1.0,333,0.954955,18600000.0
33100,tt3281548,Little Women,Little Women,2019,2020-01-09,"Drama, Romance",135,USA,"English, French",Greta Gerwig,...,1,True,2020-01-13,Piccole%20donne%20Greta%20Gerwig,"('Piccole%20donne%20Greta%20Gerwig', Timestamp...","[, , , , fresh, fresh, fresh, fresh, fresh, fr...",1.0,331,0.945619,40000000.0
26012,tt1950186,Ford v Ferrari,Ford v Ferrari,2019,2019-11-14,"Action, Biography, Drama",152,USA,"English, Italian, French, Japanese",James Mangold,...,2,True,2020-01-13,Le%20Mans%20%2766%20-%20La%20grande%20sfida%20...,('Le%20Mans%20%2766%20-%20La%20grande%20sfida%...,"[, fresh, fresh, fresh, fresh, fresh, fresh, f...",1.0,313,0.916933,97600000.0
46597,tt8579674,1917,1917,2019,2020-01-23,"Drama, War",119,"USA, UK, India, Spain, Canada, China","English, French, German",Sam Mendes,...,3,True,2020-01-13,1917%20Sam%20Mendes,"('1917%20Sam%20Mendes', Timestamp('2020-01-13 ...","[, , , , , , , rotten, fresh, fresh, fresh, fr...",1.0,296,0.898649,95000000.0


In [592]:
factors = ['rt_score', 'American', 'g_Action', 'g_Adult',
       'g_Adventure', 'g_Animation', 'g_Biography', 'g_Comedy', 'g_Crime',
       'g_Documentary', 'g_Drama', 'g_Family', 'g_Fantasy', 'g_Film-Noir',
       'g_History', 'g_Horror', 'g_Music', 'g_Musical', 'g_Mystery', 'g_News',
       'g_Reality-TV', 'g_Romance', 'g_Sci-Fi', 'g_Sport', 'g_Thriller']

I'll go ahead and create the train and the test subsets.

In [593]:
X_train, X_test, y_train, y_test = train_test_split(movies_with_rt[factors], movies_with_rt["best_picture"], train_size=0.7,test_size=0.3, random_state=1)

### Here I also try a Binary Descision Tree Classifier--not yet noted in the post ###

In [594]:
classifier = DecisionTreeClassifier()

In [595]:
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [596]:
y_pred = classifier.predict(X_test)

In [597]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [598]:
print(confusion_matrix(y_test, y_pred))

[[624   3]
 [  2   1]]


In [599]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       627
           1       0.25      0.33      0.29         3

    accuracy                           0.99       630
   macro avg       0.62      0.66      0.64       630
weighted avg       0.99      0.99      0.99       630



### END ##

In [601]:
regressor = RandomForestRegressor(n_estimators=60, random_state=0)

Here I fit my dataset to the regressor, generate the y predictions, and then check the results using the root mean squared error. This turned out to be a low ~.02. So I can feel fairly confident that my predictions were correct.

In [602]:
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=60, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [603]:
y_pred = regressor.predict(X_test)

In [604]:
mse = mean_squared_error(y_test, y_pred)

In [605]:
rmse = np.sqrt(mse)

In [606]:
rmse

0.076783772516908

In [607]:
regressor.feature_importances_

array([6.95430993e-01, 2.30556568e-02, 2.04241803e-02, 0.00000000e+00,
       1.17525498e-02, 1.49393858e-03, 2.45880050e-02, 1.55969172e-02,
       4.14096024e-02, 0.00000000e+00, 1.17213009e-02, 0.00000000e+00,
       1.22928148e-02, 0.00000000e+00, 2.15064083e-02, 8.62326875e-04,
       5.46118603e-04, 1.01300151e-02, 6.53656402e-03, 0.00000000e+00,
       0.00000000e+00, 1.81701178e-02, 7.58209168e-03, 9.98051598e-03,
       6.69198827e-02])

In [608]:
['rt_score', 'American', 'g_Action', 'g_Adult',
       'g_Adventure', 'g_Animation', 'g_Biography', 'g_Comedy', 'g_Crime',
       'g_Documentary', 'g_Drama', 'g_Family', 'g_Fantasy', 'g_Film-Noir',
       'g_History', 'g_Horror', 'g_Music', 'g_Musical', 'g_Mystery', 'g_News',
       'g_Reality-TV', 'g_Romance', 'g_Sci-Fi', 'g_Sport', 'g_Thriller',
       'g_War', 'g_Western']

['rt_score',
 'American',
 'g_Action',
 'g_Adult',
 'g_Adventure',
 'g_Animation',
 'g_Biography',
 'g_Comedy',
 'g_Crime',
 'g_Documentary',
 'g_Drama',
 'g_Family',
 'g_Fantasy',
 'g_Film-Noir',
 'g_History',
 'g_Horror',
 'g_Music',
 'g_Musical',
 'g_Mystery',
 'g_News',
 'g_Reality-TV',
 'g_Romance',
 'g_Sci-Fi',
 'g_Sport',
 'g_Thriller',
 'g_War',
 'g_Western']

In [609]:
X_test

Unnamed: 0,rt_score,American,g_Action,g_Adult,g_Adventure,g_Animation,g_Biography,g_Comedy,g_Crime,g_Documentary,...,g_Horror,g_Music,g_Musical,g_Mystery,g_News,g_Reality-TV,g_Romance,g_Sci-Fi,g_Sport,g_Thriller
22925,0.284615,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
21289,0.544218,True,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18283,0.664921,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5306,0.490909,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7722,0.635659,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38532,0.540000,False,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
28979,0.357143,True,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
43475,0.890710,True,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39048,0.931596,True,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [610]:
X_test['preds'] = y_pred

In [611]:
X_test

Unnamed: 0,rt_score,American,g_Action,g_Adult,g_Adventure,g_Animation,g_Biography,g_Comedy,g_Crime,g_Documentary,...,g_Music,g_Musical,g_Mystery,g_News,g_Reality-TV,g_Romance,g_Sci-Fi,g_Sport,g_Thriller,preds
22925,0.284615,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
21289,0.544218,True,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
18283,0.664921,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.0
5306,0.490909,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
7722,0.635659,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38532,0.540000,False,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0.0
28979,0.357143,True,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
43475,0.890710,True,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
39048,0.931596,True,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [612]:
predictions = movies_with_rt.merge(X_test, left_index = True, right_index=True)

In [613]:
predictions

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,g_Music_y,g_Musical_y,g_Mystery_y,g_News_y,g_Reality-TV_y,g_Romance_y,g_Sci-Fi_y,g_Sport_y,g_Thriller_y,preds
44440,tt7286456,Joker,Joker,2019,2019-10-03,"Crime, Drama, Thriller",122,"USA, Canada",English,Todd Phillips,...,0,0,0,0,0,0,0,0,1,0.000000
36285,tt4154796,Avengers: Endgame,Avengers: Endgame,2019,2019-04-24,"Action, Adventure, Drama",181,USA,"English, Japanese, Xhosa, German","Anthony Russo, Joe Russo",...,0,0,0,0,0,0,0,0,0,0.000000
42566,tt6320628,Spider-Man: Far from Home,Spider-Man: Far from Home,2019,2019-07-10,"Action, Adventure, Sci-Fi",129,USA,"English, Italian, Czech",Jon Watts,...,0,0,0,0,0,0,1,0,0,0.000000
47062,tt8946378,Knives Out,Knives Out,2019,2019-12-05,"Comedy, Crime, Drama",130,USA,"English, Spanish, Hindi",Rian Johnson,...,0,0,0,0,0,0,0,0,0,0.316667
18294,tt1213641,First Man,First Man,2018,2018-10-31,"Biography, Drama, History",141,"USA, Japan",English,Damien Chazelle,...,0,0,0,0,0,0,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,tt0137494,Entrapment,Entrapment,1999,1999-09-03,"Action, Crime, Romance",113,"USA, UK, Germany",English,Jon Amiel,...,0,0,0,0,0,1,0,0,0,0.000000
538,tt0161023,Tumbleweeds,Tumbleweeds,1999,2000-03-17,"Comedy, Drama",102,USA,English,Gavin O'Connor,...,0,0,0,0,0,0,0,0,0,0.000000
100,tt0120866,Titus,Titus,1999,2000-03-24,"Drama, History, Thriller",162,"UK, Italy, USA","English, Latin",Julie Taymor,...,0,0,0,0,0,0,0,0,1,0.000000
184,tt0132477,October Sky,October Sky,1999,1999-11-19,"Biography, Drama, Family",108,USA,English,Joe Johnston,...,0,0,0,0,0,0,0,0,0,0.000000


In [614]:
predictions[['title', 'best_picture', 'preds']].sort_values(by='best_picture', ascending=False).head(10)

Unnamed: 0,title,best_picture,preds
30374,Birdman or (The Unexpected Virtue of Ignorance),1,0.0
21492,The King's Speech,1,0.366667
739,American Beauty,1,0.016667
3255,Changing Lanes,0,0.0
6104,A Series of Unfortunate Events,0,0.0
14125,The Switch,0,0.0
5273,Tears of the Sun,0,0.0
23832,The Spectacular Now,0,0.0
22227,The Skeleton Twins,0,0.0
1755,A.I. Artificial Intelligence,0,0.0


In [615]:
predictions[['title', 'best_picture', 'preds']].sort_values(by='preds', ascending=False).head(20)

Unnamed: 0,title,best_picture,preds
42755,The Ballad of Buster Scruggs,0,0.583333
13581,The Town,0,0.416667
21492,The King's Speech,1,0.366667
9469,Hairspray,0,0.35
39672,All the Money in the World,0,0.333333
47062,Knives Out,0,0.316667
25377,Ve stï¿½nu,0,0.283333
2912,The Pianist,0,0.256667
15981,Between Pain and Amen,0,0.233333
47099,De belofte van Pisa,0,0.216667
