# Regression Open-Ended Project

-----

# Previous Notebooks

- Web Scraping
- Cleaning data
- Exploratory Data Analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Feature Engineering

In [2]:
%matplotlib inline
%pylab
import pandas as pd
import numpy as np
import pickle

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [3]:
ebert_imdb_df = pickle.load(open('../data/interim/ebert_imdb_df.pkl', 'rb'))

In [4]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,NR,134.0,“Mudbound” is all about perception. How it can...


In [5]:
def check_foreign(row):
    try:
        country = row['Country']

        if country in ['USA', 'UK', 'Canada']:
            return 0
        else:
            return 1
    except:
        return np.nan
    
def user_critic_ratio(row):
    try:
        ratio = row['User_Review_Count'] / row['Critic_Review_Count']
        return ratio
    except:
        return np.nan
    
def description_length(row):
    try:
        length = len(row['Description'].split())
        return length
    except:
        return np.nan
    
def review_length(row):
    try:
        length = len(row['Review'].split())
        return length
    except:
        return np.nan
    
def convert_season(row):
    try:
        day = row['Release_Date'].timetuple().tm_yday
        
        spring = range(80, 172)
        summer = range(172, 264)
        fall = range(264, 355)

        if day in spring:
            season = 'Spring'
        elif day in summer:
            season = 'Summer'
        elif day in fall:
            season = 'Fall'
        else:
            season = 'Winter'
            
        return season
    except:
        return np.nan

In [6]:
ebert_imdb_df['Foreign'] = ebert_imdb_df.apply(lambda x: check_foreign(x), 1)
ebert_imdb_df['UC_Ratio'] = ebert_imdb_df.apply(lambda x: user_critic_ratio(x), 1)
ebert_imdb_df['Description_Len'] = ebert_imdb_df.apply(lambda x: description_length(x), 1)
ebert_imdb_df['Review_Len'] = ebert_imdb_df.apply(lambda x: review_length(x), 1)
ebert_imdb_df['Season'] = ebert_imdb_df.apply(lambda x: convert_season(x), 1)

## Convert MPAA Rating and Season to Numeric

In [7]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,...,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,...,2017.0,/reviews/mudbound-2017,NR,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,...,2017.0,/reviews/justice-league-2017,PG-13,120.0,For a film about a band of heroes trying to st...,0,2.479021,43,1242,Fall
3,Wonder,8.0,1579.0,Directed by Stephen Chbosky. With Jacob Tremb...,67.0,22.0,55.0,Drama,"Jacob Tremblay,, Owen Wilson,, Izabela Vidovic",Stephen Chbosky,...,2017.0,/reviews/wonder-2017,PG,113.0,Based on the R.J. Palacio novel of the same na...,0,0.4,49,828,Fall
4,Mr. Roosevelt,6.8,116.0,"Directed by Noël Wells. With Noël Wells, Nick...",73.0,,55.0,Comedy,"Noël Wells,, Nick Thune,, Britt Lower",Noël Wells,...,2017.0,/reviews/mr-roosevelt-2017,NR,90.0,Emily Martin (Noël Wells) doesn't quite know h...,0,,48,1118,Fall


In [8]:
ebert_imdb_df['Rating'].unique()

array(['R', 'NR', 'PG-13', 'PG', '', 'G', 'NC-17', 'Unrated', 'TV', 'PG13',
       'Not rated', 'No MPAA rating', 'PG-13&#8206;', 'No rating',
       'No MPAA Rating', '.', 'g PG-13', 'R,', ': R', 'PG- 13'], dtype=object)

In [9]:
mpaa_fix = {'': 'Unrated',
            'TV': 'Unrated',
            'NR': 'Unrated',
            'Not rated': 'Unrated',
            'No MPAA rating': 'Unrated',
            'No rating': 'Unrated',
            'No MPAA Rating': 'Unrated',
            '.': 'Unrated',
            'PG13': 'PG-13',
            'PG-13&#8206;': 'PG-13',
            'g PG-13': 'PG-13',
            'PG- 13': 'PG-13',
            'R,': 'R',
            ': R': 'R',
            'X': 'NC-17'}

In [10]:
for i, rating in ebert_imdb_df['Rating'].iteritems():
    if rating in mpaa_fix.keys():
        better_name = mpaa_fix.get(rating)
        ebert_imdb_df.set_value(i, 'Rating', better_name)
        
ebert_imdb_df['Rating'].unique()

array(['R', 'Unrated', 'PG-13', 'PG', 'G', 'NC-17'], dtype=object)

In [11]:
df_rating = pd.get_dummies(ebert_imdb_df['Rating'])
df_season = pd.get_dummies(ebert_imdb_df['Season'])
ebert_imdb_df = pd.concat([ebert_imdb_df, df_season, df_rating], axis=1)

In [12]:
ebert_imdb_df.columns

Index(['Title', 'IMDB_Rating', 'Rating_Count', 'Description', 'Metascore',
       'User_Review_Count', 'Critic_Review_Count', 'Genre_List', 'Stars_List',
       'Director', 'Country', 'Release_Date', 'EbertStars', 'Year', 'URL',
       'Rating', 'Runtime', 'Review', 'Foreign', 'UC_Ratio', 'Description_Len',
       'Review_Len', 'Season', 'Fall', 'Spring', 'Summer', 'Winter', 'G',
       'NC-17', 'PG', 'PG-13', 'R', 'Unrated'],
      dtype='object')

In [13]:
ebert_imdb_df.head(3)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,...,1,0,0,0,0,0,0,0,1,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,...,1,0,0,0,0,0,0,0,0,1
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,...,1,0,0,0,0,0,0,1,0,0


In [14]:
ebert_imdb_df.shape

(9504, 33)

# Convert Genres to Numerical

In [15]:
genre_array = [genre_list.split(",") for genre_list in ebert_imdb_df['Genre_List']]
unique_genres = {genre.strip() for genres in genre_array for genre in genres}
unique_genres.discard('')
unique_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [16]:
for genre in sorted(unique_genres):
    ebert_imdb_df[genre] = np.zeros(len(ebert_imdb_df), dtype=int)

In [17]:
pd.set_option("display.max_columns", 150)
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,USA,2017-11-17,3.0,2017.0,/reviews/justice-league-2017,PG-13,120.0,For a film about a band of heroes trying to st...,0,2.479021,43,1242,Fall,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Wonder,8.0,1579.0,Directed by Stephen Chbosky. With Jacob Tremb...,67.0,22.0,55.0,Drama,"Jacob Tremblay,, Owen Wilson,, Izabela Vidovic",Stephen Chbosky,USA,2017-11-17,3.0,2017.0,/reviews/wonder-2017,PG,113.0,Based on the R.J. Palacio novel of the same na...,0,0.4,49,828,Fall,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Mr. Roosevelt,6.8,116.0,"Directed by Noël Wells. With Noël Wells, Nick...",73.0,,55.0,Comedy,"Noël Wells,, Nick Thune,, Britt Lower",Noël Wells,USA,2017-11-22,3.0,2017.0,/reviews/mr-roosevelt-2017,Unrated,90.0,Emily Martin (Noël Wells) doesn't quite know h...,0,,48,1118,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# fill the genres with 1's
for idx, row in ebert_imdb_df.iterrows():
    for genre in row['Genre_List'].split(", "):
        if genre != '':
            ebert_imdb_df.ix[idx, genre] = 1

In [19]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,USA,2017-11-17,3.0,2017.0,/reviews/justice-league-2017,PG-13,120.0,For a film about a band of heroes trying to st...,0,2.479021,43,1242,Fall,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Wonder,8.0,1579.0,Directed by Stephen Chbosky. With Jacob Tremb...,67.0,22.0,55.0,Drama,"Jacob Tremblay,, Owen Wilson,, Izabela Vidovic",Stephen Chbosky,USA,2017-11-17,3.0,2017.0,/reviews/wonder-2017,PG,113.0,Based on the R.J. Palacio novel of the same na...,0,0.4,49,828,Fall,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Mr. Roosevelt,6.8,116.0,"Directed by Noël Wells. With Noël Wells, Nick...",73.0,,55.0,Comedy,"Noël Wells,, Nick Thune,, Britt Lower",Noël Wells,USA,2017-11-22,3.0,2017.0,/reviews/mr-roosevelt-2017,Unrated,90.0,Emily Martin (Noël Wells) doesn't quite know h...,0,,48,1118,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
ebert_imdb_df = ebert_imdb_df.drop(['Genre_List'], axis='columns')

In [21]:
pickle.dump(unique_genres, open('../data/interim/unique_genres.pkl', 'wb'))

## Convert Directors to Numerical

In [22]:
print(len(ebert_imdb_df.Director.unique()))
print((ebert_imdb_df.Director.value_counts() >= 2).value_counts()[1])

4523
1564


Directors that appear in a single movie don't add predictive value. As we can see above, removing them from the dataset will let us reduce the number of directors dummy variables from over 4400 to under 1600.

In [23]:
series = ebert_imdb_df['Director'].value_counts() >= 2
relevant_directors = series[series].index.values

In [24]:
for director in sorted(relevant_directors):
    ebert_imdb_df[director] = np.zeros(len(ebert_imdb_df), dtype=int)

In [25]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,...,Tom Tykwer,Tom Vaughan,Tomas Alfredson,Tomm Moore,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Scott,Tran Anh Hung,Travis Cluff,Trey Edward Shults,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Victor Nunez,Victor Salva,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincenzo Natali,Vitaliy Manskiy,Vittorio De Sica,Volker Schlöndorff,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William Dear,William Eubank,William Friedkin,William Gazecki,William H. Macy,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yvan Attal,Yves Robert,Yvette Lee Bowser,Zach Braff,Zack Snyder,Zak Hilditch,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
# fill the directors with 1's
for idx, row in ebert_imdb_df.iterrows():
    director = row['Director']
    if director != '' and director in relevant_directors:
        ebert_imdb_df.ix[idx, director] = 1

In [27]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,...,Tom Tykwer,Tom Vaughan,Tomas Alfredson,Tomm Moore,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Scott,Tran Anh Hung,Travis Cluff,Trey Edward Shults,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Victor Nunez,Victor Salva,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincenzo Natali,Vitaliy Manskiy,Vittorio De Sica,Volker Schlöndorff,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William Dear,William Eubank,William Friedkin,William Gazecki,William H. Macy,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yvan Attal,Yves Robert,Yvette Lee Bowser,Zach Braff,Zack Snyder,Zak Hilditch,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
ebert_imdb_df = ebert_imdb_df.drop(['Director'], axis='columns')

## Convert Actors to Numerical

In [29]:
stars_array = [stars_list.split(",,") for stars_list in ebert_imdb_df['Stars_List']]
unique_stars = {star.strip() for stars in stars_array for star in stars}
unique_stars.discard('')
unique_stars = pd.Series(sorted(unique_stars))

In [30]:
# count how many appearances each star has
stars_appearances  = dict.fromkeys(unique_stars, 0)

for idx, row in ebert_imdb_df.iterrows():
    for star in row['Stars_List'].split(",, "):
        if star != '':
            stars_appearances[star] += 1

stars_appearances

{'Matthew Goode': 10,
 'Julian Barratt': 2,
 'Phillips Holmes': 1,
 'Karina Frederico': 1,
 'C.S. Lee': 1,
 'Jing Wu': 2,
 'Christopher Hewett': 1,
 'Scott Bigelow': 1,
 'Max Elliott Slade': 1,
 'Penélope Cruz': 26,
 'Jeanne Balibar': 5,
 'Jerry Adler': 2,
 'Maggie Grace': 9,
 'Barry Pepper': 5,
 'Tom Johnigarn': 1,
 'Timothy Vahle': 1,
 'Stacey Glick': 1,
 'Martin McCann': 2,
 'Joaquin Phoenix': 26,
 'Cortney Palm': 1,
 'Jodi Benson': 2,
 'Owen Campbell': 2,
 'Stephan James': 1,
 'François Arnaud': 1,
 'Andy Whitfield': 2,
 'Greg Dykstra': 1,
 'Fred A. Leuchter Jr.': 1,
 'Paul Bagget': 1,
 'Brian Kelly': 1,
 'Phillipe Coquet': 1,
 'Alan Bennett': 1,
 'Craig Gallivan': 1,
 'Simon Bossell': 1,
 'Puti Sri Candra Dewi': 1,
 'Riley Polanski': 1,
 'Francesco Carnelutti': 1,
 'Ben Konigsberg': 1,
 'Adrian Purcarescu': 1,
 'John Lyle': 1,
 'Timm Sharp': 1,
 'Laurel Holloman': 2,
 'Matthew Zuk': 1,
 'Jim Davidson': 1,
 'Adam Coleman Howard': 1,
 'Stanley Tucci': 13,
 'Jude Swanberg': 1,
 'Seam

In [31]:
stars_appearances_df = pd.DataFrame.from_dict(stars_appearances, orient='index')
stars_appearances_df.columns = ['Appearances']
stars_appearances_df

Unnamed: 0,Appearances
Matthew Goode,10
Julian Barratt,2
Phillips Holmes,1
Karina Frederico,1
C.S. Lee,1
Jing Wu,2
Christopher Hewett,1
Scott Bigelow,1
Max Elliott Slade,1
Penélope Cruz,26


In [32]:
print(len(stars_appearances_df))
print(len(stars_appearances_df[stars_appearances_df.Appearances >= 2]))

11116
3348


As with directors, stars that only appear in a single movie in the dataset don't provide predictive value. Removing them allows us to reduce the number of stars dummy variables from over 11 thousand to under 3400.

In [33]:
relevant_actors = stars_appearances_df[stars_appearances_df.Appearances >= 2].index.values
relevant_actors

array(['Matthew Goode', 'Julian Barratt', 'Jing Wu', ...,
       'Anthony Okungbowa', 'Anne Hathaway', 'Guillaume Depardieu'], dtype=object)

In [34]:
for actor in sorted(relevant_actors):
    ebert_imdb_df[actor] = np.zeros(len(ebert_imdb_df), dtype=int)

In [35]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,Adrianne Palicki,...,Will Ferrell,Will Forte,Will Oldham,Will Patton,Will Poulter,Will Smith,Will Yun Lee,Willa Fitzgerald,Willem Dafoe,William Atherton,William Baldwin,William Devane,William Fichtner,William Forsythe,William Holden,William Hurt,William Moseley,William Petersen,William Ragsdale,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu Xia,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yvonne Strahovski,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Élodie Bouchez,Émilie Dequenne
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Armie Hammer,, Timothée Chalamet,, Michael Stu...",USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# fill actors with 1's
for idx, row in ebert_imdb_df.iterrows():
    for actor in row['Stars_List'].split(",, "):
        if actor != '' and actor in relevant_actors:
            ebert_imdb_df.ix[idx, actor] = 1

In [37]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,Adrianne Palicki,...,Will Ferrell,Will Forte,Will Oldham,Will Patton,Will Poulter,Will Smith,Will Yun Lee,Willa Fitzgerald,Willem Dafoe,William Atherton,William Baldwin,William Devane,William Fichtner,William Forsythe,William Holden,William Hurt,William Moseley,William Petersen,William Ragsdale,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu Xia,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yvonne Strahovski,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Élodie Bouchez,Émilie Dequenne
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Armie Hammer,, Timothée Chalamet,, Michael Stu...",USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
ebert_imdb_df = ebert_imdb_df.drop(['Stars_List'], axis='columns')

## Create buckets for decades

In [39]:
ebert_imdb_df['Year'] = ebert_imdb_df['Year'].apply(pd.to_numeric, args=('coerce',))
ebert_imdb_df['Runtime'] = ebert_imdb_df['Runtime'].apply(pd.to_numeric, args=('coerce',))

In [40]:
ebert_imdb_df = ebert_imdb_df.dropna(subset=['Release_Date', 'Year'])
# ax = ebert_imdb_df.Year.hist(bins=range(1920,2020, 10), figsize=(17, 8))

In [41]:
decade_buckets = range(1920, 2020, 10)
for decade in decade_buckets:
    ebert_imdb_df[decade] = np.zeros(len(ebert_imdb_df), dtype=int)

In [42]:
# fill decades with 1's
for idx, row in ebert_imdb_df.iterrows():
    decade_idx = int((row['Year'] - 1920) // 10)
    ebert_imdb_df.ix[idx, decade_buckets[decade_idx]] = 1

In [43]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 58,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,...,William Baldwin,William Devane,William Fichtner,William Forsythe,William Holden,William Hurt,William Moseley,William Petersen,William Ragsdale,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu Xia,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yvonne Strahovski,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Élodie Bouchez,Émilie Dequenne,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...,0,0.364486,47,1176,Fall,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,Unrated,134.0,“Mudbound” is all about perception. How it can...,0,0.360656,42,1536,Fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Export to pickle

In [44]:
pickle.dump(ebert_imdb_df, open('../data/processed/ebert_imdb_final_df.pkl', 'wb'))

# Plan for Following Notebooks

- More Exploratory Data Analysis
- Making predictions
- Final analysis