# Regression Open-Ended Project

-----

# Previous Notebooks

- Web Scraping
- Cleaning data
- Exploratory Data Analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import pickle

In [3]:
ebert_imdb_df = pickle.load(open('../data/interim/ebert_imdb_df_v1.pkl', 'rb'))

In [4]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,,82.0,Like all great films that nudge the world towa...


In [5]:
def check_foreign(row):
    try:
        country = row['Country']

        if country in ['USA', 'UK', 'Canada']:
            return 0
        else:
            return 1
    except:
        return np.nan
    
def user_critic_ratio(row):
    try:
        ratio = row['User_Review_Count'] / row['Critic_Review_Count']
        return ratio
    except:
        return np.nan
    
def description_length(row):
    try:
        length = len(row['Description'].split())
        return length
    except:
        return np.nan
    
def review_length(row):
    try:
        length = len(row['Review'].split())
        return length
    except:
        return np.nan
    
def convert_season(row):
    try:
        day = row['Release_Date'].timetuple().tm_yday
        
        spring = range(80, 172)
        summer = range(172, 264)
        fall = range(264, 355)

        if day in spring:
            season = 'Spring'
        elif day in summer:
            season = 'Summer'
        elif day in fall:
            season = 'Fall'
        else:
            season = 'Winter'
            
        return season
    except:
        return np.nan

In [6]:
ebert_imdb_df['Foreign'] = ebert_imdb_df.apply(lambda x: check_foreign(x), 1)
ebert_imdb_df['UC_Ratio'] = ebert_imdb_df.apply(lambda x: user_critic_ratio(x), 1)
ebert_imdb_df['Description_Len'] = ebert_imdb_df.apply(lambda x: description_length(x), 1)
ebert_imdb_df['Review_Len'] = ebert_imdb_df.apply(lambda x: review_length(x), 1)
ebert_imdb_df['Season'] = ebert_imdb_df.apply(lambda x: convert_season(x), 1)

## Convert MPAA Rating and Season to Numeric

In [7]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,...,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,...,2017,/reviews/menashe-2017,,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,...,2017,/reviews/detroit-2017,R,143.0,"Watching ""Detroit,"" the latest film directed b...",0,0.076923,45,1490.0,Summer
3,Brigsby Bear,7.6,316.0,"Directed by Dave McCary. With Mark Hamill, Cl...",69.0,4.0,27.0,"Comedy, Drama","Mark Hamill,, Claire Danes,, Kyle Mooney",Dave McCary,...,2017,/reviews/brigsby-bear-2017,PG-13,100.0,"Released in the wake of Comic-Con, it’s imposs...",1,0.148148,41,862.0,Fall
4,The Incredible Jessica James,6.4,178.0,Directed by Jim Strouse. With Lakeith Stanfie...,71.0,,27.0,Comedy,"Lakeith Stanfield,, Chris O'Dowd,, Noël Wells",Jim Strouse,...,2017,/reviews/the-incredible-jessica-james-2017,,85.0,"In the opening credits sequence of ""The Incred...",0,,33,1067.0,Summer


In [8]:
ebert_imdb_df['Rating'].unique()

array(['PG', '', 'R', 'PG-13', 'NR', 'G', 'NC-17', 'Unrated', 'TV', 'PG13',
       'Not rated', 'No MPAA rating', 'PG-13&#8206;', 'No rating',
       'No MPAA Rating', '.', 'g PG-13', 'R,', ': R', 'PG- 13', nan], dtype=object)

In [9]:
mpaa_fix = {'': 'Unrated',
            'TV': 'Unrated',
            'NR': 'Unrated',
            'Not rated': 'Unrated',
            'No MPAA rating': 'Unrated',
            'No rating': 'Unrated',
            'No MPAA Rating': 'Unrated',
            '.': 'Unrated',
            'PG13': 'PG-13',
            'PG-13&#8206;': 'PG-13',
            'g PG-13': 'PG-13',
            'PG- 13': 'PG-13',
            'R,': 'R',
            ': R': 'R',
            'X': 'NC-17'}

In [10]:
for i, rating in ebert_imdb_df['Rating'].iteritems():
    if rating in mpaa_fix.keys():
        better_name = mpaa_fix.get(rating)
        ebert_imdb_df.set_value(i, 'Rating', better_name)
        
ebert_imdb_df['Rating'].unique()

array(['PG', 'Unrated', 'R', 'PG-13', 'G', 'NC-17', nan], dtype=object)

In [16]:
df_rating = pd.get_dummies(ebert_imdb_df['Rating'])
df_season = pd.get_dummies(ebert_imdb_df['Season'])
ebert_imdb_df = pd.concat([ebert_imdb_df, df_season, df_rating], axis=1)

In [17]:
ebert_imdb_df.columns

Index(['Title', 'IMDB_Rating', 'Rating_Count', 'Description', 'Metascore',
       'User_Review_Count', 'Critic_Review_Count', 'Genre_List', 'Stars_List',
       'Director', 'Country', 'Release_Date', 'EbertStars', 'Year', 'URL',
       'Rating', 'Runtime', 'Review', 'Foreign', 'UC_Ratio', 'Description_Len',
       'Review_Len', 'Season', 'Fall', 'Spring', 'Summer', 'Winter', 'G',
       'NC-17', 'PG', 'PG-13', 'R', 'Unrated'],
      dtype='object')

In [18]:
ebert_imdb_df.head(3)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,...,0,0,1,0,0,0,1,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,...,0,0,1,0,0,0,0,0,0,1
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,...,0,0,1,0,0,0,0,0,1,0


In [19]:
ebert_imdb_df.shape

(9500, 33)

In [20]:
# pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df_v2.pkl', 'wb'))

# Convert Genres to Numerical

In [16]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v2.pkl', 'rb'))

In [21]:
genre_array = [genre_list.split(",") for genre_list in ebert_imdb_df['Genre_List']]
unique_genres = {genre.strip() for genres in genre_array for genre in genres}
unique_genres.discard('')
unique_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [22]:
for genre in sorted(unique_genres):
    ebert_imdb_df[genre] = np.zeros(len(ebert_imdb_df), dtype=int)

In [23]:
pd.set_option("display.max_columns", 150)
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,USA,2017-08-04,2.0,2017,/reviews/detroit-2017,R,143.0,"Watching ""Detroit,"" the latest film directed b...",0,0.076923,45,1490.0,Summer,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Brigsby Bear,7.6,316.0,"Directed by Dave McCary. With Mark Hamill, Cl...",69.0,4.0,27.0,"Comedy, Drama","Mark Hamill,, Claire Danes,, Kyle Mooney",Dave McCary,Australia,2017-09-21,2.5,2017,/reviews/brigsby-bear-2017,PG-13,100.0,"Released in the wake of Comic-Con, it’s imposs...",1,0.148148,41,862.0,Fall,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Incredible Jessica James,6.4,178.0,Directed by Jim Strouse. With Lakeith Stanfie...,71.0,,27.0,Comedy,"Lakeith Stanfield,, Chris O'Dowd,, Noël Wells",Jim Strouse,USA,2017-07-28,3.0,2017,/reviews/the-incredible-jessica-james-2017,Unrated,85.0,"In the opening credits sequence of ""The Incred...",0,,33,1067.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# fill the genres with 1's
for idx, row in ebert_imdb_df.iterrows():
    for genre in row['Genre_List'].split(", "):
        if genre != '':
            ebert_imdb_df.ix[idx, genre] = 1

In [25]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,USA,2017-08-04,2.0,2017,/reviews/detroit-2017,R,143.0,"Watching ""Detroit,"" the latest film directed b...",0,0.076923,45,1490.0,Summer,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Brigsby Bear,7.6,316.0,"Directed by Dave McCary. With Mark Hamill, Cl...",69.0,4.0,27.0,"Comedy, Drama","Mark Hamill,, Claire Danes,, Kyle Mooney",Dave McCary,Australia,2017-09-21,2.5,2017,/reviews/brigsby-bear-2017,PG-13,100.0,"Released in the wake of Comic-Con, it’s imposs...",1,0.148148,41,862.0,Fall,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Incredible Jessica James,6.4,178.0,Directed by Jim Strouse. With Lakeith Stanfie...,71.0,,27.0,Comedy,"Lakeith Stanfield,, Chris O'Dowd,, Noël Wells",Jim Strouse,USA,2017-07-28,3.0,2017,/reviews/the-incredible-jessica-james-2017,Unrated,85.0,"In the opening credits sequence of ""The Incred...",0,,33,1067.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
ebert_imdb_df = ebert_imdb_df.drop(['Genre_List'], axis='columns')

In [27]:
# pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df_v3.pkl', 'wb'))
# pickle.dump(unique_genres, open('../data/interim//unique_genres.pkl', 'wb'))

## Convert Directors to Numerical

In [24]:
ebert_imdb_df = pickle.load(open('../data/interim/ebert_imdb_df_v3.pkl', 'rb'))

In [28]:
print(len(ebert_imdb_df.Director.unique()))
print((ebert_imdb_df.Director.value_counts() >= 2).value_counts()[1])

4469
1567


Directors that appear in a single movie don't add predictive value. As we can see above, removing them from the dataset will let us reduce the number of directors dummy variables from over 4400 to under 1600.

In [29]:
series = ebert_imdb_df['Director'].value_counts() >= 2
relevant_directors = series[series].index.values

In [30]:
for director in sorted(relevant_directors):
    ebert_imdb_df[director] = np.zeros(len(ebert_imdb_df), dtype=int)

In [31]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Aaron Norris,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,...,Tom Mankiewicz,Tom McCarthy,Tom McGrath,Tom McLoughlin,Tom Shadyac,Tom Six,Tom Tykwer,Tom Vaughan,Tomas Alfredson,Tomm Moore,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Scott,Tran Anh Hung,Trey Edward Shults,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Victor Nunez,Victor Salva,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincenzo Natali,Vitaliy Manskiy,Volker Schlöndorff,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William Dear,William Eubank,William Friedkin,William Gazecki,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yves Robert,Zach Braff,Zack Snyder,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# fill the directors with 1's
for idx, row in ebert_imdb_df.iterrows():
    director = row['Director']
    if director != '' and director in relevant_directors:
        ebert_imdb_df.ix[idx, director] = 1

In [33]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Aaron Norris,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,...,Tom Mankiewicz,Tom McCarthy,Tom McGrath,Tom McLoughlin,Tom Shadyac,Tom Six,Tom Tykwer,Tom Vaughan,Tomas Alfredson,Tomm Moore,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Scott,Tran Anh Hung,Trey Edward Shults,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Victor Nunez,Victor Salva,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincenzo Natali,Vitaliy Manskiy,Volker Schlöndorff,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William Dear,William Eubank,William Friedkin,William Gazecki,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yves Robert,Zach Braff,Zack Snyder,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
ebert_imdb_df[ebert_imdb_df.Director == 'Gore Verbinski']['Gore Verbinski']

296     1
2523    1
3261    1
4510    1
5175    1
5797    1
Name: Gore Verbinski, dtype: int64

In [35]:
ebert_imdb_df = ebert_imdb_df.drop(['Director'], axis='columns')

In [36]:
# pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df_v4.pkl', 'wb'))

## Convert Actors to Numerical

In [34]:
ebert_imdb_df = pickle.load(open('../data/interim/ebert_imdb_df_v4.pkl', 'rb'))

In [37]:
stars_array = [stars_list.split(",,") for stars_list in ebert_imdb_df['Stars_List']]
unique_stars = {star.strip() for stars in stars_array for star in stars}
unique_stars.discard('')
unique_stars = pd.Series(sorted(unique_stars))

In [38]:
# count how many appearances each star has
stars_appearances  = dict.fromkeys(unique_stars, 0)

for idx, row in ebert_imdb_df.iterrows():
    for star in row['Stars_List'].split(",, "):
        if star != '':
            stars_appearances[star] += 1

stars_appearances

{'Celeste Corcoran': 1,
 'Michael MacRae': 1,
 'Alfre Woodard': 8,
 'Hadas Yaron': 2,
 'Gloria Allred': 1,
 'Emayatzy Corinealdi': 3,
 'Sarah-Jane Potts': 1,
 'Shawn Farmer': 1,
 'Vittorio Gassman': 3,
 'Dre Pahich': 1,
 'Baby Peggy': 1,
 'Jason Behr': 1,
 'Gérard Jugnot': 2,
 'Judy Greer': 4,
 'Alexandre Belin': 1,
 'Ryan Lenz': 1,
 'André Szymanski': 1,
 'Caio Blat': 1,
 'Roxanne Pallett': 1,
 'Omar Epps': 6,
 'Kristyan Ferrer': 1,
 'Trine Dyrholm': 2,
 'Fabrizio Ferracane': 1,
 'David Chang': 1,
 'Robert Sean Leonard': 5,
 'Charlie Creed-Miles': 1,
 'Nilbio Torres': 1,
 'Samantha': 1,
 'BoA': 1,
 'Kika Mirylees': 1,
 'Bert Convy': 1,
 'Guillaume Verdier': 1,
 'Lexi Atkins': 1,
 "Liza D'Agostino": 1,
 'Maggie Grace': 9,
 'Golo Euler': 1,
 'Lise Roy': 1,
 'Marta Mazurek': 1,
 'Florence Hoath': 2,
 'Jayson Blair': 2,
 'Deepika Padukone': 4,
 '50 Cent': 3,
 'Maria Ouspenskaya': 1,
 'David Irving': 1,
 'Bing He': 1,
 'Brendan Cowell': 1,
 'Loreto Aravena': 1,
 'Derek Hough': 1,
 'Francin

In [39]:
stars_appearances_df = pd.DataFrame.from_dict(stars_appearances, orient='index')
stars_appearances_df.columns = ['Appearances']
stars_appearances_df

Unnamed: 0,Appearances
Celeste Corcoran,1
Michael MacRae,1
Alfre Woodard,8
Hadas Yaron,2
Gloria Allred,1
Emayatzy Corinealdi,3
Sarah-Jane Potts,1
Shawn Farmer,1
Vittorio Gassman,3
Dre Pahich,1


In [40]:
print(len(stars_appearances_df))
print(len(stars_appearances_df[stars_appearances_df.Appearances >= 2]))

11050
3374


As with directors, stars that only appear in a single movie in the dataset don't provide predictive value. Removing them allows us to reduce the number of stars dummy variables from over 11 thousand to under 3400.

In [41]:
relevant_actors = stars_appearances_df[stars_appearances_df.Appearances >= 2].index.values
relevant_actors

array(['Alfre Woodard', 'Hadas Yaron', 'Emayatzy Corinealdi', ...,
       'Alex Descas', 'Chao Deng', 'Nicole Kidman'], dtype=object)

In [42]:
for actor in sorted(relevant_actors):
    ebert_imdb_df[actor] = np.zeros(len(ebert_imdb_df), dtype=int)

In [43]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Aaron Norris,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,...,Will Poulter,Will Smith,Will Yun Lee,Willem Dafoe,William Atherton,William Baldwin,William Devane,William Fichtner,William Forsythe,William H. Macy,William Holden,William Hurt,William Katt,William Moseley,William Petersen,William Sadler,William Shatner,Willie Nelson,Wilmer Valderrama,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wunmi Mosaku,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yi Huang,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvonne Strahovski,Yôko Maki,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"T.J. Miller,, James Corden,, Anna Faris",USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
# fill actors with 1's
for idx, row in ebert_imdb_df.iterrows():
    for actor in row['Stars_List'].split(",, "):
        if actor != '' and actor in relevant_actors:
            ebert_imdb_df.ix[idx, actor] = 1

In [45]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Aaron Norris,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,...,Will Poulter,Will Smith,Will Yun Lee,Willem Dafoe,William Atherton,William Baldwin,William Devane,William Fichtner,William Forsythe,William H. Macy,William Holden,William Hurt,William Katt,William Moseley,William Petersen,William Sadler,William Shatner,Willie Nelson,Wilmer Valderrama,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wunmi Mosaku,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yi Huang,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvonne Strahovski,Yôko Maki,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"T.J. Miller,, James Corden,, Anna Faris",USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",USA,2017-07-28,4.0,2017,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
print(ebert_imdb_df.iloc[0]['Will Smith'])
print(ebert_imdb_df.iloc[0]['Dane DeHaan'])

0
0


In [47]:
ebert_imdb_df = ebert_imdb_df.drop(['Stars_List'], axis='columns')

In [48]:
# pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df_v5.pkl', 'wb'))

## Create buckets for decades

In [65]:
ebert_imdb_df = pickle.load(open('../data/interim/ebert_imdb_df_v5.pkl', 'rb'))

In [66]:
ebert_imdb_df['Year'] = ebert_imdb_df['Year'].apply(pd.to_numeric, args=('coerce',))

In [67]:
ebert_imdb_df = ebert_imdb_df.dropna(subset=['Release_Date', 'Year'])
ax = ebert_imdb_df.Year.hist(bins=range(1920,2020, 10), figsize=(17, 8))

In [68]:
decade_buckets = range(1920, 2020, 10)
for decade in decade_buckets:
    ebert_imdb_df[decade] = np.zeros(len(ebert_imdb_df), dtype=int)

In [69]:
# fill decades with 1's
for idx, row in ebert_imdb_df.iterrows():
    decade_idx = int((row['Year'] - 1920) // 10)
    ebert_imdb_df.ix[idx, decade_buckets[decade_idx]] = 1

In [70]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 58,Aaron Blaise,Aaron Katz,Aaron Norris,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam Leon,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Dunbar,Adrian Lyne,Adrian Shergold,Adrián García Bogliano,...,William Holden,William Hurt,William Katt,William Moseley,William Petersen,William Sadler,William Shatner,Willie Nelson,Wilmer Valderrama,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wrenn Schmidt,Wu Jiang,Wunmi Mosaku,Wyatt Russell,Xander Berkeley,Xavier Samuel,Xun Zhou,Yada Beener,Yannick Renier,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yi Huang,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yuliya Aug,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvonne Strahovski,Yôko Maki,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Lister-Jones,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,USA,2017-07-28,0.5,2017.0,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...,0,3.666667,27,1144.0,Summer,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,USA,2017-07-28,4.0,2017.0,/reviews/menashe-2017,Unrated,82.0,Like all great films that nudge the world towa...,0,0.047619,47,803.0,Summer,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [72]:
pickle.dump(ebert_imdb_df, open('../data/processed/ebert_imdb_final.pkl', 'wb'))

# Plan for Following Notebooks

- More Exploratory Data Analysis
- Making predictions
- Final analysis