In [1]:
import pickle
import pandas as pd
import numpy as np
import datetime as dt
import re

In [2]:
# Load the movie DataFrame from disk. The context manager closes the file
# handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open('clean_movie_df.pickle', 'rb') as pickle_in:
    movie_df = pickle.load(pickle_in)

In [3]:
#read holidays csv (must contain a 'Date' column)
holidays = pd.read_csv('usholidays.csv')

#convert all dates to datetime/timestamp
#Bracket assignment replaces the column outright. `.loc[:, 'Date'] = ...` tries
#to write into the existing column in place on pandas >= 2.0, which can leave it
#as object dtype, and attribute assignment (`df.col = ...`) only works while the
#column already exists.
movie_df['release'] = pd.to_datetime(movie_df['release'])
holidays['Date'] = pd.to_datetime(holidays['Date'])

#filter out holiday dates before the year 2000
#From here on `holidays` is the Series of dates that find_closest_holiday loops over.
holidays = holidays.loc[holidays['Date'] >= dt.datetime(2000, 1, 1), 'Date']

def find_closest_holiday(date, holiday_dates=None):
    """
    Finds the days to the nearest US Federal Holiday
    args:
    date (timestamp)
    holiday_dates (iterable of timestamps, optional) holidays to search;
        defaults to the notebook-level `holidays` series
    returns:
    nearest_hol (int) days to the nearest holiday, capped at 365
        (365 is also returned when no holiday is closer than that or
        when there are no holidays to search)
    """
    if holiday_dates is None:
        holiday_dates = holidays

    # Upper bound on the reported distance; only strictly smaller
    # distances can replace it.
    max_distance = 365
    distances = (abs((hol - date).days) for hol in holiday_dates)
    return min((d for d in distances if d < max_distance), default=max_distance)

#days from each release date to its nearest holiday
movie_df['holiday_distance'] = movie_df['release'].map(find_closest_holiday)

In [4]:
#preview the first five rows, including the new holiday_distance column
movie_df.head(n=5)

Unnamed: 0,index,link_stub,release,theaters,desc,distr,opening,budget,mpaa,runtime,...,gross_world,Director,Writer,Producer,Composer,Cinematographer,Editor,Production Designer,Actors,holiday_distance
1663,Sin City,/release/rl3027338753/?ref_=bo_yld_table_32,2005-04-01,3230,A movie that explores the dark and miserable t...,Dimension Films,28100000,40000000,R,124.0,...,158733820,"Frank Miller, Quentin Tarantino, Robert Rodriguez","Frank Miller, Robert Rodriguez, Frank Miller",Elizabeth Avellan,"John Debney, Graeme Revell, Robert Rodriguez",Robert Rodriguez,Robert Rodriguez,,"Mickey Rourke, Clive Owen, Bruce Willis, Jessi...",39
1704,Source Code,/release/rl4033906177/?ref_=bo_yld_table_60,2011-04-01,2971,A soldier wakes up in someone else's body and ...,Summit Entertainment,14812094,32000000,PG-13,93.0,...,147332697,Duncan Jones,Ben Ripley,"Mark Gordon, Philippe Rousselet, Jordan Wynn",Chris Bacon,Don Burgess,Paul Hirsch,Barry Chusid,"Jake Gyllenhaal, Michelle Monaghan, Vera Farmi...",39
961,Insidious,/release/rl3428550145/?ref_=bo_yld_table_63,2011-04-01,2419,A family looks to prevent evil spirits from tr...,FilmDistrict,13271464,1500000,PG-13,103.0,...,99557032,James Wan,Leigh Whannell,"Jason Blum, Oren Peli, Steven Schneider",Joseph Bishara,"David M. Brewer, John R. Leonetti","Kirk M. Morri, James Wan",Aaron Sims,"Patrick Wilson, Rose Byrne, Ty Simpkins, Lin S...",39
881,Hop,/release/rl391874049/?ref_=bo_yld_table_25,2011-04-01,3616,"E.B., the Easter Bunny's teenage son, heads to...",Universal Pictures,37543710,63000000,PG,95.0,...,183953723,Tim Hill,"Cinco Paul, Ken Daurio, Brian Lynch, Cinco Pau...","Michele Imperato, Christopher Meledandri",Christopher Lennertz,Peter Lyons Collister,"Peter S. Elliot, Gregory Perler",Richard Holland,"Russell Brand, James Marsden, Elizabeth Perkin...",39
2479,Woman in Gold,/release/rl981108225/?ref_=bo_yld_table_77,2015-04-01,2011,"Maria Altmann, an octogenarian Jewish refugee,...",The Weinstein Company,2091551,11000000,PG-13,109.0,...,61619773,Simon Curtis,"Alexi Kaye Campbell, E. Randol Schoenberg, Mar...","David M. Thompson, Kris Thykier","Martin Phipps, Hans Zimmer",Ross Emery,Peter Lambert,Jim Clay,"Helen Mirren, Ryan Reynolds, Daniel Brühl, Kat...",44


In [5]:
def dummies(df=None):
    """
    Adds indicator (dummy) columns for genres and distributor.
    args:
    df (DataFrame, optional) frame with a comma-separated `genres` column
        and a `distr` column; defaults to the notebook-level `movie_df`
    returns:
    (DataFrame) new frame with the indicator columns appended. The input
        frame is not modified, so assign the result:
        movie_df = dummies(movie_df)
    """
    # The previous body assigned to `movie_df` inside the function, which made
    # the name local and raised UnboundLocalError on its first read.
    if df is None:
        df = movie_df

    #Get dummies from Genres
    genre_dummies = df['genres'].str.get_dummies(sep=',')
    with_genres = df.merge(genre_dummies, left_index=True, right_index=True)

    #Get dummies from movie distributor
    distr_dummies = pd.get_dummies(with_genres['distr'])
    return with_genres.merge(distr_dummies, left_index=True, right_index=True)

In [7]:
#looking for movies crediting three or more directors
#(more than two comma-separated names; a `> 1` threshold would also include pairs)
#shows a small subset
movie_df.loc[
    movie_df['Director'].str.split(pat=',').map(len) > 2,
    'Director',
]

1663    Frank Miller, Quentin Tarantino, Robert Rodriguez
624                Jeff Schaffer, Alec Berg, David Mandel
1264    Elizabeth Banks, Steven Brill, Steve Carr, Rus...
308           Mark Andrews, Brenda Chapman, Steve Purcell
504               Kyle Balda, Pierre Coffin, Eric Guillon
1170             Eric Darnell, Tom McGrath, Conrad Vernon
2191          Bibo Bergeron, Don Paul, Jeffrey Katzenberg
1655          Andrew Adamson, Kelly Asbury, Conrad Vernon
826            George Miller, Warren Coleman, Judy Morris
1256            Pete Docter, David Silverman, Lee Unkrich
1438           Jorge Blanco, Javier Abad, Marcos Martínez
1644           Bibo Bergeron, Vicky Jenson, Rob Letterman
912     Gabriel Riva Palacio Alatriste, Rodolfo Riva P...
Name: Director, dtype: object

In [8]:
#movies crediting three or more cinematographers (more than two comma-separated names)
movie_df.loc[
    movie_df['Cinematographer'].str.split(pat=',').map(len) > 2,
    'Cinematographer',
]

1264    Mattian Anderssonn Rudh, Frank G. DeMarco, Ste...
996        Lance Bangs, Dimitry Elyashkevich, Rick Kosick
997        Lance Bangs, Dimitry Elyashkevich, Rick Kosick
Name: Cinematographer, dtype: object

In [10]:
#movies crediting three or more composers (more than two comma-separated names)
movie_df.loc[
    movie_df['Composer'].str.split(pat=',').map(len) > 2,
    'Composer',
]

1663         John Debney, Graeme Revell, Robert Rodriguez
1057    Marius De Vries, Ilan Eshkeri, Henry Jackman, ...
1324              Anthony Gonzalez, M83, Joseph Trapanese
1444    Shinji Miyazaki, Ralph Schuckett, Hirokazu Tanaka
1788    Thomas Bangalter, Guy-Manuel De Homem-Christo,...
108         Larry Paxton, Marty Stuart, Kristin Wilkinson
1678       Joseph L. Altruda, Venus Brown, Justin Stanley
2042             Reinhold Heil, Johnny Klimek, Tom Tykwer
1415          John Flansburgh, John Linnell, Joel McNeely
1864            Atticus Ross, Leopold Ross, Claudia Sarne
272     Harry Gregson-Williams, Atticus Ross, Leopold ...
325             Atticus Ross, Leopold Ross, Claudia Sarne
1264    Tyler Bates, Christophe Beck, Leo Birenberg, W...
116     Reinhard Besser, Oliver Lieb, Bernd Wendlandt,...
1185          Benny Andersson, Anne Dudley, Björn Ulvaeus
74           Deke Dickerson, Andrew Feltenstein, John Nau
1725    John Debney, Danny Elfman, Harry Gregson-Willi...
880           

In [11]:
#rename gross_inter -> gross_intl; reassign instead of inplace=True so the cell
#reads as "new frame from old" and stays chainable
movie_df = movie_df.rename(columns={'gross_inter': 'gross_intl'})

In [12]:
#Average gross income per Director value, as a measure of how successful the director is.
#Grouping is on the full Director string, so a co-directing team forms its own group.
director_average = (
    movie_df
    .groupby('Director', as_index=False)[['gross_dom', 'gross_intl', 'gross_world']]
    .mean()
)

In [13]:
#display the per-Director mean of gross_dom / gross_intl / gross_world
director_average

Unnamed: 0,Director,gross_dom,gross_intl,gross_world
0,"Aaron Horvath, Peter Rida Michail",29790236.0,2.230000e+07,5.209024e+07
1,"Aaron Seltzer, Jason Friedberg",48548426.0,3.720061e+07,8.574903e+07
2,"Abby Kohn, Marc Silverstein",48795601.0,4.574382e+07,9.453943e+07
3,Adam McKay,115671763.8,2.913685e+07,1.448086e+08
4,Adam Robitel,62375465.5,9.942337e+07,1.617988e+08
...,...,...,...,...
1016,"Yarrow Cheney, Scott Mosier",270620950.0,2.409750e+08,5.115960e+08
1017,Yimou Zhang,49625424.5,2.065393e+08,2.561647e+08
1018,Zach Braff,45018541.0,3.990000e+07,8.491854e+07
1019,Zack Snyder,155802746.0,2.035224e+08,3.593251e+08


In [14]:
#Count movies per Director value.
#Name the key and count columns explicitly so the result is always
#['Director', 'n_movies_directed']. Since pandas 2.0 value_counts() names its
#result 'count' and puts the column name on the index, so renaming
#'index'/'Director' after reset_index would relabel the wrong columns.
n_movies_directed = (
    movie_df['Director']
    .value_counts()
    .rename_axis('Director')
    .reset_index(name='n_movies_directed')
)

In [15]:
#the 'index' column holds the movie title; reassign instead of inplace=True so
#the cell reads as "new frame from old" and stays chainable
movie_df = movie_df.rename(columns={'index': 'title'})

In [21]:
#Attach the per-Director movie count to every movie.
#Dropping any existing n_movies_directed column first keeps the cell safe to
#re-run (a second run would otherwise produce n_movies_directed_x / _y).
#validate= fails loudly if Director keys on the right are not unique.
#Still an inner join: rows whose Director has no entry in the counts are dropped.
movie_df = (
    movie_df
    .drop(columns='n_movies_directed', errors='ignore')
    .merge(n_movies_directed, on='Director', validate='many_to_one')
)

In [22]:
#preview the first five rows after adding n_movies_directed
movie_df.head(n=5)

Unnamed: 0,title,link_stub,release,theaters,desc,distr,opening,budget,mpaa,runtime,...,Director,Writer,Producer,Composer,Cinematographer,Editor,Production Designer,Actors,holiday_distance,n_movies_directed
0,Sin City,/release/rl3027338753/?ref_=bo_yld_table_32,2005-04-01,3230,A movie that explores the dark and miserable t...,Dimension Films,28100000,40000000,R,124.0,...,"Frank Miller, Quentin Tarantino, Robert Rodriguez","Frank Miller, Robert Rodriguez, Frank Miller",Elizabeth Avellan,"John Debney, Graeme Revell, Robert Rodriguez",Robert Rodriguez,Robert Rodriguez,,"Mickey Rourke, Clive Owen, Bruce Willis, Jessi...",39,1
1,Source Code,/release/rl4033906177/?ref_=bo_yld_table_60,2011-04-01,2971,A soldier wakes up in someone else's body and ...,Summit Entertainment,14812094,32000000,PG-13,93.0,...,Duncan Jones,Ben Ripley,"Mark Gordon, Philippe Rousselet, Jordan Wynn",Chris Bacon,Don Burgess,Paul Hirsch,Barry Chusid,"Jake Gyllenhaal, Michelle Monaghan, Vera Farmi...",39,2
2,Warcraft,/release/rl1064863233/?ref_=bo_yld_table_66,2016-06-10,3406,As an Orc horde invades the planet Azeroth usi...,Universal Pictures,24166110,160000000,PG-13,123.0,...,Duncan Jones,"Charles Leavitt, Duncan Jones","Stuart Fenegan, Alex Gartner, Jon Jashni, Char...",Ramin Djawadi,Simon Duggan,Paul Hirsch,Gavin Bocquet,"Travis Fimmel, Paula Patton, Ben Foster, Domin...",11,2
3,Insidious,/release/rl3428550145/?ref_=bo_yld_table_63,2011-04-01,2419,A family looks to prevent evil spirits from tr...,FilmDistrict,13271464,1500000,PG-13,103.0,...,James Wan,Leigh Whannell,"Jason Blum, Oren Peli, Steven Schneider",Joseph Bishara,"David M. Brewer, John R. Leonetti","Kirk M. Morri, James Wan",Aaron Sims,"Patrick Wilson, Rose Byrne, Ty Simpkins, Lin S...",39,7
4,Furious 7,/release/rl1045661185/?ref_=bo_yld_table_5,2015-04-03,4022,Deckard Shaw seeks revenge against Dominic Tor...,Universal Pictures,147187040,190000000,PG-13,137.0,...,James Wan,"Chris Morgan, Gary Scott Thompson","Vin Diesel, Michael Fottrell, Neal H. Moritz",Brian Tyler,"Marc Spicer, Stephen F. Windon","Leigh Folsom Boyd, Dylan Highsmith, Kirk M. Mo...",Bill Brzeski,"Vin Diesel, Paul Walker, Dwayne Johnson, Jason...",46,7


In [23]:
#persist the processed frame for later use
with open('processed_movie_df.pickle', 'wb') as pickle_out:
    pickle.dump(movie_df, pickle_out)