In [1]:
from sklearn import datasets
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from collections import Counter
pd.set_option('display.max_columns', 200)

In [2]:
data = pd.read_csv('./train.csv')
data.shape

(3000, 23)

# Genre

In [3]:
for col in ['genres', 'production_companies', 'spoken_languages', 'Keywords', 'cast', 'crew']:
    data[col] = data[col].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x))

In [4]:
list_of_genres = list(data['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [5]:
Counter([i for j in list_of_genres for i in j]).most_common()

[('Drama', 1531),
 ('Comedy', 1028),
 ('Thriller', 789),
 ('Action', 741),
 ('Romance', 571),
 ('Crime', 469),
 ('Adventure', 439),
 ('Horror', 301),
 ('Science Fiction', 290),
 ('Family', 260),
 ('Fantasy', 232),
 ('Mystery', 225),
 ('Animation', 141),
 ('History', 132),
 ('Music', 100),
 ('War', 100),
 ('Documentary', 87),
 ('Western', 43),
 ('Foreign', 31),
 ('TV Movie', 1)]

In [6]:
# data['num_genres'] = data['genres'].apply(lambda x: len(x) if x != {} else 0)
data['all_genres'] = data['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_genres = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(20)]
print(top_genres)

['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Crime', 'Adventure', 'Horror', 'Science Fiction', 'Family', 'Fantasy', 'Mystery', 'Animation', 'History', 'Music', 'War', 'Documentary', 'Western', 'Foreign', 'TV Movie']


In [7]:
for genre in top_genres:
    data['genre_'+ genre] = data['all_genres'].apply(lambda x: 1 if genre in x else 0)
data = data.drop(['genres', 'all_genres'], axis=1)

# Production Companies

In [8]:
list_of_prod_comp = list(data['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_prod_comp for i in j]).most_common(50)

[('Warner Bros.', 202),
 ('Universal Pictures', 188),
 ('Paramount Pictures', 161),
 ('Twentieth Century Fox Film Corporation', 138),
 ('Columbia Pictures', 91),
 ('Metro-Goldwyn-Mayer (MGM)', 84),
 ('New Line Cinema', 75),
 ('Touchstone Pictures', 63),
 ('Walt Disney Pictures', 62),
 ('Columbia Pictures Corporation', 61),
 ('TriStar Pictures', 53),
 ('Relativity Media', 48),
 ('Canal+', 46),
 ('United Artists', 44),
 ('Miramax Films', 40),
 ('Village Roadshow Pictures', 36),
 ('Regency Enterprises', 31),
 ('BBC Films', 30),
 ('Dune Entertainment', 30),
 ('Working Title Films', 30),
 ('Fox Searchlight Pictures', 29),
 ('StudioCanal', 28),
 ('Lionsgate', 28),
 ('DreamWorks SKG', 27),
 ('Fox 2000 Pictures', 25),
 ('Summit Entertainment', 24),
 ('Hollywood Pictures', 24),
 ('Orion Pictures', 24),
 ('Amblin Entertainment', 23),
 ('Dimension Films', 23),
 ('Castle Rock Entertainment', 21),
 ('Epsilon Motion Pictures', 21),
 ('Morgan Creek Productions', 21),
 ('Original Film', 21),
 ('Focus 

In [9]:
# data['num_production_companies'] = data['production_companies'].apply(lambda x: len(x) if x != {} else 0)
data['all_production_companies'] = data['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_prod_comp = [m[0] for m in Counter([i for j in list_of_prod_comp for i in j]).most_common(50)]
for prod_comp in top_prod_comp:
    data['production_company_' + prod_comp] = data['all_production_companies'].apply(lambda x: 1 if prod_comp in x else 0)
data = data.drop(['production_companies', 'all_production_companies'], axis=1)

# Keywords

In [10]:
list_keywords = list(data['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [11]:
# data['number_keywords'] = data['Keywords'].apply(lambda x: len(x) if x != {} else 0)
data['all_keywords'] = data['Keywords'].apply(lambda x: ' '.join(sorted(i['name'] for i in x)) if x != {} else '')
top_keywords = [m[0] for m in Counter([i for j in list_keywords for i in j]).most_common(50)]
print(top_keywords)

['woman director', 'independent film', 'duringcreditsstinger', 'murder', 'based on novel', 'violence', 'sport', 'biography', 'aftercreditsstinger', 'dystopia', 'revenge', 'friendship', 'sex', 'suspense', 'sequel', 'love', 'police', 'teenager', 'nudity', 'female nudity', 'drug', 'prison', 'musical', 'high school', 'los angeles', 'new york', 'family', 'father son relationship', 'kidnapping', 'investigation', 'wedding', '3d', 'detective', 'london england', 'paris', 'based on comic', 'robbery', 'based on true story', 'dying and death', 'escape', 'alien', 'brother brother relationship', 'prostitute', 'suicide', 'rape', 'corruption', 'death', 'superhero', 'new york city', 'martial arts']


In [12]:
for keyword in top_keywords:
    data['keyword_' + keyword] = data['all_keywords'].apply(lambda x: 1 if keyword in x else 0)
data = data.drop(['Keywords', 'all_keywords'], axis=1)

# Revenue Interval

In [13]:
def revenue_interval(x):
    if x <= 500000:
        return '0-500000'
    elif x <= 1000000:
        return '500001-1000000'
    elif x <= 40000000:
        return '1000001-40000000'
    elif x <= 150000000:
        return '40000001-150000000'
    elif x >= 150000000:
        return '>150000000'
data["revenue_interval"] = data['revenue'].apply(revenue_interval)

In [14]:
data.head()

Unnamed: 0,id,belongs_to_collection,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,runtime,spoken_languages,status,tagline,title,cast,crew,revenue,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music,genre_War,genre_Documentary,genre_Western,genre_Foreign,genre_TV Movie,production_company_Warner Bros.,production_company_Universal Pictures,production_company_Paramount Pictures,production_company_Twentieth Century Fox Film Corporation,production_company_Columbia Pictures,production_company_Metro-Goldwyn-Mayer (MGM),production_company_New Line Cinema,production_company_Touchstone Pictures,production_company_Walt Disney Pictures,production_company_Columbia Pictures Corporation,production_company_TriStar Pictures,production_company_Relativity Media,production_company_Canal+,production_company_United Artists,production_company_Miramax Films,production_company_Village Roadshow Pictures,production_company_Regency Enterprises,production_company_BBC Films,production_company_Dune Entertainment,production_company_Working Title Films,production_company_Fox Searchlight Pictures,production_company_StudioCanal,production_company_Lionsgate,production_company_DreamWorks SKG,production_company_Fox 2000 Pictures,production_company_Summit Entertainment,production_company_Hollywood Pictures,production_company_Orion Pictures,production_company_Amblin Entertainment,production_company_Dimension Films,production_company_Castle Rock Entertainment,production_company_Epsilon Motion Pictures,production_company_Morgan Creek Productions,production_company_Original Film,production_company_Focus Features,production_company_Legendary Pictures,production_company_Participant Media,production_company_Blumhouse Productions,production_company_New Regency Pictures,production_company_Film4,production_company_Spyglass Entertainment,production_company_Imagine Entertainment,production_company_Screen Gems,production_company_Millennium Films,production_company_TSG Entertainment,production_company_Lakeshore Entertainment,production_company_France 2 Cinéma,production_company_Silver Pictures,production_company_The Weinstein Company,production_company_PolyGram Filmed Entertainment,keyword_woman director,keyword_independent film,keyword_duringcreditsstinger,keyword_murder,keyword_based on novel,keyword_violence,keyword_sport,keyword_biography,keyword_aftercreditsstinger,keyword_dystopia,keyword_revenge,keyword_friendship,keyword_sex,keyword_suspense,keyword_sequel,keyword_love,keyword_police,keyword_teenager,keyword_nudity,keyword_female nudity,keyword_drug,keyword_prison,keyword_musical,keyword_high school,keyword_los angeles,keyword_new york,keyword_family,keyword_father son relationship,keyword_kidnapping,keyword_investigation,keyword_wedding,keyword_3d,keyword_detective,keyword_london england,keyword_paris,keyword_based on comic,keyword_robbery,keyword_based on true story,keyword_dying and death,keyword_escape,keyword_alien,keyword_brother brother relationship,keyword_prostitute,keyword_suicide,keyword_rape,keyword_corruption,keyword_death,keyword_superhero,keyword_new york city,keyword_martial arts,revenue_interval
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000001-40000000
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40000001-150000000
2,3,,3300000,http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1000001-40000000
3,4,,1200000,http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1000001-40000000
4,5,,0,,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000001-40000000


In [15]:
# data = data.drop(['id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'poster_path', 'production_countries'], axis=1)

In [16]:
# data = data.drop(['release_date', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'cast', 'crew', 'revenue'], axis=1)

In [17]:
# data.head()

# Training

In [18]:
# x = data.drop(['revenue_interval'], axis=1)
# y = data['revenue_interval']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# clf=RandomForestClassifier(n_estimators=100)
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Cast

In [19]:
#cast = list(data['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [20]:
#top_cast_names = [m[0] for m in Counter([i for j in cast for i in j]).most_common(100)]
#print(top_cast_names)

In [21]:
#for name in top_cast_names:
#    data['cast_' + name] = data['cast'].apply(lambda x: 1 if name in str(x) else 0)
#data.head()

In [22]:
data['runtime'] = data['runtime'].fillna(value=0)

In [23]:
def get_release_year(date):
    year = int(date.split('/')[2])
    if int(year) <= 19:
        return 2000 + year
    else:
        return 1900 + year
data['year'] = data['release_date'].apply(lambda x: get_release_year(x))

In [24]:
data = data.drop(['id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'poster_path', 'production_countries'], axis=1)
data = data.drop(['release_date', 'spoken_languages', 'status', 'tagline', 'title', 'cast', 'crew', 'revenue'], axis=1)
data.head()

Unnamed: 0,budget,popularity,runtime,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music,genre_War,genre_Documentary,genre_Western,genre_Foreign,genre_TV Movie,production_company_Warner Bros.,production_company_Universal Pictures,production_company_Paramount Pictures,production_company_Twentieth Century Fox Film Corporation,production_company_Columbia Pictures,production_company_Metro-Goldwyn-Mayer (MGM),production_company_New Line Cinema,production_company_Touchstone Pictures,production_company_Walt Disney Pictures,production_company_Columbia Pictures Corporation,production_company_TriStar Pictures,production_company_Relativity Media,production_company_Canal+,production_company_United Artists,production_company_Miramax Films,production_company_Village Roadshow Pictures,production_company_Regency Enterprises,production_company_BBC Films,production_company_Dune Entertainment,production_company_Working Title Films,production_company_Fox Searchlight Pictures,production_company_StudioCanal,production_company_Lionsgate,production_company_DreamWorks SKG,production_company_Fox 2000 Pictures,production_company_Summit Entertainment,production_company_Hollywood Pictures,production_company_Orion Pictures,production_company_Amblin Entertainment,production_company_Dimension Films,production_company_Castle Rock Entertainment,production_company_Epsilon Motion Pictures,production_company_Morgan Creek Productions,production_company_Original Film,production_company_Focus Features,production_company_Legendary Pictures,production_company_Participant Media,production_company_Blumhouse Productions,production_company_New Regency Pictures,production_company_Film4,production_company_Spyglass Entertainment,production_company_Imagine Entertainment,production_company_Screen Gems,production_company_Millennium Films,production_company_TSG Entertainment,production_company_Lakeshore Entertainment,production_company_France 2 Cinéma,production_company_Silver Pictures,production_company_The Weinstein Company,production_company_PolyGram Filmed Entertainment,keyword_woman director,keyword_independent film,keyword_duringcreditsstinger,keyword_murder,keyword_based on novel,keyword_violence,keyword_sport,keyword_biography,keyword_aftercreditsstinger,keyword_dystopia,keyword_revenge,keyword_friendship,keyword_sex,keyword_suspense,keyword_sequel,keyword_love,keyword_police,keyword_teenager,keyword_nudity,keyword_female nudity,keyword_drug,keyword_prison,keyword_musical,keyword_high school,keyword_los angeles,keyword_new york,keyword_family,keyword_father son relationship,keyword_kidnapping,keyword_investigation,keyword_wedding,keyword_3d,keyword_detective,keyword_london england,keyword_paris,keyword_based on comic,keyword_robbery,keyword_based on true story,keyword_dying and death,keyword_escape,keyword_alien,keyword_brother brother relationship,keyword_prostitute,keyword_suicide,keyword_rape,keyword_corruption,keyword_death,keyword_superhero,keyword_new york city,keyword_martial arts,revenue_interval,year
0,14000000,6.575393,93.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000001-40000000,2015
1,40000000,8.248895,113.0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40000001-150000000,2004
2,3300000,64.29999,105.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1000001-40000000,2014
3,1200000,3.174936,122.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1000001-40000000,2012
4,0,1.14807,118.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000001-40000000,2009


In [25]:
data = data.loc[data['budget'] != 0]

In [26]:
data['budget/year'] = data['budget']/data['year']

In [27]:
x = data.drop(['revenue_interval'], axis=1)
y = data['revenue_interval']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
clf=RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5775729646697388
