In [1]:
import pandas as pd
import sklearn
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer, PowerTransformer, StandardScaler
from sklearn.neural_network import MLPRegressor
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import datetime

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Run time adjustment

In [3]:
train.loc[train['id'] == 1336,'runtime'] = 130
train.loc[train['id'] == 2303,'runtime'] = 80 
train.loc[train['id'] == 391,'runtime'] = 96
train.loc[train['id'] == 592,'runtime'] = 90
train.loc[train['id'] == 925,'runtime'] = 86
train.loc[train['id'] == 978,'runtime'] = 93
train.loc[train['id'] == 1256,'runtime'] = 92 
train.loc[train['id'] == 1542,'runtime'] = 93 
train.loc[train['id'] == 1875,'runtime'] = 93 
train.loc[train['id'] == 2151,'runtime'] = 108
train.loc[train['id'] == 2499,'runtime'] = 86 
train.loc[train['id'] == 2646,'runtime'] = 98 
train.loc[train['id'] == 2786,'runtime'] = 111
train.loc[train['id'] == 2866,'runtime'] = 96 
test.loc[test['id'] == 3244,'runtime'] = 93 
test.loc[test['id'] == 4490,'runtime'] = 90 
test.loc[test['id'] == 4633,'runtime'] = 108
test.loc[test['id'] == 6818,'runtime'] = 90 
test.loc[test['id'] == 4074,'runtime'] = 103
test.loc[test['id'] == 4222,'runtime'] = 91 
test.loc[test['id'] == 4431,'runtime'] = 96 
test.loc[test['id'] == 5520,'runtime'] = 86 
test.loc[test['id'] == 5845,'runtime'] = 83 
test.loc[test['id'] == 5849,'runtime'] = 140
test.loc[test['id'] == 6210,'runtime'] = 104
test.loc[test['id'] == 6804,'runtime'] = 140
test.loc[test['id'] == 7321,'runtime'] = 87 

Additional Features

In [4]:
train_add = pd.read_csv('TrainAdditionalFeatures.csv')
test_add = pd.read_csv('TestAdditionalFeatures.csv')

train = pd.merge(train, train_add, how='left', on=['imdb_id'])
test = pd.merge(test, test_add, how='left', on=['imdb_id'])

In [5]:
df = pd.concat([train, test]).set_index("id")

In [6]:
df.loc[df.index == 90,'budget'] = 30000000
df.loc[df.index == 118,'budget'] = 60000000
df.loc[df.index == 149,'budget'] = 18000000
df.loc[df.index == 464,'budget'] = 20000000
df.loc[df.index == 819,'budget'] = 90000000
df.loc[df.index == 1112,'budget'] = 6000000
df.loc[df.index == 1131,'budget'] = 4300000
df.loc[df.index == 1359,'budget'] = 10000000
df.loc[df.index == 1570,'budget'] = 15800000
df.loc[df.index == 1714,'budget'] = 46000000
df.loc[df.index == 1865,'budget'] = 80000000
df.loc[df.index == 2602,'budget'] = 31000000

In [7]:
df = df.drop(["poster_path", "original_title"], axis=1)

In [8]:
df["log_revenue"] = np.log1p(df["revenue"])
df["log_budget"] = np.log1p(df["budget"])

In [9]:
df['isbelongs_to_collectionNA'] = 1
df.loc[pd.isnull(df['belongs_to_collection']) ,"isbelongs_to_collectionNA"] = 0

In [10]:
df['isbelongs_to_collectionNA']

id
1       1
2       1
3       0
4       0
5       0
       ..
7394    0
7395    1
7396    0
7397    0
7398    0
Name: isbelongs_to_collectionNA, Length: 7398, dtype: int64

In [11]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

for col in dict_columns:
       df[col]=df[col].apply(lambda x: [] if pd.isna(x) else ast.literal_eval(x) )

In [12]:
dfdic_feature = {}

In [13]:
def multi_label_encoding_and_select_top(series, topn=9999):
    mlb = MultiLabelBinarizer()
    series = series.apply(lambda x : [ i["name"] for i in x])
    data = mlb.fit_transform(series)
    df = pd.DataFrame(data, columns=mlb.classes_)
    column_counts = df.eq(1).sum()
    sorted_columns = column_counts.sort_values(ascending=False)
    top_n_columns = sorted_columns.head(topn)
    return df[top_n_columns.index]

In [14]:
df['num_genres'] = df['genres'].apply(lambda x: len(x) if x != {} else 0)
dfdic_feature["genre"] = multi_label_encoding_and_select_top(df["genres"])
dfdic_feature["genre"] = dfdic_feature["genre"].drop("TV Movie", axis=1)
dfdic_feature["genre"].shape

(7398, 19)

In [15]:
dfdic_feature["genre"]

Unnamed: 0,Drama,Comedy,Thriller,Action,Romance,Adventure,Crime,Science Fiction,Horror,Family,Fantasy,Mystery,Animation,History,Music,War,Documentary,Western,Foreign
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7393,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7394,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7395,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7396,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [16]:
df["original_language"] = df["original_language"].astype("category")

In [17]:
df['isOriginalLanguageEng'] = 0 
df.loc[ df['original_language'] == "en" ,"isOriginalLanguageEng"] = 1

In [18]:
dfdic_feature["original_language"] = pd.get_dummies(df["original_language"])

dfdic_feature["original_language"].shape

(7398, 44)

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['original_language'] = le.fit_transform(df['original_language'])

In [20]:
dfdic_feature["production_companies"] = multi_label_encoding_and_select_top(df["production_companies"], 9)
dfdic_feature["production_companies"].shape

(7398, 9)

In [21]:
dfdic_feature["production_countries"] = multi_label_encoding_and_select_top(df["production_countries"], 10)
dfdic_feature["production_countries"].shape

(7398, 10)

In [22]:
df.loc[3829, "release_date"] = "5/1/00"

In [23]:
df["release_year"] = pd.to_datetime(df["release_date"]).dt.year.astype(int)
df.loc[df["release_year"]>2020, "release_year"] = df.loc[df["release_year"]>2020, "release_year"]-100

df["release_month"] = pd.to_datetime(df["release_date"]).dt.month.astype(int)
df["release_day"] = pd.to_datetime(df["release_date"]).dt.day.astype(int)
df["release_date"] = df.apply(lambda s: datetime.datetime(
    year=s["release_year"],month=s["release_month"],day=s["release_day"]), axis=1)
df["release_dayofyear"] = df["release_date"].dt.dayofyear
df["release_dayofweek"] = df["release_date"].dt.dayofweek

In [24]:
df['has_homepage'] = 1
df.loc[ pd.isnull(df['homepage']),'has_homepage'] = 0

In [25]:
df['num_Keywords'] = df['Keywords'].apply(lambda x: len(x) if x != {} else 0)

In [26]:
df['overview_word_count'] = df['overview'].apply(lambda x: len(str(x).split()))

df['overview_char_count'] = df['overview'].apply(lambda x: len(str(x)))

In [27]:

df['tagline_word_count'] = df['tagline'].apply(lambda x: len(str(x).split()))

df['tagline_char_count'] = df['tagline'].apply(lambda x: len(str(x)))

df['isTaglineNA'] = 0
df.loc[df['tagline'] == 0 ,"isTaglineNA"] = 1 

In [28]:

df['title_word_count'] = df['title'].apply(lambda x: len(str(x).split()))

df['title_char_count'] = df['title'].apply(lambda x: len(str(x)))

In [29]:
df['num_cast'] = df['cast'].apply(len)

In [30]:
list_of_cast_genders = list(df['cast'].apply(lambda x: [i['gender'] for i in x] if x != {} else []).values)

df['genders_0_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
df['genders_1_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
df['genders_2_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

df[['genders_0_cast', 'genders_1_cast']] = df[['genders_0_cast', 'genders_1_cast']].fillna(df[['genders_0_cast', 'genders_1_cast']].mean())

In [31]:
df['num_crew'] = df['crew'].apply(len)

In [32]:
department_count = pd.Series(Counter([job for lst in df["crew"].apply(lambda x : [ i["department"] for i in x]).values for job in lst]))
department_count.sort_values(ascending=False)


Production           38927
Sound                22497
Art                  19870
Crew                 17529
Writing              16329
Costume & Make-Up    15223
Camera               13229
Directing            12245
Editing              11135
Visual Effects        9472
Lighting              3129
Actors                   5
dtype: int64

In [33]:
job_count = pd.Series(Counter([job for lst in df["crew"].apply(lambda x : [ i["job"] for i in x]).values for job in lst]))
job_count.sort_values(ascending=False).head(30)

Producer                     14670
Executive Producer            8533
Director                      8051
Screenplay                    7381
Editor                        7054
Casting                       6055
Director of Photography       5611
Original Music Composer       4768
Art Direction                 4637
Production Design             4027
Costume Design                3876
Writer                        3870
Set Decoration                3261
Makeup Artist                 2710
Sound Re-Recording Mixer      2257
Script Supervisor             2230
Camera Operator               2207
Animation                     2044
Visual Effects Supervisor     2001
Hairstylist                   1939
Sound Effects Editor          1855
Still Photographer            1701
Visual Effects Producer       1695
Music Editor                  1669
Dialogue Editor               1597
Co-Producer                   1589
Music                         1583
Stunts                        1492
Stunt Coordinator   

In [34]:
df_crew = { idx : pd.DataFrame([ [crew["department"], crew["job"], crew["name"]] 
                        for crew in x], columns=["department", "job", "name"]) 
    for idx, x in df["crew"].iteritems() }

  for idx, x in df["crew"].iteritems() }


In [35]:
df_crew = pd.concat(df_crew)
df_crew.head()

Unnamed: 0,Unnamed: 1,department,job,name
1,0,Directing,First Assistant Director,Kelly Cantley
1,1,Directing,Director,Steve Pink
1,2,Writing,Writer,Josh Heald
1,3,Writing,Characters,Josh Heald
1,4,Production,Producer,Andrew Panay


In [36]:
def select_job(list_dict, key, value):
    return [ dic["name"] for dic in list_dict if dic[key]==value]

In [37]:
for department in department_count.index:
    df['dep_{}_num'.format(department)] = df["crew"].apply(select_job, key="department", value=department).apply(len) 

In [38]:
df_crewname = pd.DataFrame([], index=df.index)
for job in ["Producer", "Director", "Screenplay", "Casting", "Original Music Composer","Writer"]:
    col = 'job_{}_list'.format(job)
    df[col] = df["crew"].apply(select_job, key="job", value=job)

    top_list = [m[0] for m in Counter([i for j in df[col] for i in j]).most_common(15)]
    for i in top_list:
        df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)

In [39]:
for job in ["Sound", "Art", "Costume & Make-Up", "Camera", "Visual Effects"]:
    col = 'department_{}_list'.format(job)
    df[col] = df["crew"].apply(select_job, key="department", value=job)

    top_list = [m[0] for m in Counter([i for j in df[col] for i in j]).most_common(15)]
    for i in top_list:
        df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)

  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_crewname['{}_{}'.format(job,i)] = df[col].apply(lambda x: i in x)
  df_c

In [40]:
df.columns

Index(['belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id',
       'original_language', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue',
       'popularity2', 'rating', 'totalVotes', 'log_revenue', 'log_budget',
       'isbelongs_to_collectionNA', 'num_genres', 'isOriginalLanguageEng',
       'release_year', 'release_month', 'release_day', 'release_dayofyear',
       'release_dayofweek', 'has_homepage', 'num_Keywords',
       'overview_word_count', 'overview_char_count', 'tagline_word_count',
       'tagline_char_count', 'isTaglineNA', 'title_word_count',
       'title_char_count', 'num_cast', 'genders_0_cast', 'genders_1_cast',
       'genders_2_cast', 'num_crew', 'dep_Directing_num', 'dep_Writing_num',
       'dep_Production_num', 'dep_Sound_num', 'dep_Camera_num',
       'dep_Editing_num', 'dep_Art_num', 'dep_Costum

In [41]:
df['job_Animation_num'] = df["crew"].apply(select_job, key="job", value="Animation").apply(len)

In [42]:

df['genders_0_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
df['genders_1_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
df['genders_2_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))

df[['genders_0_crew', 'genders_1_crew','genders_2_crew']] = df[['genders_0_crew', 'genders_1_crew','genders_2_crew']].fillna(df[['genders_0_crew', 'genders_1_crew','genders_2_crew']].mean())

In [43]:
df['budget_runtime_ratio'] = df['budget']/df['runtime']

In [44]:
df['budget_popularity_ratio'] = df['budget']/df['popularity']

In [45]:
df['budget_popularity2_ratio'] = df['budget']/df['popularity2']
df['budget_year_ratio'] = df['budget']/df['release_year']
df['production_countries_count'] = df['production_countries'].apply(lambda x : len(x))
df['production_companies_count'] = df['production_companies'].apply(lambda x : len(x))

In [46]:
df["collection_name"] = df["belongs_to_collection"].apply(lambda x : x[0]["name"] if len(x)>0 else 0)
le.fit(list(df['collection_name'].fillna('')))
df['collection_name'] = le.transform(df['collection_name'].fillna('').astype(str))

In [47]:
df['mean_pop1_bud'] = df.groupby('popularity')['budget'].transform('mean')
df['mean_pop2_bud'] = df.groupby('popularity2')['budget'].transform('mean')
df['mean_year_bud'] = df.groupby('release_year')['budget'].transform('mean')
df['mean_pop1_rate'] = df.groupby('popularity')['rating'].transform('mean')
df['mean_pop2_rate'] = df.groupby('popularity2')['rating'].transform('mean')
df['mean_rate_tV'] = df.groupby('rating')['totalVotes'].transform('mean')

In [48]:
df['runtime_to_mean_year'] = df['runtime'] / df.groupby("release_year")["runtime"].transform('mean')
df['popularity_to_mean_year'] = df['popularity'] / df.groupby("release_year")["popularity"].transform('mean')
df['budget_to_mean_year'] = df['budget'] / df.groupby("release_year")["budget"].transform('mean')

In [49]:
for i in dfdic_feature:
    dfdic_feature[i]['id'] = range(1, len(dfdic_feature[i]) + 1)
    dfdic_feature[i] = dfdic_feature[i].set_index('id')

In [50]:
df_features = pd.concat(dfdic_feature.values(), axis=1, ignore_index=True)

In [51]:
df_features.shape

(7398, 81)

In [52]:
df_use = df[['num_cast', 'genders_0_cast','runtime_to_mean_year','budget_to_mean_year',"log_budget",
       'genders_1_cast','genders_2_cast', 'num_crew', 'genders_0_crew', 'genders_1_crew','genders_2_crew',
             "tagline_word_count","overview_word_count","title_word_count","has_homepage",
            'popularity','runtime','release_year', 'release_month','release_dayofweek',"num_genres"
            ,"popularity2","rating","totalVotes",'isOriginalLanguageEng',
             'budget_runtime_ratio','budget_popularity_ratio','budget_year_ratio','budget_popularity2_ratio',
            'production_countries_count','production_companies_count','mean_pop1_bud','mean_pop2_bud','mean_year_bud','mean_pop1_rate',
            'mean_pop2_rate','mean_rate_tV',
           'dep_Directing_num', 'dep_Writing_num', 'dep_Production_num',
       'dep_Sound_num', 'dep_Camera_num', 'dep_Editing_num', 'dep_Art_num',
       'dep_Costume & Make-Up_num', 'dep_Crew_num', 'dep_Lighting_num',
       'dep_Visual Effects_num', 'dep_Actors_num', 'job_Animation_num' ]]

In [53]:
df_use = pd.concat([df_use, df_features], axis=1)

In [54]:
df_use.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df_use.columns]

In [55]:
trainX = df_use.iloc[:train.shape[0],:].reset_index(drop=True)
test_X = df_use.iloc[train.shape[0]:,:].reset_index(drop=True)
trainy = np.log1p(train["revenue"])

In [56]:
test_X.shape

(4398, 131)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(trainX,trainy,test_size=0.3,random_state=100)

In [75]:
#dont use train test split to train the real model. 
X_train = trainX
y_train = trainy

In [76]:
import xgboost as xgb
xgbmodel = xgb.XGBRegressor(max_depth=6, 
                            min_child_weight=3,
                            alpha = 0.5,
                            learning_rate=0.05, 
                            n_estimators=150, 
                            objective='reg:linear', 
                            gamma=0.01,  
                            silent=1,
                            subsample=0.8, 
                            colsample_bytree=0.8)

In [77]:
xgbmodel.fit(X_train, y_train)

Parameters: { "silent" } are not used.



In [78]:
pred_xgb = xgbmodel.predict(test_X)
test_id = test["id"].astype('Int32')
pred_xgb = pd.DataFrame(np.expm1(pred_xgb),columns=["revenue"])
sub=pd.concat([test_id, pred_xgb],axis=1)
sub.to_csv('TMDB_xgb.csv',index=False)

In [79]:
import lightgbm as lgb
import catboost as cat

In [80]:
lgbmodel = lgb.LGBMRegressor(n_estimators=150, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 5,
                             num_leaves=30, 
                             min_child_samples=30,
                             learning_rate=0.05,
                             boosting = 'gbdt',
                             min_data_in_leaf= 15,
                             feature_fraction = 0.9,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.2, 
                             subsample=.8, 
                             colsample_bytree=.8,
                             use_best_model=True)

In [81]:
lgbmodel.fit(X_train, y_train)



In [82]:
pred_train2 = lgbmodel.predict(X_train)
pred_test2 = lgbmodel.predict(X_test)

In [84]:
pred_lgb = lgbmodel.predict(test_X)

In [85]:
pred_lgb = pd.DataFrame(np.expm1(pred_lgb),columns=["revenue"])
pred_lgb

Unnamed: 0,revenue
0,1.024921e+07
1,2.103732e+06
2,7.067813e+06
3,1.096651e+07
4,7.342764e+05
...,...
4393,3.854020e+07
4394,2.676140e+07
4395,3.392100e+07
4396,2.270883e+07


In [86]:
sub1=pd.concat([test_id, pred_lgb],axis=1)
sub1.to_csv('TMDB_lgb.csv',index=False)

In [87]:
catmodel = cat.CatBoostRegressor(iterations=2000, 
                                 learning_rate=0.01, 
                                 depth=8, 
                                 eval_metric='RMSE',
                                 colsample_bylevel=0.8,
                                 bagging_temperature = 0.2,
                                 metric_period = None,
                                 early_stopping_rounds=200)

In [88]:
catmodel.fit(X_train, y_train)

0:	learn: 3.0474585	total: 6.32ms	remaining: 12.6s
1:	learn: 3.0339631	total: 14.2ms	remaining: 14.2s
2:	learn: 3.0209238	total: 22.4ms	remaining: 14.9s
3:	learn: 3.0089888	total: 30.3ms	remaining: 15.1s
4:	learn: 2.9959251	total: 38.3ms	remaining: 15.3s
5:	learn: 2.9830462	total: 47ms	remaining: 15.6s
6:	learn: 2.9701897	total: 61.1ms	remaining: 17.4s
7:	learn: 2.9591553	total: 69.9ms	remaining: 17.4s
8:	learn: 2.9477532	total: 78.5ms	remaining: 17.4s
9:	learn: 2.9364012	total: 86.2ms	remaining: 17.1s
10:	learn: 2.9249534	total: 93.7ms	remaining: 17s
11:	learn: 2.9131815	total: 102ms	remaining: 16.9s
12:	learn: 2.9016342	total: 109ms	remaining: 16.7s
13:	learn: 2.8910753	total: 116ms	remaining: 16.4s
14:	learn: 2.8803434	total: 124ms	remaining: 16.3s
15:	learn: 2.8700563	total: 131ms	remaining: 16.3s
16:	learn: 2.8590939	total: 139ms	remaining: 16.2s
17:	learn: 2.8484080	total: 147ms	remaining: 16.2s
18:	learn: 2.8374148	total: 155ms	remaining: 16.1s
19:	learn: 2.8267127	total: 162ms	

<catboost.core.CatBoostRegressor at 0x1a3e4049000>

In [89]:
pred_cat = catmodel.predict(test_X)
pred_cat = pd.DataFrame(np.expm1(pred_cat),columns=["revenue"])
pred_cat

Unnamed: 0,revenue
0,5.039568e+06
1,1.480847e+06
2,8.291562e+06
3,7.591512e+06
4,9.178180e+05
...,...
4393,4.428913e+07
4394,2.990588e+07
4395,3.010510e+07
4396,2.421864e+07


In [90]:
sub2=pd.concat([test_id, pred_cat],axis=1)
sub2.to_csv('TMDB_cat.csv',index=False)