In [13]:
import pandas as pd
from lib.preprocessing.load import read_movies_entrees, read_movies_features

# French box-office figures (the "entrées") of the movies
bo = read_movies_entrees('data/french-box-office-29nov2020.json')

# Main features of the movies. It seems that not every movie was found in the
# feature database, so the inner join below only keeps ids present in both frames.
features = read_movies_features('data/movie-features-29nov2020.json')

# Join both dataframes on the movie id
data = pd.merge(left=bo, right=features, on='id', how='inner')
data

Unnamed: 0,year,title,id,sales,release_date,is_adult,is_part_of_collection,budget,genres,original_language,production_countries,languages,runtime
0,2019,Maléfique : Le Pouvoir du Mal,19073,786485,2019-10-16,False,True,185000000,"[Fantastique, Familial, Aventure]",en,[US],[en],110.0
1,2019,Nous finirons ensemble,18875,1261701,2019-05-01,False,True,0,"[Comédie, Drame]",fr,"[BE, FR]",[fr],135.0
2,2019,Spider-Man: Far from Home,18243,1370178,2019-07-03,False,True,160000000,"[Action, Aventure, Science-Fiction]",en,[US],"[cs, nl, en, de, it]",129.0
3,2019,Jumanji: next level,18258,785636,2019-12-04,False,True,125000000,"[Aventure, Comédie, Fantastique]",en,[US],[en],123.0
4,2019,Dragons 3 : Le monde caché,18167,1224811,2019-02-06,False,True,129000000,"[Animation, Familial, Aventure]",en,"[JP, US]",[en],104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7048,2017,La Planète des Singes - Suprématie,15748,1016520,2017-08-02,False,True,150000000,"[Drame, Science-Fiction, Guerre]",en,"[CA, US, NZ]",[en],142.0
7049,2017,Cars 3,16452,950353,2017-08-02,False,True,175000000,"[Aventure, Animation, Comédie, Familial]",en,[US],[en],109.0
7050,2017,Le Sens de la fête,17386,808544,2017-10-04,False,False,17200000,[Comédie],fr,[FR],[fr],117.0
7051,2017,Cinquante nuances plus sombres,15186,1618110,2017-02-08,False,True,55000000,"[Drame, Romance]",en,[US],[en],118.0


In [14]:
# A little housekeeping first, following what we learned in the last notebook

# Keep only the movies whose sales are not zero
has_sales = data["sales"] != 0
data = data[has_sales]

# Columns we do not use as features (for now)
unused_columns = [
    "is_adult",              # removed following the previous notebook
    "title",                 # free text: we don't know how to process NLP stuff yet
    "languages",             # complex (list-valued) data
    "production_countries",  # complex (list-valued) data
    "year",
    "id",                    # ids are no longer needed
]
data = data.drop(columns=unused_columns)

In [15]:
# Encode the categorical `original_language` column as indicator columns
# prefixed with `original_language_is`.
# NOTE(review): 'fr' is presumably the reference level that gets left out of
# the encoding - confirm against dummy_encode's signature.
from lib.preprocessing.encode import dummy_encode

data = dummy_encode(data, 'original_language', 'original_language_is', 'fr')

In [16]:
from lib.preprocessing.encode import multilabel_encode

# Turn the list-valued `genres` column into one indicator column per genre.
# NOTE(review): the 'Action' column is dropped, presumably so that it acts as
# the baseline genre - confirm against multilabel_encode.
data = multilabel_encode(data=data, column_to_encode='genres', column_to_drop='Action')

In [17]:
# Display the frame after the language and genre columns have been encoded
data

Unnamed: 0,sales,release_date,is_part_of_collection,budget,runtime,original_language_is_aa,original_language_is_ab,original_language_is_af,original_language_is_ar,original_language_is_bm,...,Guerre,Histoire,Horreur,Musique,Mystère,Romance,Science-Fiction,Thriller,Téléfilm,Western
0,786485,2019-10-16,True,185000000,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1261701,2019-05-01,True,0,135.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1370178,2019-07-03,True,160000000,129.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,785636,2019-12-04,True,125000000,123.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1224811,2019-02-06,True,129000000,104.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7048,1016520,2017-08-02,True,150000000,142.0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7049,950353,2017-08-02,True,175000000,109.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7050,808544,2017-10-04,False,17200000,117.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7051,1618110,2017-02-08,True,55000000,118.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
# TODO: handle missing values properly; for the moment any row containing a
# NaN is simply discarded.
data = data.dropna()
data  # We removed 40 rows

Unnamed: 0,sales,release_date,is_part_of_collection,budget,runtime,original_language_is_aa,original_language_is_ab,original_language_is_af,original_language_is_ar,original_language_is_bm,...,Guerre,Histoire,Horreur,Musique,Mystère,Romance,Science-Fiction,Thriller,Téléfilm,Western
0,786485,2019-10-16,True,185000000,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1261701,2019-05-01,True,0,135.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1370178,2019-07-03,True,160000000,129.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,785636,2019-12-04,True,125000000,123.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1224811,2019-02-06,True,129000000,104.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7048,1016520,2017-08-02,True,150000000,142.0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7049,950353,2017-08-02,True,175000000,109.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7050,808544,2017-10-04,False,17200000,117.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7051,1618110,2017-02-08,True,55000000,118.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
import statsmodels.api as sm

# Build the exogenous (features) and endogenous (target) data, with the rows
# ordered by release date.
data = data.sort_values(by='release_date')

# Features: every column except the target and the date, with a constant
# column prepended, all cast to float
data_exog = sm.add_constant(
    data.drop(columns=['sales', 'release_date']),
    prepend=True,
).astype(float)

# Target: the sales, cast to float
data_endo = data['sales'].astype(float)

In [20]:
# Display the frame, now sorted by release_date
data

Unnamed: 0,sales,release_date,is_part_of_collection,budget,runtime,original_language_is_aa,original_language_is_ab,original_language_is_af,original_language_is_ar,original_language_is_bm,...,Guerre,Histoire,Horreur,Musique,Mystère,Romance,Science-Fiction,Thriller,Téléfilm,Western
4802,139087,2000-01-01,False,0,120.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3144,66228,2000-01-05,False,22000000,142.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3139,32954,2000-01-05,False,0,116.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6271,1463152,2000-01-05,False,0,77.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2301,25224,2000-01-12,False,9000000,165.0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,15304,2020-10-07,False,0,60.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
450,25924,2020-10-07,False,0,93.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
453,30218,2020-10-07,False,0,107.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
466,11185,2020-10-14,False,0,52.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Split the dataset into train / test sets using release dates: the rows are
# already sorted by release_date, so the first 80% (oldest movies) form the
# training set and the remaining 20% (most recent movies) the test set.
import numpy as np  # not needed for the split itself; the model-fitting cell uses np

# Positional slicing with .iloc instead of np.split: np.split on pandas
# objects goes through DataFrame.swapaxes, which pandas has deprecated.
# A single cut index guarantees features and target are split at the same row.
split_at = int(.8 * len(data_exog))

# Split
X_train, X_test = data_exog.iloc[:split_at], data_exog.iloc[split_at:]
y_train, y_test = data_endo.iloc[:split_at], data_endo.iloc[split_at:]

In [22]:
# Fit three regressors on the training split and score them on the test split
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

# Fit the OLS model once and reuse the fitted results below
mod = sm.OLS(y_train, X_train)
res = mod.fit()

models = {
    "Linear Regression": res,
    # random_state is pinned (like for the forest) so the score is reproducible
    # from one run to the next
    "Decision Tree": tree.DecisionTreeRegressor(random_state=1).fit(X_train, y_train),
    "Random Forest": RandomForestRegressor(random_state=1, n_estimators=50).fit(X_train, y_train),
}

for model_name, model in models.items():
    preds = model.predict(X_test)
    # Score in percent: 100 * (1 - sum(|actual - predicted|) / sum(actual)),
    # rounded to two decimals
    fa = round((1 - np.sum(np.abs(y_test - preds)) / np.sum(y_test)) * 100, 2)

    print()
    print(model_name)
    print(fa)


Linear Regression
19.98

Decision Tree
0.43

Random Forest
12.23


In [23]:
# Interpretation of the linear regression coefficients:
#   - Being part of a collection has a strong impact on sales
#   - So does the budget: positive and significant impact
#   - Runtime has a positive impact
#   - Among original languages, mostly European languages have a significant
#     impact (negative compared to a movie written in French)
#   - Some genres make a difference compared to Action
#     (Horror reduces sales, Aventure increases them...)
linear_regression = models['Linear Regression']
print(linear_regression.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.373
Method:                 Least Squares   F-statistic:                     49.34
Date:                Sun, 13 Dec 2020   Prob (F-statistic):               0.00
Time:                        19:23:55   Log-Likelihood:                -76072.
No. Observations:                5456   AIC:                         1.523e+05
Df Residuals:                    5388   BIC:                         1.527e+05
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     