<table><tr>
<td> <img src="https://upload.wikimedia.org/wikipedia/fr/thumb/e/e5/Logo_%C3%A9cole_des_ponts_paristech.svg/676px-Logo_%C3%A9cole_des_ponts_paristech.svg.png" width="200"  height="200" hspace="200"/> </td>
<td> <img src="https://pbs.twimg.com/profile_images/1156541928193896448/5ihYIbCQ_200x200.png" width="200" height="200" /> </td>
</tr></table>

<br/>

<h1><center>Session 11 - Model Serving</center></h1>



<font size="3">This session is divided into **2** parts:
- **Model selection**
- **Model optimization:**
>  * 1-Hyperparameters optimization
>  * 2-Features selection

In each of these parts, some **guidelines** and **hints** are given for each task. 
Do not hesitate to check the links to documentation to understand the functions you use. 
    
The goal of this session is to **select a model** that you will use as your best candidate and optimize it to get the best out of it.
</font>

# 1. Save Model

In [4]:
%config Completer.use_jedi = False

In [2]:
import os

import pandas as pd
from lib.evaluation.evaluate import evaluate
from lib.modelling.training import train
from lib.preprocessing.preprocess import (clean_data, get_x_y,
                                          train_test_split_by_date,
                                          transform_target)
from lib.utils.io import load_dataset
from lightgbm import Booster, LGBMRegressor
from loguru import logger
from config import TRAINING_DATASET_FILEPATH, LGBM_BEST_PARAMS, BEST_K_FEATURES, FEATURE_IMPORTANCE

In [3]:
# Load and clean the training dataset, then split it by date using the two
# cut-off dates below (exact boundary semantics are defined by train_test_split_by_date).
raw_data = load_dataset(TRAINING_DATASET_FILEPATH)
data = clean_data(raw_data, drop_2020=False)
train_data, validation_data, test_data = train_test_split_by_date(data,
                                                                '2018-01-01',
                                                                '2020-01-01')
train_x, train_y = get_x_y(train_data)
validation_x, validation_y = get_x_y(validation_data)
test_x, test_y = get_x_y(test_data)

# Build the regressor from the tuned hyper-parameters and restrict the inputs
# to the selected features.
lgbm = LGBMRegressor(**LGBM_BEST_PARAMS)
# NOTE(review): this slice keeps BEST_K_FEATURES + 1 features — confirm the "+ 1" is intended.
features_list = FEATURE_IMPORTANCE[:BEST_K_FEATURES+1]
# The separator keeps the feature list and the hyper-parameters from running
# into each other in the log line ("...]hyper-parameters: {...").
msg = f"Training fitting LightGBM using features: {features_list} | "
msg += f"hyper-parameters: {LGBM_BEST_PARAMS}"
logger.info(msg)
lgbm = train(lgbm, train_x[features_list], train_y, transformer=transform_target)
logger.info("Evaluate on validation set ...")
evaluate(lgbm, validation_x[features_list], validation_y, transformer=transform_target)
logger.info("Evaluate on test set...")
evaluate(lgbm, test_x[features_list], test_y, transformer=transform_target)

2021-04-23 16:06:10.320 | INFO     | lib.utils.io:load_dataset:27 - loading raw data /Users/hugo/Documents/PONTS/french-box-office/data/data_prepared_session4.csv...
2021-04-23 16:06:10.455 | INFO     | lib.preprocessing.preprocess:clean_data:7 - cleaning data..
2021-04-23 16:06:10.511 | INFO     | __main__:<module>:13 - Training fitting LightGBM using features: ['runtime', 'mean_5_popularity', 'mean_3_popularity', 'budget', 'actor_1_sales', 'mean_sales_actor', 'max_sales_actor', 'actor_3_sales', 'actor_2_sales', 'month', 'cos_month', 'Comédie', 'Drame', 'is_part_of_collection', 'rolling_sales_collection', 'prod_FR', 'Action', 'prod_OTHER', 'available_lang_fr', 'original_lang_fr', 'holiday', 'Romance', 'original_lang_en', 'prod_US', 'Familial', 'nb_movie_collection', 'Horreur', 'available_lang_other', 'prod_GB', 'Other', 'original_lang_other', 'available_lang_it', 'Fantastique', 'available_lang_en', 'vacances_zone_c', 'vacances_zone_a', 'available_lang_es']hyper-parameters: {'max_depth



2021-04-23 16:06:10.780 | INFO     | lib.modelling.training:train:12 - {'mape': 192.63352372800645, 'rmse': 234053.95434579128, 'mae': 102825.81323968484}
2021-04-23 16:06:10.783 | INFO     | __main__:<module>:15 - Evaluate on validation set ...
2021-04-23 16:06:10.801 | INFO     | lib.evaluation.evaluate:evaluate:33 - {'mape': 364.0570041903621, 'rmse': 238342.93099787852, 'mae': 104460.42007271445}
2021-04-23 16:06:10.802 | INFO     | __main__:<module>:17 - Evaluate on test set...
2021-04-23 16:06:10.824 | INFO     | lib.evaluation.evaluate:evaluate:33 - {'mape': 177.9903019586095, 'rmse': 115865.58894202021, 'mae': 63725.9358367287}


In [13]:
# Display the class of the estimator returned by `train`.
type(lgbm)

lightgbm.sklearn.LGBMRegressor

In [5]:
# find how to save the model in a file
# Writes the fitted booster to 'model.txt', passing the estimator's
# best_iteration_ as the number of iterations to save.
lgbm.booster_.save_model('model.txt', num_iteration=lgbm.best_iteration_)

<lightgbm.basic.Booster at 0x10fcea750>

In [11]:
def save_model(model, filepath):
    """Save the booster of a fitted LightGBM sklearn-API model to a file.

    Parameters
    ----------
    model:
        Fitted estimator exposing ``booster_`` and ``best_iteration_``.
    filepath: str
        Destination path of the model file.
    """
    # Use the attributes of the model passed in, not those of a notebook global,
    # so the function works for any model and in a fresh kernel.
    model.booster_.save_model(filepath, num_iteration=model.best_iteration_)

In [8]:
# load the model from the file
# LGBMRegressor(model_file=...) only stores `model_file` as an extra parameter and
# returns an unfitted estimator. The text file written by Booster.save_model has to
# be read back with the Booster constructor.
# `model` is therefore a lightgbm.Booster: it exposes predict() but not the
# sklearn-specific attributes of LGBMRegressor.
model = Booster(model_file='model.txt')

In [9]:
# NOTE(review): this evaluates the in-memory `lgbm`, not the `model` reloaded from
# 'model.txt', so it does not check the save/load round trip — confirm intent.
evaluate(lgbm, test_x[features_list], test_y, transformer=transform_target)

2021-04-23 16:13:34.567 | INFO     | lib.evaluation.evaluate:evaluate:33 - {'mape': 177.9903019586095, 'rmse': 115865.58894202021, 'mae': 63725.9358367287}


# Prediction on new data

In [23]:
def predict(model, features, transformer=None, ret=False):
    """Predict the target for ``features`` and optionally map it back.

    Parameters
    ----------
    model:
        Object exposing ``predict(features)``.
    features:
        Inputs handed to ``model.predict`` unchanged.
    transformer: callable, optional
        When truthy, called as ``transformer(prediction, forward=False)`` and
        its result is returned instead of the raw prediction.
    ret: bool
        Accepted for compatibility; not used by the function.

    Returns
    -------
    The raw or transformed prediction.
    """
    raw_prediction = model.predict(features)
    if not transformer:
        return raw_prediction
    return transformer(raw_prediction, forward=False)

In [42]:
# First row of the test features converted to a plain dict.
test_json = test_x.iloc[0].to_dict()

In [36]:
# Wrap the first test row's dict in a list to build a one-row DataFrame.
test_foo = pd.DataFrame([test_x.iloc[0].to_dict()])

In [37]:
# Predict for the one-row frame, restricted to the feature columns used for training.
predictions = predict(lgbm, test_foo[features_list], transformer=transform_target)

In [38]:
# Display the prediction computed in the previous cell.
predictions

[17607.51362607005]

# Make a request against the server

In [41]:
import requests
import json

In [61]:
# 0.0.0.0 is a bind address rather than a portable destination, so the request
# targets the loopback interface instead. The timeout keeps the cell from
# hanging indefinitely when the server is not running.
res = requests.post('http://127.0.0.1:8080/predict', json={'user': 'john'}, timeout=10)

In [62]:
# Raw body (bytes) of the response received above.
res.content

b'"hello john"'

## Apply on new movie

In [27]:
from lib.utils.io import read_movies_entrees, read_movies_features

# Read the box office and movie feature files from the project's data folder.
# NOTE(review): ROOT_DIRPATH is not among the names imported in the notebook's
# import cell — confirm where it is defined before running this cell.
df_boxoffice = read_movies_entrees(os.path.join(ROOT_DIRPATH, 'data', 'french-box-office-23april2021.json'))
df_features = read_movies_features(os.path.join(ROOT_DIRPATH, 'data', 'movie-features-23-april-2021.json'))

ImportError: cannot import name 'read_movies_entrees' from 'lib.utils.io' (/Users/hugo/Documents/PONTS/french-box-office/lib/utils/io.py)

In [30]:
from vacances_scolaires_france import SchoolHolidayDates

fr_holidays = SchoolHolidayDates()
# Build one frame per year and concatenate them in a single call, instead of
# growing a DataFrame inside the loop starting from an empty frame.
df_vacances = pd.concat(
    [pd.DataFrame.from_dict(fr_holidays.holidays_for_year(year)).T for year in [2021]]
)

In [19]:
# NOTE(review): NEW_FILM is not defined in the visible cells; presumably a JSON
# string describing one movie — confirm where it comes from.
new_film = json.loads(NEW_FILM)

## We need to preprocess data - Package the following code into one workflow function

In [None]:
# Here are some functions we wrote to help you, feel free to check them out to see what they do

# Language codes kept as-is by reduce_lang_categories (its default argument);
# any other code is replaced by 'other'.
lang_to_keep = ['en', 'fr', 'es', 'it', 'ja', 'de']
# Country codes kept as-is by reduce_country_categories (its default argument);
# any other code is replaced by 'OTHER'.
country_to_keep = ['FR', 'US', 'GB', 'DE', 'BE', 'CA']
# Maps each raw genre name to the broader category used by
# reduce_genre_categories (its default argument).
dict_genres = {
    'Drame': 'Drame',
    'Comédie': 'Comédie',
    'Romance': 'Romance',
    'Action': 'Action',
    'Thriller': 'Action',
    'Aventure': 'Action',
    'Crime': 'Action',
    'Guerre': 'Action',
    'Western': 'Action',
    'Familial': 'Familial',
    'Animation': 'Familial',
    'Fantastique': 'Fantastique',
    'Science-Fiction': 'Fantastique',
    'Horreur': 'Horreur',
    'Mystère': 'Other',
    'Musique': 'Other',
    'Histoire': 'Other',
    'Documentaire': 'Other',
    'Téléfilm': 'Other'
}


def read_movies_entrees(path):
    '''
    Load the box office JSON file and return it as a pandas DataFrame.

    Each item of the file becomes one row with the columns ``year``,
    ``title``, ``id`` (cast to int), ``sales`` (taken from the item's
    ``first_week_sales``) and ``release_date``.

    Parameters
    ----------
    path: str
        path to the dataset

    Returns
    -------
    df: pd.DataFrame
        One row per item of the file
    '''
    records = []
    for entry in read_from_json(path):
        records.append({
            "year": entry['year'],
            "title": entry['title'],
            "id": int(entry['id']),
            "sales": entry['first_week_sales'],
            "release_date": entry['release_date'],
        })
    return pd.DataFrame(records)


def read_movies_features(path):
    '''
    Read the movie features dataset
    and casts it as an usable pandas DataFrame
    N.B: Fields that are not yet used are commented

    A movie is considered part of a collection when its
    ``belongs_to_collection`` field is truthy. An empty dict, ``None`` or any
    other empty value yields ``is_part_of_collection=False`` and
    ``collection_name=None``.

    Parameters
    ----------
    path: str
        path to the dataset
    Returns
    -------
    df: pd.DataFrame
        Data as DataFrame
    '''
    records = []
    for item in read_from_json(path):
        collection = item['belongs_to_collection']
        records.append({
            "is_adult": item['adult'],
            "is_part_of_collection": bool(collection),
            # Same truthiness test as the flag above, so a null collection no
            # longer crashes on `None['name']`.
            "collection_name": collection['name'] if collection else None,
            "budget": item['budget'],
            "genres": [genre['name'] for genre in item['genres']],
            "original_language": item['original_language'],
            "overview": item['overview'],  # Not used yet. Blob of text
            "production_countries": [country['iso_code'] for country in item['production_countries']],
            "languages": [language['iso_code'] for language in item['languages']],
            "tagline": item['tagline'],  # Not used yet. Blob of text
            "runtime": item['runtime'],
            "cast": item['cast'],  # Not used yet. List of dicts with actor gender, name, id...
            "id": int(item['id'])
        })
    return pd.DataFrame(records)


def read_from_json(path):
    '''
    Read and cast a json into a python object

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``).

    Parameters
    ----------
    path: str
        Path to json file

    Returns
    -------
    data: Union[dict, list]
        Json casted as python object
    '''
    # The context manager closes the file handle even if parsing fails.
    with open(path, 'r', encoding='utf-8', errors="ignore") as json_file:
        data = json.load(json_file)
    return data


def reduce_lang_categories(lang_list, lang_to_keep=lang_to_keep):
    """Return the distinct language codes of ``lang_list``, with every code
    that is not in ``lang_to_keep`` replaced by 'other'."""
    reduced = set()
    for code in lang_list:
        reduced.add(code if code in lang_to_keep else 'other')
    return list(reduced)


def reduce_country_categories(country_list, country_to_keep=country_to_keep):
    """Return the distinct country codes of ``country_list``, with every code
    that is not in ``country_to_keep`` replaced by 'OTHER'."""
    reduced = set()
    for code in country_list:
        if code in country_to_keep:
            reduced.add(code)
        else:
            reduced.add('OTHER')
    return list(reduced)


def reduce_genre_categories(genre_list, dict_genres=dict_genres):
    """Return the distinct reduced genre categories of ``genre_list``.

    Each genre is translated through ``dict_genres``. A genre missing from the
    mapping falls back to 'Other' instead of raising ``KeyError``, mirroring
    the 'other' / 'OTHER' fallback of the language and country reducers.
    """
    return list({dict_genres.get(genre, 'Other') for genre in genre_list})


def flatten_list_series(column):
    """Flatten a Series of lists into a one-column DataFrame.

    Every list is expanded into columns, the result is stacked into a single
    Series with a fresh 0..n-1 index, and that Series (carrying the name of
    ``column``) is returned wrapped in a DataFrame.
    """
    expanded = column.apply(pd.Series)
    stacked = expanded.stack()
    flat_values = stacked.reset_index(drop=True)
    flat_values.name = column.name
    return pd.DataFrame(flat_values)

In [None]:
# Read the two raw JSON exports (box office figures and movie features).
df_boxoffice = read_movies_entrees('../../data/french-box-office-29nov2020.json')
df_features = read_movies_features('../../data/movie-features-29nov2020.json')

# Join them on the movie id (default inner join: only ids present in both files are kept).
data = df_boxoffice.merge(df_features, on='id')

In [None]:
# Keep only movies with non-zero sales. The copy makes `data` an independent
# frame, so the assignments in the following cells do not write into a slice.
data = data.loc[data['sales'] != 0].copy()
# Missing/zero values for budget
# Hint: check the number of missing values to see which approach is the best suited
# Series.median skips missing values, whereas np.median returns NaN as soon as
# one budget is missing, which would then be written into every imputed row.
median = data.loc[data['budget'] != 0, 'budget'].median()
print(median) # 25000000.0

In [None]:
# setting Null and zeros values to median
# (`median` is the value computed in the previous cell)
data.loc[(data['budget'] == 0) | (data['budget'].isnull()), 'budget'] = median

In [None]:
# Missing/zero values for runtime
# Hint: check the number of missing values to see which approach is the best suited
# Rows with a usable runtime: neither missing nor zero.
has_runtime = data['runtime'].notnull() & (data['runtime'] != 0)
runtime_mean = np.mean(data.loc[has_runtime, 'runtime'])
print(runtime_mean)
# Every other row receives the mean of the usable runtimes.
data.loc[~has_runtime, 'runtime'] = runtime_mean

In [None]:
# To go further:
# Missing values for production countries
# Hint: you can use the column "original_language" (e.g.: if original_language is "FR", the production country
# is likely to be France)

# For all movies where the production country list is empty:
# - if the original language is french, we suppose that the production country is France
# - if the original language is english, we suppose that the production country is USA
# - elsewhere we put 'OTHER'
fallback_country_by_language = {'fr': 'FR', 'en': 'US'}

# The column holds one list per movie. Assigning a one-element list through
# `.loc[mask, col] = ['FR']` never stores a list per row (pandas treats the list as
# the values to spread over the selected rows), so the column is rebuilt row by row.
# This also avoids casting the whole frame to str three times to detect empty lists.
data['production_countries'] = [
    countries if countries else [fallback_country_by_language.get(language, 'OTHER')]
    for countries, language in zip(data['production_countries'], data['original_language'])
]

In [None]:
# Reduce number of categories for: Original language
# Hint: you can use the lang_to_keep variable from part 0, already imported. If the original language is in
# lang_to_keep then we keep it, otherwise we set the value to 'other'
data['original_language'] = [
    language if language in lang_to_keep else 'other'
    for language in data['original_language']
]

In [None]:
# Reduce number of categories for: languages, production_countries and genres
# Each column holds one list per movie; the reducer is applied to every list.

# Languages: codes outside lang_to_keep become 'other'
data['languages'] = data['languages'].map(reduce_lang_categories)

# Production countries: codes outside country_to_keep become 'OTHER'
# (reduce_country_categories() comes from part 0, already imported)
data['production_countries'] = data['production_countries'].map(reduce_country_categories)

# Genres: raw genres are translated through dict_genres
# (reduce_genre_categories() comes from part 0, already imported)
data['genres'] = data['genres'].map(reduce_genre_categories)

In [None]:
# Encode is_part_of_collection into numerical
# Hint: you can use a dictionary to map numerical values for each value of is_part_of_collection (True or False)
dict_collection = {
    True: 1,
    False: 0
}

# Series.map with a dict turns any value that is not a key of the dict into NaN.
data['is_part_of_collection'] = data['is_part_of_collection'].map(dict_collection)

In [None]:
# Encode original_language
# Hint: you can perform one-hot encoding with pd.get_dummies(), check documentation to see how to use it (
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html)

# Adding the argument "drop_first=True" allows to get k-1 dummies out of k categorical levels by removing the
# first level to avoid multicollinearity
# NOTE(review): the generated columns depend on the levels present in `data`; when this step is
# applied to a single new movie it may not produce the same columns as at training time — verify
# before packaging this code for serving.
data_final = pd.get_dummies(data, prefix='original_lang', columns=['original_language'], drop_first=True)

In [None]:
# Encode categorical features with multiple categories
# Hint: you can use MultiLabelBinarizer(). MultiLabelBinarizer is used when an observation can have multiple
# categories: here a movie can be available in french, english, german and japanese.

# The movie id becomes the index so that every encoded frame below shares it
data_final = data_final.set_index('id')

# Languages: one binary column per language, named 'available_lang_<code>'
# (ex: available_lang_fr takes 1 if 'fr' was in the column 'languages', else 0)
mlb = MultiLabelBinarizer()
df_lang = pd.DataFrame(
    mlb.fit_transform(data_final['languages']),
    index=data_final.index,
    columns=['available_lang_' + col for col in mlb.classes_],
)

# Genres: one binary column per reduced genre, named after the genre itself
mlb = MultiLabelBinarizer()
df_genre = pd.DataFrame(
    mlb.fit_transform(data_final['genres']),
    index=data_final.index,
    columns=mlb.classes_,
)

# Production countries: one binary column per country, named 'prod_<code>'
mlb = MultiLabelBinarizer()
df_country = pd.DataFrame(
    mlb.fit_transform(data_final['production_countries']),
    index=data_final.index,
    columns=['prod_' + col for col in mlb.classes_],
)

In [None]:
# Merge encoded dataframes and store it into a new dataframe named data_final
# Hint: you can use the merge function from pandas, pd.merge(), check documentation to see how to use it

# The three encoded frames (df_lang, df_genre, df_country) are merged one after the other
# with data_final, each time on the shared index.
# NB: the merge function from pandas can be used in two ways: either pd.merge(df1, df2) or df1.merge(df2)
data_final = (
    data_final
    .merge(df_lang, left_index=True, right_index=True)
    .merge(df_genre, left_index=True, right_index=True)
    .merge(df_country, left_index=True, right_index=True)
)

In [None]:
# Years present in the dataset, in a deterministic order
years = sorted(set(data_final['year']))

# Load school holidays for France
# One frame per year, concatenated in a single call instead of growing a
# DataFrame inside the loop starting from an empty frame.
fr_holidays = SchoolHolidayDates()
df_vacances = pd.concat(
    [pd.DataFrame.from_dict(fr_holidays.holidays_for_year(year)).T for year in years]
)

# Load bank holidays for France
# All (date, name) pairs are collected first and the DataFrame is built once.
df_jf = pd.DataFrame([
    {'date': holiday_date, 'jour_ferie': holiday_name}
    for year in years
    for holiday_date, holiday_name in sorted(holidays.FRA(years=year).items())
])

# Merge school and bank holidays
df_holidays = pd.merge(df_vacances, df_jf, how='outer', on='date')

In [None]:
# Create features from df_holidays dataframes (school holidays and bank holidays):
# - 3 binary features for school holidays, taking 1 if the given zone is on holiday, else 0 (vacances_zone_a,
# vacances_zone_b, vacances_zone_c)

# Dictionary used to encode booleans as 0/1
dict_map_vac = {True: 1, False: 0}
# The same encoding is applied to the holiday column of each of the three zones (A, B, C)
for zone_column in ('vacances_zone_a', 'vacances_zone_b', 'vacances_zone_c'):
    df_holidays[zone_column] = df_holidays[zone_column].map(dict_map_vac)

# - 1 binary feature for bank holiday, taking 1 if it is a bank holiday, else 0
# The column "jour_ferie" contains either the name of the holiday or a missing value (NaN):
# a value whose string form is 'nan' gives 0, anything else gives 1
df_holidays['jour_ferie'] = df_holidays['jour_ferie'].map(lambda x: 0 if str(x) == 'nan' else 1)

# - To go further: Try to create a combined feature with school and bank holidays
# Here: the sum of the four binary indicators
df_holidays['holiday'] = (
    df_holidays['vacances_zone_a']
    + df_holidays['vacances_zone_b']
    + df_holidays['vacances_zone_c']
    + df_holidays['jour_ferie']
)

In [None]:
# Merge df_holidays that contains newly created features with the main dataframe and store it into data_final_cal

# 'release_date' holds strings, so the holiday dates are converted to strings too:
# merging on key columns of different types would raise an error
df_holidays['date'] = df_holidays['date'].map(str)
# Left join: every movie is kept; values missing after the join are replaced by 0
data_final_cal = data_final.merge(
    df_holidays, how='left', left_on='release_date', right_on='date'
).fillna(0)

In [20]:
# Create calendar features for month:
# - the number of the month (named "month")

# The date is in this format : "2019-02-06"; counting from 0, the month sits at positions 5 and 6
# (i.e. the 6th and 7th characters), so we extract it
# NB: when you use "[" and "]", the last number is not included, therefore x[5:7] takes the characters at
# positions 5 and 6 and stops before position 7
data_final_cal['month'] = data_final_cal['release_date'].map(lambda x: int(x[5:7]))

# To go further: try to transform the "month" variable using a mathematical function to capture cyclicity
# (January (1) comes right after December (12))

def apply_cos(df: pd.DataFrame,
              x: str, col_name: str, period: int) -> pd.DataFrame:
    """Add a cosine encoding of column ``x`` to ``df``.

    The new column ``col_name`` is ``2 * cos(2 * pi * df[x] / period)``.
    ``df`` is modified in place and also returned.
    """
    angle = 2 * np.pi * df[x] / period
    df[col_name] = 2 * np.cos(angle)
    return df

# Add the 'cos_month' column: cosine encoding of 'month' with a period of 12.
data_final_cal = apply_cos(data_final_cal, 'month', 'cos_month', 12)

NameError: name 'data_final_cal' is not defined

In [None]:
# Collection with an high number of movies are often sagas that have worked well (ex: Star Wars, Fast and
# Furious, ...)
# We can therefore use the variable "is_part_of_collection" to compute the number of movies per collection
# Hint: to create this kind of feature, you can use the .value_counts() method

# Exclude collections with only one movie

# We count the number of movies per collection
df_count_col = data_final_cal.groupby(['collection_name']).count().reset_index()
# We define a list of collection names with less than 2 movies (we won't take them into account)
not_collection = list(set(df_count_col.loc[df_count_col['year'] < 2]['collection_name']))
# Rows belonging to one of those single-movie collections
single_movie_collection = data_final_cal['collection_name'].isin(not_collection)
# For those rows, we set the values of "is_part_of_collection" to 0
data_final_cal.loc[single_movie_collection, 'is_part_of_collection'] = 0
# ... and the values of "collection_name" to None
data_final_cal.loc[single_movie_collection, 'collection_name'] = None

# Create the feature: number of movies per collection
# We define a dictionary with the number of movies per collection (only collections with at least 2 movies since
# we excluded the other ones just before)
map_col_count = dict(data_final_cal['collection_name'].value_counts())
# We remove the entry whose key is 0: presumably the placeholder left by the earlier fillna(0) for movies
# that are not part of a collection. pop() with a default does not fail when no such entry exists.
map_col_count.pop(0, None)
# We map the dictionary into a new feature: the number of movies per collection
data_final_cal['nb_movie_collection'] = data_final_cal['collection_name'].map(map_col_count)


In [None]:
# To go (much) further:
# Movies from a same collection will tend to have a similar number of sales
# We can therefore use the variable "is_part_of_collection" to calculate an average of the sales of previous films
# from the same collection
# Hint: to create this kind of feature, you can use the .groupby(), .transform(), .rolling(), .mean() and .shift()
# methods

# We isolate the movies that are part of a collection and we store it into df_collection
# The copy makes df_collection an independent frame, so adding a column below does not
# write into a slice of data_final_cal (SettingWithCopyWarning).
df_collection = data_final_cal.loc[data_final_cal['is_part_of_collection'] == 1].copy()
# We compute the rolling average of the sales of the 10 previous movies per collection
# (shift() excludes the current movie from its own average)
df_collection['rolling_sales_collection'] = df_collection.sort_values(by=['collection_name', 'release_date']) \
             .groupby('collection_name')['sales'] \
             .transform(lambda x: x.rolling(10, 1).mean().shift())


# Merge with main dataframe
merge_keys = ['year', 'title', 'release_date', 'collection_name', 'sales']
cols = merge_keys + ['rolling_sales_collection']
df_all = pd.merge(data_final_cal, df_collection[cols], how='left', on=merge_keys).fillna(0)

In [None]:
# A movie with bankable actors is more likely to have an important number of entrees than a movie with unknown
# actors
# We can leverage the "cast" feature that contains information about actors and their popularity for each movie
# to create several features, for example:
# - for one movie, compute the mean popularity of its 3 main actors
# - for one movie, compute the mean popularity of its 5 main actors

# In the 'cast' column we have the TMDB popularity associated to each actor present in the movie
# We apply a log function on the popularity in order to smooth the values and have a better distribution (otherwise
# Jason Statham is largely #1...)

def _mean_log_popularity(cast, top_n):
    """Mean log-popularity of the first ``top_n`` members of ``cast``.

    Each actor contributes log(tmdb_popularity) when that log is positive
    (popularity > 1) and 0 otherwise. An empty cast gives 0.0.
    """
    top_cast = cast[:top_n]
    if not top_cast:
        # np.mean([]) would return NaN with a RuntimeWarning
        return 0.0
    # Testing "popularity > 1" is the same as "log(popularity) > 0", without computing
    # the log twice or taking the log of zero
    return np.mean([
        np.log(actor['tmdb_popularity']) if actor['tmdb_popularity'] > 1 else 0
        for actor in top_cast
    ])


# We keep only the top 3 actors for each movie and we compute the average of their TMDB popularities
df_all['mean_3_popularity'] = df_all['cast'].map(lambda cast: _mean_log_popularity(cast, 3)).fillna(0)

# We do the same for the top 5 actors
df_all['mean_5_popularity'] = df_all['cast'].map(lambda cast: _mean_log_popularity(cast, 5)).fillna(0)

In [None]:
# To go (much much) further:
# In the same vein, we could create features taking into account sales of previous movies per actor and create
# features that represent:
# - for one movie, the mean of sales of previous movies of the #1 actor
# - for one movie, the mean of sales of previous movies of the #2 actor
# - for one movie, the mean of sales of previous movies of the #3 actor
# - for one movie, the mean or the maximum of the three features above
# This would also give an idea of an actor's "popularity"

# /!\ For more details, this process is presented in the main deck, at the end of the feature engineering part

# We create three new columns with the names of the top 3 actors for each movie
df_all['actor_1'] = df_all['cast'].map(lambda x: x[0]['name'] if len(x) > 0 else None) # name of actor #1
df_all['actor_2'] = df_all['cast'].map(lambda x: x[1]['name'] if len(x) > 1 else None) # name of actor #2
df_all['actor_3'] = df_all['cast'].map(lambda x: x[2]['name'] if len(x) > 2 else None) # name of actor #3
# We define the set of all actors that appear in either #1, #2 or #3 positions for all movies
actors_list = set(df_all['actor_1']) | set(df_all['actor_2']) | set(df_all['actor_3'])

k = 5
# Movies are ordered by release date so that "previous movies" means "released earlier"
df_all = df_all.sort_values('release_date')
# For each actor we will compute the average of sales of its previous movies and we will copy this value to our
# main dataframe when the given actor is in #1 or #2 or #3 position.
for actor in actors_list:
    if pd.isna(actor):
        # Placeholder of movies with fewer than 3 cast members: nothing to compute
        continue
    # Masks are computed up front, before any name is overwritten by a number
    is_actor_1 = df_all['actor_1'] == actor
    is_actor_2 = df_all['actor_2'] == actor
    is_actor_3 = df_all['actor_3'] == actor
    # We find all the movies where a given actor is in either #1, #2 or #3 position
    actor_movies = df_all.loc[is_actor_1 | is_actor_2 | is_actor_3]
    # We compute the average sales on its k previous movies (here k = 5);
    # shift() excludes the current movie, fillna(0) covers the actor's first movie
    mean_sales = actor_movies['sales'].rolling(k, 1).mean().shift().fillna(0)
    # We copy those values in the right place in our main dataframe: the actor's name is replaced by
    # the mean sales (rows are matched on the index, which must be unique)
    df_all.loc[is_actor_1, 'actor_1'] = mean_sales[is_actor_1[mean_sales.index]]
    df_all.loc[is_actor_2, 'actor_2'] = mean_sales[is_actor_2[mean_sales.index]]
    df_all.loc[is_actor_3, 'actor_3'] = mean_sales[is_actor_3[mean_sales.index]]

# We rename the columns to make them more understandable
df_all = df_all.rename({
    'actor_1': 'actor_1_sales',
    'actor_2': 'actor_2_sales',
    'actor_3': 'actor_3_sales'
}, axis=1)

# We create two new features based on the ones created just above: the mean of the sales of the 3 main actors
df_all['mean_sales_actor'] = (df_all['actor_1_sales'] + df_all['actor_2_sales'] + df_all['actor_3_sales']) / 3
# and the maximum of the sales among the three main actors
df_all['max_sales_actor'] = df_all[["actor_1_sales", "actor_2_sales", "actor_3_sales"]].max(axis=1)

In [None]:
# Drop useless columns for modeling (not numerical columns, raw columns that have been transformed, ...)
# Hint: you can use the .drop() method
to_drop = [
    'nom_vacances', 'date', 'genres', 'production_countries', 'languages',
    'is_adult', 'collection_name', 'overview', 'tagline', 'cast',
]
# reset_index() moves the current index into a new column and restores a 0..n-1 index
df_all = df_all.drop(columns=to_drop).reset_index()

In [None]:
# Fill missing values due to feature engineering if any
# Hint: you can use the .fillna() method
# Every remaining missing value, in any column, is replaced by 0.
df_all = df_all.fillna(0)