<table><tr>
<td> <img src="https://upload.wikimedia.org/wikipedia/fr/thumb/e/e5/Logo_%C3%A9cole_des_ponts_paristech.svg/676px-Logo_%C3%A9cole_des_ponts_paristech.svg.png" width="200"  height="200" hspace="200"/> </td>
<td> <img src="https://pbs.twimg.com/profile_images/1156541928193896448/5ihYIbCQ_200x200.png" width="200" height="200" /> </td>
</tr></table>

<br/>

<h1><center>Session 11 - Model Serving</center></h1>



<font size="3">This session is divided into **4** parts:
- **1. Package models and feature engineering**
- **2. Get new data and create inference function**
- **3. Package this code and create a FastAPI server**
- **4. Build a UI to request this API**


In each of these parts, some **guidelines** and **hints** are given for each task. 
Do not hesitate to check the links to documentation to understand the functions you use. 
    
The goal of this session is to **serve a trained model**: package it together with its feature engineering, expose it through an API, and query it from a UI.
</font>

In [1]:
%config Completer.use_jedi = False

# 1. Package models and feature engineering

## A - Data preprocessing

In [2]:
import json
import os

import pandas as pd

from config import ROOT_DIRPATH, COLS_TO_DROP_PREPROCESSING
from lib.preprocessing.encode import (encode_movie_data,
                                      get_encoded_collections_df,
                                      get_encoded_actors_df,
                                      get_mean_popularity)
from lib.utils.io import read_movies_entrees, read_movies_features

In [3]:
# Load the raw box-office figures and the movie features, then join them on the movie id
df_boxoffice = read_movies_entrees(
    os.path.join(ROOT_DIRPATH, 'data', 'french-box-office-29nov2020.json')
)
df_features = read_movies_features(
    os.path.join(ROOT_DIRPATH, 'data', 'movie-features-29nov2020.json')
)
data = pd.merge(df_boxoffice, df_features, on='id')
# Keep only the movies whose sales figure is present and non-zero
data = data[data['sales'].notna() & data['sales'].ne(0)]

In [4]:
# the preprocessing code has been packaged in a function
# No budget_median / runtime_mean is passed here; the function logs a budget median
# and a runtime mean (see the output below), presumably computed from `data`.
data_final_cal = encode_movie_data(data)
data_final_cal.head()

2021-04-26 15:28:10.218 | INFO     | lib.preprocessing.encode:encode_movie_data:164 - budget median: 25000000.0
2021-04-26 15:28:10.244 | INFO     | lib.preprocessing.encode:encode_movie_data:168 - runtime_mean: 101.67367174781708


Unnamed: 0,release_date,sales,is_part_of_collection,budget,runtime,original_lang_en,original_lang_es,original_lang_fr,original_lang_it,original_lang_ja,...,prod_GB,prod_OTHER,prod_US,vacances_zone_a,vacances_zone_b,vacances_zone_c,jour_ferie,holiday,month,cos_month
0,2019-10-16,786485,1,185000000.0,110.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,10,1.0
1,2019-05-01,1261701,1,25000000.0,135.0,0,0,1,0,0,...,0,0,0,0.0,0.0,1.0,1.0,2.0,5,-1.732051
2,2019-07-03,1370178,1,160000000.0,129.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,7,-1.732051
3,2019-12-04,785636,1,125000000.0,123.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,12,2.0
4,2019-02-06,1224811,1,129000000.0,104.0,1,0,0,0,0,...,0,1,1,0.0,0.0,0.0,0.0,0.0,2,1.0


In [5]:
# Values logged by encode_movie_data above, copied by hand; they are passed back
# to encode_movie_data later in the notebook when encoding a new movie.
BUDGET_MEDIAN = 25_000_000.0
RUNTIME_MEAN = 101.67367174781708

## B - Model training

In [6]:
from lib.preprocessing.preprocess import (clean_data, get_x_y,
                                          train_test_split_by_date,
                                          transform_target)
from lightgbm import LGBMRegressor

In [7]:
# LightGBM hyperparameters, unpacked into LGBMRegressor(**LGBM_BEST_PARAMS)
LGBM_BEST_PARAMS = dict(
    max_depth=70,
    n_estimators=80,
    num_leaves=31,
)

In [8]:
# Clean the encoded dataset (drop_2020=False presumably keeps the 2020 releases — confirm in lib).
# NOTE: this rebinds `data`, which held the merged raw frame until now; re-running the
# encoding cell above after this one would therefore feed it the cleaned frame instead.
data = clean_data(data_final_cal, drop_2020=False)

2021-04-26 15:28:16.386 | INFO     | lib.preprocessing.preprocess:clean_data:7 - cleaning data..


In [9]:
data.head()

Unnamed: 0_level_0,sales,is_part_of_collection,budget,runtime,original_lang_en,original_lang_es,original_lang_fr,original_lang_it,original_lang_ja,original_lang_other,...,prod_GB,prod_OTHER,prod_US,vacances_zone_a,vacances_zone_b,vacances_zone_c,jour_ferie,holiday,month,cos_month
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,139087,0,25000000.0,120.0,0,0,1,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,4.0,1,1.732051
2000-01-05,1463152,0,25000000.0,77.0,0,1,0,0,0,0,...,0,1,0,0.0,0.0,0.0,0.0,0.0,1,1.732051
2000-01-05,32954,0,25000000.0,116.0,1,0,0,0,0,0,...,1,0,0,0.0,0.0,0.0,0.0,0.0,1,1.732051
2000-01-05,66228,0,22000000.0,142.0,1,0,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,1,1.732051
2000-01-12,25224,0,9000000.0,165.0,0,0,0,1,0,0,...,0,1,0,0.0,0.0,0.0,0.0,0.0,1,1.732051


In [39]:
# Split chronologically using the two cut-off dates below
train_data, validation_data, test_data = train_test_split_by_date(
    data, '2018-01-01', '2020-01-01'
)
# Separate features and target for each split, in the same order as the splits
(train_x, train_y), (validation_x, validation_y), (test_x, test_y) = (
    get_x_y(split) for split in (train_data, validation_data, test_data)
)
# Untrained regressor configured with the selected hyperparameters
lgbm = LGBMRegressor(**LGBM_BEST_PARAMS)

In [11]:
from lib.evaluation.evaluate import evaluate
from lib.modelling.training import save_model, train
from loguru import logger

In [12]:
# Fit the regressor on the training split; `train` hands the model back and logs
# training metrics (mape / rmse / mae, see the output below).
lgbm = train(lgbm, train_x, train_y, transformer=transform_target)
# Report the same metrics on the validation split, then on the test split.
logger.info("Evaluate on validation set ...")
evaluate(lgbm, validation_x, validation_y, transformer=transform_target)
logger.info("Evaluate on test set...")
evaluate(lgbm, test_x, test_y, transformer=transform_target)

2021-04-26 15:28:23.036 | INFO     | lib.modelling.training:train:7 - start fitting a <class 'lightgbm.sklearn.LGBMRegressor'>...




2021-04-26 15:28:23.469 | INFO     | lib.modelling.training:train:13 - {'mape': 259.7951138784969, 'rmse': 248446.17436903322, 'mae': 111806.91020683752}
2021-04-26 15:28:23.471 | INFO     | __main__:<module>:2 - Evaluate on validation set ...
2021-04-26 15:28:23.488 | INFO     | lib.evaluation.evaluate:evaluate:33 - {'mape': 385.7822207702718, 'rmse': 245750.790776571, 'mae': 106328.76600547637}
2021-04-26 15:28:23.489 | INFO     | __main__:<module>:4 - Evaluate on test set...
2021-04-26 15:28:23.499 | INFO     | lib.evaluation.evaluate:evaluate:33 - {'mape': 247.92134224671875, 'rmse': 121393.32992711238, 'mae': 68727.43167489658}


## C - Save trained model

In [13]:
from config import ROOT_DIRPATH

# Where the trained LightGBM model is written, and later read back for inference
LGBM_MODEL_FILEPATH = os.path.join(ROOT_DIRPATH, "models", "light_gbm_model.txt")

In [14]:
def save_model(model: LGBMRegressor, filepath: str):
    """Write the booster of a fitted ``LGBMRegressor`` to ``filepath``.

    The underlying booster is saved with ``num_iteration`` set to the model's
    ``best_iteration_``, and the destination path is logged afterwards.

    NOTE(review): a ``save_model`` is also imported from ``lib.modelling.training``
    earlier in the notebook; this definition shadows it.

    Args:
        model: fitted LightGBM scikit-learn regressor.
        filepath: destination file for the serialized booster.
    """
    booster = model.booster_
    booster.save_model(filepath, num_iteration=model.best_iteration_)
    logger.info(f'Model saved to {filepath}')

In [15]:
save_model(lgbm, LGBM_MODEL_FILEPATH)

2021-04-26 15:28:29.040 | INFO     | __main__:save_model:3 - Model saved to /Users/hugo/Documents/PONTS/french-box-office/models/light_gbm_model.txt


# 2. Get new data and create inference function

## A - Query TMDb API to get new movie data

In [16]:
# if you don't want to make API calls, run this cell and skip the following ones that query the API
movie_card = json.loads('{"tmdb_id": 577242, "adult": false, "belongs_to_collection": {}, "budget": 17516235, "genres": [{"id": 12, "name": "Aventure"}, {"id": 35, "name": "Com\\u00e9die"}], "imdb_id": "tt9844322", "original_language": "fr", "original_title": "Kaamelott : Premier volet", "overview": "La suite sur grand \\u00e9cran de la s\\u00e9rie culte d\'Alexandre Astier, version d\\u00e9cal\\u00e9e de la l\\u00e9gende des Chevaliers de la Table Ronde. R\\u00e9fugi\\u00e9 \\u00e0 Rome, le Roi Arthur y fait son grand retour pour s\'opposer \\u00e0 l\'arm\\u00e9e de son ancien ami Lancelot.", "tmdb_popularity": 4.833, "production_companies": [{"id": 2902, "name": "SND", "origin_country": "FR"}], "production_countries": [{"iso_code": "FR", "name": "France"}], "release_date": "2021-07-21", "revenue": 0, "runtime": 0, "languages": [{"iso_code": "fr", "name": "Fran\\u00e7ais"}], "status": "Post Production", "tagline": "La patience est un plat qui se mange sans sauce.", "title": "Kaamelott : Premier volet", "tmdb_vote_count": 0, "tmdb_vote_average": 0.0, "cast": [{"adult": false, "gender": 2, "tmdb_id": 47826, "name": "Alexandre Astier", "tmdb_popularity": 1.4, "order": 0}, {"adult": false, "gender": 2, "tmdb_id": 145231, "name": "Lionnel Astier", "tmdb_popularity": 1.283, "order": 1}, {"adult": false, "gender": 1, "tmdb_id": 204034, "name": "Anne Girouard", "tmdb_popularity": 1.213, "order": 2}, {"adult": false, "gender": 2, "tmdb_id": 1243291, "name": "Thomas Cousseau", "tmdb_popularity": 0.728, "order": 3}, {"adult": false, "gender": 2, "tmdb_id": 1243293, "name": "Franck Pitiot", "tmdb_popularity": 0.6, "order": 4}, {"adult": false, "gender": 2, "tmdb_id": 1243294, "name": "Jean-Christophe Hembert", "tmdb_popularity": 1.614, "order": 5}, {"adult": false, "gender": 1, "tmdb_id": 219708, "name": "Audrey Fleurot", "tmdb_popularity": 5.294, "order": 6}, {"adult": false, "gender": 2, "tmdb_id": 219707, "name": "Jacques Chambon", "tmdb_popularity": 0.694, "order": 7}, 
{"adult": false, "gender": 2, "tmdb_id": 46280, "name": "Antoine de Caunes", "tmdb_popularity": 2.364, "order": 8}, {"adult": false, "gender": 2, "tmdb_id": 4275, "name": "Alain Chabat", "tmdb_popularity": 1.873, "order": 9}, {"adult": false, "gender": 2, "tmdb_id": 1372039, "name": "Lo\\u00efc Varraut", "tmdb_popularity": 0.6, "order": 10}, {"adult": false, "gender": 1, "tmdb_id": 1316265, "name": "Jo\\u00eblle Sevilla", "tmdb_popularity": 0.84, "order": 11}, {"adult": false, "gender": 2, "tmdb_id": 1913754, "name": "Bruno Fontaine", "tmdb_popularity": 0.6, "order": 12}, {"adult": false, "gender": 0, "tmdb_id": 1856314, "name": "Jean-Robert Lombard", "tmdb_popularity": 0.6, "order": 13}, {"adult": false, "gender": 2, "tmdb_id": 41031, "name": "Fran\\u00e7ois Rollin", "tmdb_popularity": 1.109, "order": 14}, {"adult": false, "gender": 1, "tmdb_id": 2214804, "name": "Caroline Ferrus", "tmdb_popularity": 0.6, "order": 15}, {"adult": false, "gender": 2, "tmdb_id": 1152669, "name": "Guillaume Briat", "tmdb_popularity": 0.6, "order": 16}, {"adult": false, "gender": 2, "tmdb_id": 219705, "name": "Nicolas Gabion", "tmdb_popularity": 0.98, "order": 17}, {"adult": false, "gender": 2, "tmdb_id": 28781, "name": "Christian Clavier", "tmdb_popularity": 4.909, "order": 18}, {"adult": false, "gender": 2, "tmdb_id": 77929, "name": "Fran\\u00e7ois Morel", "tmdb_popularity": 2.643, "order": 19}, {"adult": false, "gender": 2, "tmdb_id": 6554, "name": "Guillaume Gallienne", "tmdb_popularity": 1.668, "order": 20}, {"adult": false, "gender": 2, "tmdb_id": 24891, "name": "Clovis Cornillac", "tmdb_popularity": 1.596, "order": 21}, {"adult": false, "gender": 2, "tmdb_id": 982, "name": "Sting", "tmdb_popularity": 4.406, "order": 22}, {"adult": false, "gender": 0, "tmdb_id": 586758, "name": "Marie-Christine Orry", "tmdb_popularity": 1.38, "order": 23}, {"adult": false, "gender": 1, "tmdb_id": 1574596, "name": "Jehnny Beth", "tmdb_popularity": 0.6, "order": 24}, {"adult": false, "gender": 0, 
"tmdb_id": 587147, "name": "Brice Fournier", "tmdb_popularity": 0.675, "order": 26}, {"adult": false, "gender": 0, "tmdb_id": 225853, "name": "Serge Papagalli", "tmdb_popularity": 0.98, "order": 27}, {"adult": false, "gender": 2, "tmdb_id": 114953, "name": "G\\u00e9raldine Nakache", "tmdb_popularity": 0.958, "order": 28}, {"adult": false, "gender": 0, "tmdb_id": 2442062, "name": "Gilles Graveleau", "tmdb_popularity": 0.6, "order": 29}, {"adult": false, "gender": 0, "tmdb_id": 2837722, "name": "St\\u00e9phane Margot", "tmdb_popularity": 0.6, "order": 30}, {"adult": false, "gender": 2, "tmdb_id": 2442059, "name": "Aur\\u00e9lien Portehaut", "tmdb_popularity": 0.6, "order": 31}, {"adult": false, "gender": 2, "tmdb_id": 1536874, "name": "Etienne Fague", "tmdb_popularity": 0.6, "order": 32}, {"adult": false, "gender": 2, "tmdb_id": 134216, "name": "Carlo Brandt", "tmdb_popularity": 1.396, "order": 33}, {"adult": false, "gender": 2, "tmdb_id": 2625427, "name": "Pascal Vincent", "tmdb_popularity": 0.6, "order": 34}, {"adult": false, "gender": 1, "tmdb_id": 146491, "name": "Valerie K\\u00e9ruzor\\u00e9", "tmdb_popularity": 3.57, "order": 35}, {"adult": false, "gender": 0, "tmdb_id": 2475712, "name": "Mehdi Rahim-Silvioli", "tmdb_popularity": 0.6, "order": 36}, {"adult": false, "gender": 2, "tmdb_id": 1636467, "name": "David Ayala", "tmdb_popularity": 1.38, "order": 37}, {"adult": false, "gender": 0, "tmdb_id": 1090662, "name": "Jean-charles Simon", "tmdb_popularity": 0.6, "order": 38}, {"adult": false, "gender": 0, "tmdb_id": 2837723, "name": "Lamari Amine", "tmdb_popularity": 0.6, "order": 39}, {"adult": false, "gender": 0, "tmdb_id": 2837724, "name": "H\\u00e9l\\u00e8ne Rudermann", "tmdb_popularity": 0.6, "order": 40}, {"adult": false, "gender": 0, "tmdb_id": 2837725, "name": "Yazan Al-Mashni", "tmdb_popularity": 0.6, "order": 41}, {"adult": false, "gender": 0, "tmdb_id": 2837726, "name": "Neil Astier", "tmdb_popularity": 0.6, "order": 42}, {"adult": false, "gender": 0, 
"tmdb_id": 1865891, "name": "Tigran Mekhitarian", "tmdb_popularity": 0.6, "order": 43}, {"adult": false, "gender": 0, "tmdb_id": 2837727, "name": "Oc\\u00e9ane Slim", "tmdb_popularity": 0.6, "order": 44}, {"adult": false, "gender": 0, "tmdb_id": 2837728, "name": "Antoine Bordes", "tmdb_popularity": 0.6, "order": 45}], "id": 577242, "query": "Kaamelott : Premier volet", "year": 2021, "first_week_sales": null}')

In [95]:
## register here to get an API key : https://developers.themoviedb.org/3/getting-started/introduction

from lib.crawling.movie_features.tmdb.client import TMDbClient
import os

In [119]:
import dotenv

In [120]:
dotenv.load_dotenv(dotenv.find_dotenv())

True

In [97]:
# if you didn't manage to do `export TMDB_API_KEY='My API key'`, uncomment and run the following lines
## TMDB_API_KEY = 'My API key'
## os.environ["TMDB_API_KEY"] = TMDB_API_KEY

In [None]:
MOVIE_TITLE = "Kaamelott : Premier volet"

In [23]:
# Query TMDb for the features of MOVIE_TITLE.
# NOTE: this overwrites the offline `movie_card` built from the JSON string above, if that cell was run.
# NOTE(review): TMDbClient presumably reads the TMDB_API_KEY environment variable — confirm.
tmdb_client = TMDbClient()
movie_card = tmdb_client.find_movie_features(MOVIE_TITLE)

In [24]:
# If the response is not empty, add the extra fields to the movie card
if movie_card:
    # expose the TMDb id under the generic 'id' key as well
    movie_card['id'] = movie_card['tmdb_id']
    # keep track of the title that was searched for
    movie_card['query'] = MOVIE_TITLE
    # release_date looks like "2021-07-21", so its first four characters are the year
    movie_card["year"] = int(movie_card['release_date'][:4])
    # placeholder: no sales figure is attached to a freshly queried movie
    movie_card["first_week_sales"] = None

In [114]:
def query_movie_data_from_title(title: str) -> dict:
    """Look up ``title`` on TMDb and return its movie card with a few extra fields.

    When the client returns a non-empty card, the keys ``id`` (copy of
    ``tmdb_id``), ``query`` (the searched title), ``year`` (first four characters
    of ``release_date`` as an int) and ``first_week_sales`` (``None``) are added.
    An empty response is returned unchanged.

    Args:
        title: movie title to search for.

    Returns:
        The (possibly enriched) response of ``TMDbClient.find_movie_features``.
    """
    card = TMDbClient().find_movie_features(title)
    if not card:
        # Nothing came back from the API: hand the empty response to the caller as is
        return card
    card['id'] = card['tmdb_id']
    card['query'] = title
    card["year"] = int(card['release_date'][:4])
    card["first_week_sales"] = None
    return card

In [48]:
#MOVIE_JSON_RESULT = json.dumps(movie_card)

## B - Process data

In [17]:
import pandas as pd
from lib.preprocessing.load import get_dataset_from_api_res

In [43]:
movie_data = get_dataset_from_api_res(movie_card)

In [44]:
movie_data

Unnamed: 0,is_adult,is_part_of_collection,budget,genres,original_language,production_countries,languages,runtime,id,year,title,sales,release_date
0,False,False,17516235,"[Aventure, Comédie]",fr,[FR],[fr],0,577242,2021,Kaamelott : Premier volet,,2021-07-21


In [45]:
# Apply feature engineering and pre-processing
# the preprocessing code has been packaged in a function
# The statistics stored earlier (BUDGET_MEDIAN, RUNTIME_MEAN) are passed explicitly here.
# NOTE: `movie_data` is rebound to the encoded frame, so re-running this cell alone
# would pass already-encoded data to encode_movie_data.
movie_data = encode_movie_data(movie_data, budget_median=BUDGET_MEDIAN, runtime_mean=RUNTIME_MEAN)
movie_data.head()

Unnamed: 0,release_date,sales,is_part_of_collection,budget,runtime,original_lang_en,original_lang_es,original_lang_fr,original_lang_it,original_lang_ja,...,prod_GB,prod_OTHER,prod_US,vacances_zone_a,vacances_zone_b,vacances_zone_c,jour_ferie,holiday,month,cos_month
0,2021-07-21,0,0,17516235.0,101.673672,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0,3.0,7,-1.732051


In [46]:
# Same clean_data call as for the training data, applied to the single new movie.
data_clean = clean_data(movie_data, drop_2020=False)
# Only the features are kept; the second value returned by get_x_y is discarded.
X, _ = get_x_y(data_clean)

2021-04-26 15:32:55.514 | INFO     | lib.preprocessing.preprocess:clean_data:7 - cleaning data..


## C - Predict using the saved model

In [84]:
import lightgbm as lgb

model = lgb.Booster(model_file=LGBM_MODEL_FILEPATH)

In [85]:
def predict(model, features, transformer=None):
    """Run ``model.predict`` on ``features`` and optionally post-process the result.

    Args:
        model: any object exposing a ``predict(features)`` method.
        features: input passed unchanged to ``model.predict``.
        transformer: optional callable; when truthy it is called as
            ``transformer(raw_output, forward=False)`` and its result is returned.

    Returns:
        The raw model output, or the transformer's result when one is given.
    """
    raw_output = model.predict(features)
    if not transformer:
        return raw_output
    return transformer(raw_output, forward=False)

In [88]:
predictions = predict(model, X, transformer=transform_target)

In [89]:
predictions[0]

72623.28705647879

## D - Wrapping up: write the inference function

In [107]:
def infer_from_movie_title(title: str) -> tuple:
    """Fetch a movie from TMDb by title and predict its sales with the saved model.

    The movie card is queried, converted to a dataset, encoded with the stored
    ``BUDGET_MEDIAN`` / ``RUNTIME_MEAN``, cleaned, and fed to the LightGBM booster
    loaded from ``LGBM_MODEL_FILEPATH``. The raw output is mapped back through
    ``transform_target`` (called with ``forward=False`` by ``predict``).

    Args:
        title: movie title to search for.

    Returns:
        A ``(movie_card, prediction)`` tuple: the movie card returned by
        ``query_movie_data_from_title`` and the first element of the prediction array.

    Raises:
        ValueError: if no movie data could be retrieved for ``title``.
    """
    movie_card = query_movie_data_from_title(title)
    if not movie_card:
        # Fail early with a clear message instead of an obscure error deep in preprocessing
        raise ValueError(f"No TMDb data found for movie title: {title!r}")
    movie_data = get_dataset_from_api_res(movie_card)
    movie_data = encode_movie_data(movie_data, budget_median=BUDGET_MEDIAN, runtime_mean=RUNTIME_MEAN)
    data_clean = clean_data(movie_data, drop_2020=False)
    X, _ = get_x_y(data_clean)
    # The model is re-read from disk on every call, as in the step-by-step cells above
    model = lgb.Booster(model_file=LGBM_MODEL_FILEPATH)
    predictions = predict(model, X, transformer=transform_target)
    return movie_card, predictions[0]

In [116]:
# choose a movie from here
NEW_MOVIE = "Nomadland"
movie_card, prediction = infer_from_movie_title(NEW_MOVIE)
print(prediction)

2021-04-26 16:01:27.535 | INFO     | lib.preprocessing.preprocess:clean_data:7 - cleaning data..


53121.10894798973


In [117]:
from pprint import pprint

In [118]:
pprint(movie_card)

{'adult': False,
 'belongs_to_collection': {},
 'budget': 5000000,
 'cast': [{'adult': False,
           'gender': 1,
           'name': 'Frances McDormand',
           'order': 0,
           'tmdb_id': 3910,
           'tmdb_popularity': 12.053},
          {'adult': False,
           'gender': 2,
           'name': 'David Strathairn',
           'order': 1,
           'tmdb_id': 11064,
           'tmdb_popularity': 6.894},
          {'adult': False,
           'gender': 1,
           'name': 'Linda May',
           'order': 2,
           'tmdb_id': 2241214,
           'tmdb_popularity': 1.417},
          {'adult': False,
           'gender': 1,
           'name': 'Swankie',
           'order': 3,
           'tmdb_id': 2776341,
           'tmdb_popularity': 1.473},
          {'adult': False,
           'gender': 0,
           'name': 'Gay DeForest',
           'order': 4,
           'tmdb_id': 2786070,
           'tmdb_popularity': 1.411},
          {'adult': False,
           'gender'

# 3. Create the service that makes predictions

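To run the server, start the FastAPI app so that it listens on port 8080 (the port queried in part 4), for example with `uvicorn <module>:app --host 0.0.0.0 --port 8080`, replacing `<module>` with the module that defines the app.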

# 4. Make requests against the API server and build a UI client

In [41]:
import requests
import json

In [61]:
# 0.0.0.0 is an address servers bind to, not a portable destination for clients
# (it fails on some platforms), so the request targets the loopback interface.
# The timeout keeps this cell from hanging forever when the server is not running.
res = requests.post('http://127.0.0.1:8080/predict', json={'user': 'john'}, timeout=10)

In [62]:
res.content

b'"hello john"'