In [None]:
# Copyright (C) 2020 Artefact
# licence-information@artefact.com

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

<table><tr>
<td> <img src="https://upload.wikimedia.org/wikipedia/fr/thumb/e/e5/Logo_%C3%A9cole_des_ponts_paristech.svg/676px-Logo_%C3%A9cole_des_ponts_paristech.svg.png" width="200"  height="200" hspace="200"/> </td>
<td> <img src="https://pbs.twimg.com/profile_images/1156541928193896448/5ihYIbCQ_200x200.png" width="200" height="200" /> </td>
</tr></table>

<br/>

<h1><center>Session 11 - Model Serving</center></h1>



<font size="3">This session is divided into **4** parts:
- **1. Package models and feature engineering**
- **2. Get new data and create inference function**
- **3. Package this code and create a FastAPI server**
- **4. Build a UI to request this API**


In each of these parts, some **guidelines** and **hints** are given for each task. 
Do not hesitate to check the links to documentation to understand the functions you use. 
    
The goal of this session is to **serve a trained model**: package it together with its feature engineering, expose it through an API, and query that API from a user interface.
</font>

In [1]:
# Set IPython's Completer.use_jedi option to False (tab-completion then no longer goes through Jedi).
%config Completer.use_jedi = False

# 1. Package models and feature engineering

## A - Data preprocessing

In this part, we are going to run the code we wrote in the previous sessions, in order to save our trained models.

In [2]:
# To import the libraries from the package, Python must know where your copy of the
# french-box-office folder lives. For example, if the folder is at C:/path/to/french-box-office,
# the path below is correct; otherwise replace its components with the valid path on your computer.
import os
import sys

PROJECT_DIRPATH = os.path.join('C:/', 'path', 'to', 'french-box-office')

# Only add the folder once, so that re-running this cell does not duplicate the entry in sys.path.
if PROJECT_DIRPATH not in sys.path:
    sys.path.append(PROJECT_DIRPATH)

In [5]:
# import libraries
import pandas as pd
from config import ROOT_DIRPATH
from lib.preprocessing.encode import (encode_movie_data,
                                      get_encoded_collections_df,
                                      get_encoded_actors_df,
                                      get_mean_popularity)
from lib.utils.io import read_movies_entrees, read_movies_features
import os

In [6]:
# To make sure this notebook runs, please make sure you have pandas 1.0.3 installed.
# The expression at the end of this cell displays the installed version.
# If you need to reinstall, run the following command in a cell of its own:
# %pip install pandas==1.0.3
pd.__version__

'1.0.3'

In [7]:
# Load the box-office entries and the movie features, then join both tables on the movie id.
boxoffice_filepath = os.path.join(ROOT_DIRPATH, 'data', 'french-box-office-29nov2020.json')
features_filepath = os.path.join(ROOT_DIRPATH, 'data', 'movie-features-29nov2020.json')
df_boxoffice = read_movies_entrees(boxoffice_filepath)
df_features = read_movies_features(features_filepath)
data = pd.merge(df_boxoffice, df_features, on='id')

# Keep only the rows whose sales figure is present and different from zero.
has_sales = data['sales'].notna() & (data['sales'] != 0)
data = data.loc[has_sales]

In [8]:
# The preprocessing code has been wrapped in a function so that it can be re-used on new data.
# While running, it logs the budget median and the runtime mean of this dataset (see the output below).
data_final_cal = encode_movie_data(data)
data_final_cal.head()

2021-05-03 15:22:19.778 | INFO     | lib.preprocessing.encode:encode_movie_data:182 - budget median: 25000000.0
2021-05-03 15:22:19.808 | INFO     | lib.preprocessing.encode:encode_movie_data:186 - runtime_mean: 101.67367174781708


Unnamed: 0,release_date,sales,is_part_of_collection,budget,runtime,original_lang_en,original_lang_es,original_lang_fr,original_lang_it,original_lang_ja,...,prod_GB,prod_OTHER,prod_US,vacances_zone_a,vacances_zone_b,vacances_zone_c,jour_ferie,holiday,month,cos_month
0,2019-10-16,786485,1,185000000.0,110.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,10,1.0
1,2019-05-01,1261701,1,25000000.0,135.0,0,0,1,0,0,...,0,0,0,0.0,0.0,1.0,1.0,2.0,5,-1.732051
2,2019-07-03,1370178,1,160000000.0,129.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,7,-1.732051
3,2019-12-04,785636,1,125000000.0,123.0,1,0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.0,0.0,12,2.0
4,2019-02-06,1224811,1,129000000.0,104.0,1,0,0,0,0,...,0,1,1,0.0,0.0,0.0,0.0,0.0,2,1.0


In [None]:
# Let's store these values for later: we want to fill missing values in new data with the statistics
# of our training dataset. Both numbers are copied from the log output of encode_movie_data above
# ("budget median" and "runtime_mean"); update them by hand if the training data changes.
BUDGET_MEDIAN = 25000000.0
RUNTIME_MEAN = 101.67367174781708

## B - Model training

In this part, we are re-training our lightGBM model in order to save it and use it to make a prediction on a new dataset.

In [None]:
from lib.preprocessing.preprocess import (clean_data, get_x_y,
                                          train_test_split_by_date,
                                          transform_target)
from lightgbm import LGBMRegressor

In [None]:
# We store our LightGBM hyperparameters in a constant.
# The dict is unpacked as keyword arguments when the LGBMRegressor is created further down.
LGBM_BEST_PARAMS = {
    "max_depth": 70,
    "n_estimators": 80,
    "num_leaves": 31,
}

In [None]:
# We need to preprocess our encoded dataset before training the model.
# Note: this rebinds `data`, which until now held the merged and filtered raw dataset.
data = clean_data(data_final_cal, drop_2020=False)

In [None]:
# Let's check how our DataFrame has been processed by displaying its first rows
data.head()

In [None]:
# Split our dataset by date into train / validation / test subsets, as usual,
# then separate each subset into its X and y parts.
split_dates = ('2018-01-01', '2020-01-01')
train_data, validation_data, test_data = train_test_split_by_date(data, *split_dates)
(train_x, train_y), (validation_x, validation_y), (test_x, test_y) = map(
    get_x_y, (train_data, validation_data, test_data)
)

# Regressor configured with the hyperparameters stored in LGBM_BEST_PARAMS (not trained yet).
lgbm = LGBMRegressor(**LGBM_BEST_PARAMS)

In [None]:
# we are re-using the function of the optimization course 
from lib.evaluation.evaluate import evaluate
from lib.modelling.training import save_model, train
from loguru import logger

In [None]:
# A good coding practice is to use logging functions rather than print() to output information.
# Log messages carry a severity level, e.g. DEBUG, INFO, SUCCESS or ERROR.
# loguru is a very simple library to get a logger.
# The same transform_target function is handed to both the training and the evaluation helpers.
lgbm = train(lgbm, train_x, train_y, transformer=transform_target)
logger.info("Evaluate on validation set ...")
evaluate(lgbm, validation_x, validation_y, transformer=transform_target)
logger.info("Evaluate on test set...")
evaluate(lgbm, test_x, test_y, transformer=transform_target)

## C - Save trained model

Now that we have a trained model, we need to save its weights in order to load it later

In [None]:
# Exercise: Write a function that saves the model to a file
# The function takes a LGBMRegressor and the filename of the saved model as inputs
# The function won't return anything, but will log an info message "Model saved to [filepath]"
# Hint: search on stackoverflow how to save a lightGBM model to a file
# Note: defining save_model here shadows the save_model imported from lib.modelling.training above

def save_model(...):
    ...

In [None]:
# Now apply the function to save your trained model (replace the ... with your arguments)
save_model(...)

# 2. Get new data and create inference function

## A - Query TMDb API to get new movie data

In this part, we are going to fetch new data by querying the [TMDB API](https://www.themoviedb.org/). 

**1. Create an account on the TMDb developer portal to get an API key**

Movie features are extracted via [The Movie Database API](https://developers.themoviedb.org/3/getting-started/introduction). Create an account on their website and export your API key.

**2. Store your API Key in a safe place**

These kinds of credentials are considered "secrets", so you don't want to share them with anyone. NEVER version them or write them down in your code. 
A good practice is to set an environment variable. In a terminal shell, run the following command:

- On Windows (Command Prompt; do not use quotes, otherwise they become part of the value):
```bat
set TMDB_API_KEY=your-key-here
```
- On Mac/Linux:
```bash
export TMDB_API_KEY='your-key-here'
```

Then, to access it in python, you can use the following lib:
```python
import os

print(os.environ['TMDB_API_KEY'])
```

Another method is to use the `python-dotenv` library:
- Make a copy of the `.env_template` file and save it as `.env`
- Copy your key to the file
- Now, you can load the environment variable directly in your python code with the following:
```python
import dotenv

dotenv.load_dotenv(dotenv.find_dotenv())
```
- Then you can access it in your python code using the previous `os.environ` lookup

In [None]:
import json

In [None]:
# If you don't want to make API calls, uncomment the whole statement below (all four lines) and run this cell
# instead of the following ones. The JSON text is wrapped in a triple-quoted string because it spans several lines.

# movie_card = json.loads('''{"tmdb_id": 577242, "adult": false, "belongs_to_collection": {}, "budget": 17516235, "genres": [{"id": 12, "name": "Aventure"}, {"id": 35, "name": "Com\\u00e9die"}], "imdb_id": "tt9844322", "original_language": "fr", "original_title": "Kaamelott : Premier volet", "overview": "La suite sur grand \\u00e9cran de la s\\u00e9rie culte d\'Alexandre Astier, version d\\u00e9cal\\u00e9e de la l\\u00e9gende des Chevaliers de la Table Ronde. R\\u00e9fugi\\u00e9 \\u00e0 Rome, le Roi Arthur y fait son grand retour pour s\'opposer \\u00e0 l\'arm\\u00e9e de son ancien ami Lancelot.", "tmdb_popularity": 4.833, "production_companies": [{"id": 2902, "name": "SND", "origin_country": "FR"}], "production_countries": [{"iso_code": "FR", "name": "France"}], "release_date": "2021-07-21", "revenue": 0, "runtime": 0, "languages": [{"iso_code": "fr", "name": "Fran\\u00e7ais"}], "status": "Post Production", "tagline": "La patience est un plat qui se mange sans sauce.", "title": "Kaamelott : Premier volet", "tmdb_vote_count": 0, "tmdb_vote_average": 0.0, "cast": [{"adult": false, "gender": 2, "tmdb_id": 47826, "name": "Alexandre Astier", "tmdb_popularity": 1.4, "order": 0}, {"adult": false, "gender": 2, "tmdb_id": 145231, "name": "Lionnel Astier", "tmdb_popularity": 1.283, "order": 1}, {"adult": false, "gender": 1, "tmdb_id": 204034, "name": "Anne Girouard", "tmdb_popularity": 1.213, "order": 2}, {"adult": false, "gender": 2, "tmdb_id": 1243291, "name": "Thomas Cousseau", "tmdb_popularity": 0.728, "order": 3}, {"adult": false, "gender": 2, "tmdb_id": 1243293, "name": "Franck Pitiot", "tmdb_popularity": 0.6, "order": 4}, {"adult": false, "gender": 2, "tmdb_id": 1243294, "name": "Jean-Christophe Hembert", "tmdb_popularity": 1.614, "order": 5}, {"adult": false, "gender": 1, "tmdb_id": 219708, "name": "Audrey Fleurot", "tmdb_popularity": 5.294, "order": 6}, {"adult": false, "gender": 2, "tmdb_id": 219707, "name": "Jacques Chambon", "tmdb_popularity": 0.694, "order": 7}, 
# {"adult": false, "gender": 2, "tmdb_id": 46280, "name": "Antoine de Caunes", "tmdb_popularity": 2.364, "order": 8}, {"adult": false, "gender": 2, "tmdb_id": 4275, "name": "Alain Chabat", "tmdb_popularity": 1.873, "order": 9}, {"adult": false, "gender": 2, "tmdb_id": 1372039, "name": "Lo\\u00efc Varraut", "tmdb_popularity": 0.6, "order": 10}, {"adult": false, "gender": 1, "tmdb_id": 1316265, "name": "Jo\\u00eblle Sevilla", "tmdb_popularity": 0.84, "order": 11}, {"adult": false, "gender": 2, "tmdb_id": 1913754, "name": "Bruno Fontaine", "tmdb_popularity": 0.6, "order": 12}, {"adult": false, "gender": 0, "tmdb_id": 1856314, "name": "Jean-Robert Lombard", "tmdb_popularity": 0.6, "order": 13}, {"adult": false, "gender": 2, "tmdb_id": 41031, "name": "Fran\\u00e7ois Rollin", "tmdb_popularity": 1.109, "order": 14}, {"adult": false, "gender": 1, "tmdb_id": 2214804, "name": "Caroline Ferrus", "tmdb_popularity": 0.6, "order": 15}, {"adult": false, "gender": 2, "tmdb_id": 1152669, "name": "Guillaume Briat", "tmdb_popularity": 0.6, "order": 16}, {"adult": false, "gender": 2, "tmdb_id": 219705, "name": "Nicolas Gabion", "tmdb_popularity": 0.98, "order": 17}, {"adult": false, "gender": 2, "tmdb_id": 28781, "name": "Christian Clavier", "tmdb_popularity": 4.909, "order": 18}, {"adult": false, "gender": 2, "tmdb_id": 77929, "name": "Fran\\u00e7ois Morel", "tmdb_popularity": 2.643, "order": 19}, {"adult": false, "gender": 2, "tmdb_id": 6554, "name": "Guillaume Gallienne", "tmdb_popularity": 1.668, "order": 20}, {"adult": false, "gender": 2, "tmdb_id": 24891, "name": "Clovis Cornillac", "tmdb_popularity": 1.596, "order": 21}, {"adult": false, "gender": 2, "tmdb_id": 982, "name": "Sting", "tmdb_popularity": 4.406, "order": 22}, {"adult": false, "gender": 0, "tmdb_id": 586758, "name": "Marie-Christine Orry", "tmdb_popularity": 1.38, "order": 23}, {"adult": false, "gender": 1, "tmdb_id": 1574596, "name": "Jehnny Beth", "tmdb_popularity": 0.6, "order": 24}, {"adult": false, "gender": 0, 
# "tmdb_id": 587147, "name": "Brice Fournier", "tmdb_popularity": 0.675, "order": 26}, {"adult": false, "gender": 0, "tmdb_id": 225853, "name": "Serge Papagalli", "tmdb_popularity": 0.98, "order": 27}, {"adult": false, "gender": 2, "tmdb_id": 114953, "name": "G\\u00e9raldine Nakache", "tmdb_popularity": 0.958, "order": 28}, {"adult": false, "gender": 0, "tmdb_id": 2442062, "name": "Gilles Graveleau", "tmdb_popularity": 0.6, "order": 29}, {"adult": false, "gender": 0, "tmdb_id": 2837722, "name": "St\\u00e9phane Margot", "tmdb_popularity": 0.6, "order": 30}, {"adult": false, "gender": 2, "tmdb_id": 2442059, "name": "Aur\\u00e9lien Portehaut", "tmdb_popularity": 0.6, "order": 31}, {"adult": false, "gender": 2, "tmdb_id": 1536874, "name": "Etienne Fague", "tmdb_popularity": 0.6, "order": 32}, {"adult": false, "gender": 2, "tmdb_id": 134216, "name": "Carlo Brandt", "tmdb_popularity": 1.396, "order": 33}, {"adult": false, "gender": 2, "tmdb_id": 2625427, "name": "Pascal Vincent", "tmdb_popularity": 0.6, "order": 34}, {"adult": false, "gender": 1, "tmdb_id": 146491, "name": "Valerie K\\u00e9ruzor\\u00e9", "tmdb_popularity": 3.57, "order": 35}, {"adult": false, "gender": 0, "tmdb_id": 2475712, "name": "Mehdi Rahim-Silvioli", "tmdb_popularity": 0.6, "order": 36}, {"adult": false, "gender": 2, "tmdb_id": 1636467, "name": "David Ayala", "tmdb_popularity": 1.38, "order": 37}, {"adult": false, "gender": 0, "tmdb_id": 1090662, "name": "Jean-charles Simon", "tmdb_popularity": 0.6, "order": 38}, {"adult": false, "gender": 0, "tmdb_id": 2837723, "name": "Lamari Amine", "tmdb_popularity": 0.6, "order": 39}, {"adult": false, "gender": 0, "tmdb_id": 2837724, "name": "H\\u00e9l\\u00e8ne Rudermann", "tmdb_popularity": 0.6, "order": 40}, {"adult": false, "gender": 0, "tmdb_id": 2837725, "name": "Yazan Al-Mashni", "tmdb_popularity": 0.6, "order": 41}, {"adult": false, "gender": 0, "tmdb_id": 2837726, "name": "Neil Astier", "tmdb_popularity": 0.6, "order": 42}, {"adult": false, "gender": 0, 
# "tmdb_id": 1865891, "name": "Tigran Mekhitarian", "tmdb_popularity": 0.6, "order": 43}, {"adult": false, "gender": 0, "tmdb_id": 2837727, "name": "Oc\\u00e9ane Slim", "tmdb_popularity": 0.6, "order": 44}, {"adult": false, "gender": 0, "tmdb_id": 2837728, "name": "Antoine Bordes", "tmdb_popularity": 0.6, "order": 45}], "id": 577242, "query": "Kaamelott : Premier volet", "year": 2021, "first_week_sales": null}''')

In [None]:
# Let's import the TMDb API client
from lib.crawling.movie_features.tmdb.client import TMDbClient

In [None]:
# Let's load the TMDB_API_KEY, which stores our API token, into an environment variable:
# find_dotenv() looks for the .env file and load_dotenv() loads the variables it declares.
import dotenv

dotenv.load_dotenv(dotenv.find_dotenv())

In [None]:
# If you didn't manage to run `export TMDB_API_KEY='My API key'` or to set up the environment file, uncomment the
# following lines, paste your API key and run the cell. Remove the key again before sharing or versioning this notebook.

## TMDB_API_KEY = 'My API key'
## os.environ["TMDB_API_KEY"] = TMDB_API_KEY

In [None]:
# Exercise: Enter a valid movie title between the quotes and execute!
MOVIE_TITLE = ""

# Ask the TMDb client for the features of that title; the result is our "movie card"
tmdb_client = TMDbClient()
movie_card = tmdb_client.find_movie_features(MOVIE_TITLE)

In [None]:
# Exercise: print the result, using the pprint function to make it easier to read
from pprint import pprint

pprint(movie_card)

In [None]:
# Exercise: now try to extract the release year from this movie card, and store it in a
# variable "release_year". It has to be of type integer.
# Hint: look at the "release_date" field of the movie card
release_year = ...

In [None]:
# If the response is not empty, add the id, query, year and first_week_sales fields to the movie card
if movie_card:
    movie_card['id'] = movie_card['tmdb_id']
    movie_card['query'] = MOVIE_TITLE
    movie_card["year"] = release_year
    movie_card["first_week_sales"] = None

In [None]:
# Now let's wrap this in a function

def query_movie_data_from_title(tmdb_client, title: str):
    """Fetch the movie card of `title` and report whether the lookup succeeded.

    Args:
        tmdb_client: object exposing `find_movie_features(title)`.
        title: movie title sent to the client.

    Returns:
        A `(movie_card, status)` tuple. `status` is a dict with a "message" string
        and a "success" boolean. On success the card is enriched in place with the
        "id", "query", "year" and "first_week_sales" keys; on failure the (falsy)
        response of the client is returned unchanged and an error is logged.
    """
    card = tmdb_client.find_movie_features(title)

    # Guard clause: a falsy response means the movie was not found.
    if not card:
        status_message = f"Error: Movie {title} not found"
        logger.error(status_message)
        return card, {"message": status_message, "success": False}

    card['id'] = card['tmdb_id']
    card['query'] = title
    # The release year is the first four characters of the release date.
    card["year"] = int(card['release_date'][:4])
    # Sales are not known yet for a movie we want to forecast.
    card["first_week_sales"] = None
    return card, {"message": "Success", "success": True}

In [None]:
# Try with a title that doesn't exist, and print the status
MOVIE_TITLE = ""

tmdb_client = TMDbClient()
movie_card, status = query_movie_data_from_title(tmdb_client, MOVIE_TITLE)
print(status)

In [None]:
# Now try with the title of a real movie, and print the status
MOVIE_TITLE = ""

tmdb_client = TMDbClient()
movie_card, status = query_movie_data_from_title(tmdb_client, MOVIE_TITLE)
print(status)

## B - Process data

Now we are going to create a DataFrame out of our movie card in order to be able to apply our encoding functions on it.

In [None]:
import pandas as pd
from lib.preprocessing.load import get_dataset_from_api_res

# Build a dataset from the movie card returned by the API
movie_data = get_dataset_from_api_res(movie_card)

# Display it to check its content
movie_data

In [None]:
# Exercise: Now, let's apply our feature encoding function.
# Hint: Check at the beginning of the notebook. Look at the parameters and the docstring of the function
# (e.g. with SHIFT + TAB in Jupyter, or by running `encode_movie_data?`).
# What values would you pass to the function to handle missing values in the new dataset?
movie_data = ...
movie_data.head()

In [None]:
# Then, apply the data cleaning function, and get the X and y matrices in order to predict y_hat from X later.
# Replace each ... placeholder with your code.
data_clean = ...
X, y = ...

## C. Predict using the saved model

In this step, we load the model that has been previously saved, and then we make a prediction against our new data set. 

In [None]:
# Exercise: Create a lightGBM model from the file you saved earlier.
# Hint: have a look at this Stackoverflow page:
# https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
...

In [None]:
# Exercise: Now, create a function predict that takes the Booster model, the feature dataset X, and a transform_target function
# (as we log-normalized our target, we want to use transform_target() to do the inverse operation)
# Hint: Feel free to take this function as a basis: lib.evaluation.evaluate.evaluate()
# You need to find it in the codebase.
def predict(...):
    ...

In [None]:
# Now apply your predict function and store the prediction in `predictions`
predictions = ...

## D. Wrapping up: write the inference function

In [None]:
# Exercise: write a function that takes a movie title and returns a movie card.
# This function must:
# - Load the dotenv
# - Instantiate the TMDbClient, and fetch the movie card and the status from the API
# If status["success"] is False:
# - Return the status
# Else, if status["success"] is True:
# - Format a dataset from the movie card
# - Encode the movie dataset to get features
# - Clean the dataset
# - Create a dataframe X with the features
# - Load the lightGBM model from its filepath
# - Predict on X using the model
# - Store the prediction in the movie_card dict, in a key called box_office_sales_forecast
# - Return the movie_card dict
# Hint: find the relevant functions above

def infer_from_movie_title(...):
    ...

In [None]:
# Now choose a movie and apply your function to get a movie card.
# Print the result of the prediction (stored under the box_office_sales_forecast key)
NEW_MOVIE = ...
movie_card = infer_from_movie_title(NEW_MOVIE)
print(...)

In [None]:
# Let's see what the data looks like
from pprint import pprint

pprint(movie_card)

# 3. Create the service that makes predictions

The goal of this section is to put our inference function in the codebase, and to expose it through an API. 

First, you need to understand how the code repository is organized. Here is the structure of the tree:

```
├── README.md
├── app
│   ├── server.py -> the Web Server API that serves our model
│   └── client.py -> the User Interface we are going to build to query the API that serves our model
├── bin --> stands for "binary", ie. the executable scripts
│   ├── get_movie_features.py
│   ├── predict.py
│   ├── preprocess_data.py
│   └── train.py
├── config.py --> the place where we store the constants, our parameters.
├── data -> the historical data
│   ├── french-box-office-29nov2020.json
│   └── movie-features-29nov2020.json
├── lib -> stands for "library". The place where you put your function
│   ├── crawling
│   │   ├── boxoffice
│   │   └── movie_features
│   ├── evaluation
│   │   └── evaluate.py
│   ├── modelling
│   │   ├── predict.py
│   │   └── training.py
│   ├── preprocessing
│   │   ├── encode.py
│   │   ├── load.py
│   │   └── preprocess.py
│   ├── utils
│   │   ├── io.py
│   │   └── path.py
│   └── workflows
│       └── inference.py
├── models --> the folder where you store your model
│   └── light_gbm_model.txt
├── notebooks --> your notebooks
└── requirements.txt --> the library dependencies
```

**Questions**

* Where would you copy-paste the function?
* In which script will you call this function?
* Copy-paste the function at the right place in the code
* Execute the code

Hint: to run a python script, you need to:
```bash
conda activate french_box_office
python path/to/script.py
```

To stop the execution, press `CTRL + C`.
We are going to leave it running for the exercise.

# 4. Make a request against the API server and build a UI client

The goal of this part is to learn how to make a request to the API we built, and then build a client application with a nice UI for our end-users

## A - Make request

We want to send a request to our API. 

**Questions:**
* What is the host URI of the API?
* What is the port number?
* What is the API endpoint (=route) URL where we can send a POST request?
* How to format our POST request?

In [None]:
# Exercise: Make an API call to the server in Python
# Hint: The reference library is called requests. You need to pass a dict to query the API
...

## B - Parse response

In [None]:
# Each HTTP response comes with a status code.
# Print the status code of the response to your request
...

In [None]:
# Now make a request with a movie title that doesn't exist and print the status code
...

In [None]:
# Print the content of the response
...

## C. Build a UI client

To build a UI client, we are using the library [streamlit](https://docs.streamlit.io/en/stable/getting_started.html).



**Run**
To run the client, in a Shell terminal, do:

```bash
conda activate french_box_office
streamlit run app/client.py
```

**Questions**
* Take 5 minutes to read the [quickstart](https://docs.streamlit.io/en/stable/getting_started.html).
* Run the streamlit app
* Copy-paste the request function at the right place
* Refresh the app