### Import of libraries

In [1]:
import pathlib
import time
import datetime
import pickle
import logging
import copy
import yaml
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from netflix_recommender_system.logger import logger
from netflix_recommender_system.config import config
import netflix_recommender_system.server
import netflix_recommender_system.model

### Global variables

In [2]:
#HOMEPATH = ! echo $HOME
# INPUT_FILE_PATH = pathlib.Path(HOMEPATH[0], "projects/public-showcase/netflix-recommender-system-microservice/data")
# OUTPUT_DIRECTORY_PATH = pathlib.Path(HOMEPATH[0], "projects/public-showcase/netflix-recommender-system-microservice/data")
# MODEL_DIRECTORY_PATH = pathlib.Path(HOMEPATH[0], "projects/public-showcase/netflix-recommender-system-microservice/models")


# INPUT_DATA_DIRECTORY_PATH = pathlib.Path("..", "data")
# OUTPUT_DATA_DIRECTORY_PATH = pathlib.Path("..", "data")
# MODEL_DIRECTORY_PATH = pathlib.Path("..", "models")
# PREDICTION_TOTO_DICT = {
#     "rating_year": 2005,
#     "avg_rating_of_similar_customers": 3.0,
#     "number_of_similar_customers": 1.0,
#     "rating_x_era_avg": 3.2458677685950414,
#     "title": "Scotland",
#     "release_era": "90s and 2000s",
# }
# PREDICTION_DICT = {
#     "customer_id": [1044034],
#     # "rating": None,
#     "rating_date": ["2005-02-03"],
#     "movie_id": [12031],
#     "release_year": [2002],
#     "title": ["Scotland"], 
# }
# Single rating request to score when MODE == "prediction". A deep copy of this dict is handed
# to Preprocessing(prediction_dict=...) further down; "rating_date" must be an ISO "YYYY-MM-DD"
# string because it is parsed with the "%Y-%m-%d" format.
PREDICTION_DICT = {
    "customer_id": 1044034,
    # "rating": None,
    "rating_date": "2005-02-03",
    "movie_id": 12031,
    "release_year": 2002,
    "title": "Scotland", 
}

# Selects which of the MODE-gated cells below actually run: the training cell or the prediction cell.
MODE = "prediction" # "prediction" or "training"
# MODEL_TIMESTAMP_ID = "abc"

In [None]:
# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)

### Randomness seed

### File import

In [5]:
class Preprocessing:
    """Turn the raw ratings CSV into the feature table used for training or prediction.

    Two modes are supported:

    * ``"training"``: ratings dated 2003 or earlier are labelled ``"training"``, ratings from
      2004 ``"validation"``, and ratings from 2005 are dropped. The feature table is written to
      ``output_datafilepath`` and returned.
    * ``"prediction"``: the same historical rows are loaded, one extra row built from
      ``prediction_dict`` is appended with ``set == "prediction"``, and only that row (with its
      features) is returned. Nothing is written to disk.

    Columns added by :meth:`compute_features`:

    * ``release_era``: bucket of ``release_year``.
    * ``avg_rating_of_similar_customers`` / ``number_of_similar_customers``: mean rating and
      head-count, for the same movie, of the other customers whose mean absolute rating gap with
      the customer on all *other* movies is at most ``SIMILARITY_MAX_AVG_RATING_GAP``.
    * ``rating_x_era_avg``: the customer's mean training-set rating on other movies of the same
      release era (computed from the customer-pair table, i.e. from movies the customer shares
      with at least one other customer).
    """

    # Maximum mean absolute rating gap (measured on other movies) for two customers to be "similar".
    SIMILARITY_MAX_AVG_RATING_GAP = .5

    def __init__(self, mode: str, input_datafilepath: str, output_datafilepath = None, prediction_dict = None):
        """Store the configuration; no file is read until :meth:`compute_features` is called.

        Args:
            mode: ``"training"`` or ``"prediction"``.
            input_datafilepath: CSV with at least the columns ``customer_id``, ``rating``,
                ``rating_date``, ``movie_id``, ``release_year`` and ``title``.
            output_datafilepath: Where the feature table is written in training mode.
            prediction_dict: Request to score in prediction mode, with the keys ``customer_id``,
                ``rating_date`` (``"YYYY-MM-DD"`` string or date-like object), ``movie_id``,
                ``release_year`` and ``title``. It is never modified.

        Raises:
            ValueError: If ``mode`` is not one of the two supported values.
        """
        if mode not in ["training", "prediction"]:
            raise ValueError("Mode must be one of 'training' or 'prediction'")
        self.mode = mode
        self.input_datafilepath = input_datafilepath
        self.output_datafilepath = output_datafilepath
        self.prediction_dict = prediction_dict

    @staticmethod
    def _get_set(year):
        """Map a rating year to its dataset split; years after 2005 map to ``None``."""
        if year <= 2003:
            return "training"
        if year == 2004:
            return "validation"
        if year == 2005:
            return "prediction"
        return None

    @staticmethod
    def _get_era(year):
        """Map a release year to its era label."""
        if year < 1970:
            return "<1970"
        if year < 1990:
            return "70s and 80s"
        return "90s and 2000s"

    def __load_data_and_clean(self) -> pd.DataFrame:
        """Load the input CSV, derive ``rating_year`` and ``set``, and drop the 2005 rows."""
        df = pd.read_csv(self.input_datafilepath)
        df["rating_date"] = pd.to_datetime(df["rating_date"], format = "mixed")
        df["rating_year"] = df["rating_date"].dt.year
        df["set"] = df["rating_year"].apply(self._get_set)

        # Copy so that columns added later do not write into a slice of the loaded frame.
        return df[df["set"] != "prediction"].copy()

    def __load_data_and_clean_and_insert_prediction_row(self) -> pd.DataFrame:
        """Load the cleaned historical data and append the row described by ``prediction_dict``.

        Raises:
            ValueError: If no ``prediction_dict`` was supplied.
        """
        if self.prediction_dict is None:
            raise ValueError("A prediction_dict is required when mode is 'prediction'")

        df = self.__load_data_and_clean()

        # Parse into a local variable: overwriting the dict entry would make a second call fail
        # (strptime would receive a datetime) and would leak a side effect to the caller.
        rating_date = self.prediction_dict["rating_date"]
        if isinstance(rating_date, str):
            rating_date = datetime.datetime.strptime(rating_date, "%Y-%m-%d")

        prediction_row_df = pd.DataFrame(
            {
                "customer_id": [self.prediction_dict["customer_id"]],
                "rating_date": [rating_date],
                "movie_id": [self.prediction_dict["movie_id"]],
                "release_year": [self.prediction_dict["release_year"]],
                "title": [self.prediction_dict["title"]],
                # A float NaN keeps the "rating" column numeric after the concat without relying
                # on pandas ignoring all-NA object columns when it picks the result dtype.
                "rating": [float("nan")],
                # Same derivation as for the historical rows.
                "rating_year": [rating_date.year],
                "set": ["prediction"],
            }
        )

        return pd.concat([df, prediction_row_df], ignore_index = True)

    def __write_data(self, df: pd.DataFrame):
        """Write the feature table to ``output_datafilepath`` (no-op when no path was given)."""
        if self.output_datafilepath is None:
            return
        df.to_csv(self.output_datafilepath, sep=",", index = False)

    def _build_customer_pairs(self, df: pd.DataFrame, movie_ids) -> pd.DataFrame:
        """Return every ordered pair of distinct customers who rated the same movie.

        Columns of the first customer carry the ``_x`` suffix and those of the second the ``_y``
        suffix. The row to predict may only appear on the ``_x`` side. ``rating_gap`` is the
        absolute difference between both ratings.
        """
        pair_frames = []
        pair_row_count = 0
        for movie_id in movie_ids:
            movie_df = df[df["movie_id"] == movie_id]
            movie_pairs_df = movie_df.merge(movie_df, on=["movie_id"], how="inner")
            movie_pairs_df = movie_pairs_df[
                (movie_pairs_df["customer_id_x"] != movie_pairs_df["customer_id_y"])
                & (movie_pairs_df["set_y"] != "prediction")
            ]
            pair_frames.append(movie_pairs_df)
            pair_row_count += len(movie_pairs_df)
            print(f"Movie with id {movie_id} was added. Dataset has currently {pair_row_count} rows.")

        # One concat at the end instead of re-copying the growing frame on every iteration.
        pairs_df = pd.concat(pair_frames)
        pairs_df["rating_gap"] = (pairs_df["rating_x"] - pairs_df["rating_y"]).abs()
        return pairs_df

    def _compute_per_movie_features(self, pairs_df: pd.DataFrame, movie_ids):
        """Compute the similar-customer and era-average features, one movie at a time.

        For each movie the statistics are taken from all *other* movies so that the rating being
        explained never contributes to its own features.

        Returns:
            A 3-tuple ``(avg_rating_of_similar_customers, number_of_similar_customers,
            rating_x_era_avg)``: the first two are Series indexed by
            ``(customer_id_x, movie_id)``, the last one is a DataFrame with the columns
            ``customer_id_x``, ``movie_id`` and ``rating_x_era_avg``.
        """
        avg_rating_parts = []
        number_parts = []
        era_avg_parts = []

        for movie_id in movie_ids:
            is_current_movie = pairs_df["movie_id"] == movie_id
            movie_pairs_df = pairs_df[is_current_movie]
            other_pairs_df = pairs_df[~is_current_movie]

            # Mean rating gap of each customer pair on the other movies (training ratings only
            # on the "_y" side).
            avg_gap_on_other_movies = (
                other_pairs_df[other_pairs_df["set_y"] == "training"]
                .groupby(["customer_id_x", "customer_id_y"])["rating_gap"]
                .mean()
            )
            # The Series is named "rating_gap" like the left column, so it lands in "rating_gap_avg".
            cross_join_df = pd.merge(
                movie_pairs_df,
                avg_gap_on_other_movies,
                left_on=["customer_id_x", "customer_id_y"],
                right_index=True,
                how = "inner",
                suffixes=("", "_avg"),
            )
            similar_df = cross_join_df[cross_join_df["rating_gap_avg"] <= self.SIMILARITY_MAX_AVG_RATING_GAP]
            similar_ratings = similar_df.groupby(["customer_id_x", "movie_id"])["rating_y"]
            avg_rating_parts.append(similar_ratings.mean().rename("avg_rating_of_similar_customers"))
            number_parts.append(similar_ratings.size().rename("number_of_similar_customers"))

            # Customer's own mean training rating per release era, over the other movies.
            own_era_avg = (
                other_pairs_df[other_pairs_df["set_x"] == "training"]
                .drop_duplicates(["customer_id_x", "movie_id"])
                .groupby(["customer_id_x", "release_era_x"])["rating_x"]
                .mean()
            )
            # Same naming trick: the right-hand "rating_x" becomes "rating_x_era_avg".
            era_avg_parts.append(
                pd.merge(
                    movie_pairs_df.drop_duplicates(["customer_id_x", "movie_id"]),
                    own_era_avg,
                    left_on=["customer_id_x", "release_era_x"],
                    right_index=True,
                    how = "inner",
                    suffixes=("", "_era_avg"),
                )[["customer_id_x", "movie_id", "rating_x_era_avg"]]
            )

        return pd.concat(avg_rating_parts), pd.concat(number_parts), pd.concat(era_avg_parts)

    def compute_features(self):
        """Compute the feature table.

        Returns:
            In training mode, the full table (also written to ``output_datafilepath``); in
            prediction mode, only the row(s) with ``set == "prediction"``.

        Raises:
            ValueError: If there is no rating to work on, or if a ``prediction_dict`` is missing
                in prediction mode.
        """
        if self.mode == "training":
            df = self.__load_data_and_clean()
        else:
            df = self.__load_data_and_clean_and_insert_prediction_row()

        if df.empty:
            raise ValueError("No ratings available to compute features from")

        df["release_era"] = df["release_year"].apply(self._get_era)
        movie_ids = set(df["movie_id"])

        pairs_df = self._build_customer_pairs(df, movie_ids)
        avg_rating_s, number_s, era_avg_df = self._compute_per_movie_features(pairs_df, movie_ids)

        ratings_features_export_df = (
            df
            .merge(avg_rating_s, left_on=["customer_id", "movie_id"], right_index=True, how="left")
            .merge(number_s, left_on=["customer_id", "movie_id"], right_index=True, how="left")
            .merge(era_avg_df, left_on=["customer_id", "movie_id"], right_on=["customer_id_x", "movie_id"], how="left")
            .drop(columns = ["customer_id_x"])
        )

        # Rows without any similar customer / era history fall back to the global training mean
        # (quick and dirty: a per-group mean would be better) and to a count of zero.
        training_rating_mean = ratings_features_export_df.loc[
            ratings_features_export_df["set"] == "training", "rating"
        ].mean(skipna=True)
        imputed_values = {
            "avg_rating_of_similar_customers": training_rating_mean,
            "number_of_similar_customers": 0,  # technically not an imputed value
            "rating_x_era_avg": training_rating_mean,
        }
        for column_name, imputed_value in imputed_values.items():
            is_missing = ratings_features_export_df[column_name].isna()
            ratings_features_export_df.loc[is_missing, column_name] = imputed_value

        if self.mode == "training":
            self.__write_data(df = ratings_features_export_df)
        else:
            ratings_features_export_df = ratings_features_export_df[ratings_features_export_df["set"]== "prediction"]

        return ratings_features_export_df

#pathlib.Path(config["INPUT_DATA_DIRECTORY_PATH"], "data_sample.csv")

In [None]:
# NOTE(review): on a fresh kernel `preprocessing_pipeline` does not exist yet at this point; in the
# visible part of the notebook it is first created by the MODE-gated training/prediction cells
# further down, so this cell only works after one of those cells has been executed.
preprocessing_pipeline.compute_features()

In [None]:
# Debug display of the request date held by the pipeline object.
# NOTE(review): depends on `preprocessing_pipeline`, which is only assigned in later cells of the
# visible notebook — this fails with NameError under Restart & Run All.
preprocessing_pipeline.prediction_dict["rating_date"]

In [None]:
ratings_import_df = pd.read_csv(pathlib.Path(config["INPUT_DATA_DIRECTORY_PATH"], "data_sample.csv"))

In [None]:
ratings_import_df["rating_date"] = pd.to_datetime(ratings_import_df["rating_date"], format = "mixed")

In [None]:
ratings_import_df["rating_year"] = pd.DatetimeIndex(ratings_import_df["rating_date"]).year

def get_set(year):
    """Return the dataset split a rating year belongs to.

    Years up to and including 2003 are "training", 2004 is "validation" and 2005 is
    "prediction"; any other year yields None.
    """
    if year <= 2003:
        return "training"
    return {2004: "validation", 2005: "prediction"}.get(year)

# Label every rating with its split, then discard the 2005 ("prediction") ratings.
ratings_import_df["set"] = ratings_import_df["rating_year"].apply(get_set)
is_prediction_rating = ratings_import_df["set"] == "prediction"
ratings_import_df = ratings_import_df[~is_prediction_rating]

### Add prediction row if prediction mode

In [None]:
if (MODE == "prediction"):
    # Parse the request date into a local variable instead of overwriting
    # PREDICTION_DICT["rating_date"]: mutating the global made this cell fail when re-run and
    # broke the later Preprocessing(prediction_dict=...) call, which parses the same string again.
    prediction_rating_date = PREDICTION_DICT["rating_date"]
    if isinstance(prediction_rating_date, str):
        prediction_rating_date = datetime.datetime.strptime(prediction_rating_date, "%Y-%m-%d")

    prediction_row_df = pd.DataFrame(
        {
            "customer_id": [PREDICTION_DICT["customer_id"]],
            "rating_date": [prediction_rating_date],
            "movie_id": [PREDICTION_DICT["movie_id"]],
            "release_year": [PREDICTION_DICT["release_year"]],
            "title": [PREDICTION_DICT["title"]],
            # Float NaN keeps "rating" numeric after the concat (an all-None object column relies
            # on a deprecated pandas dtype rule).
            "rating": [float("nan")],
            # Derived the same way as for the historical rows.
            "rating_year": [prediction_rating_date.year],
            "set": ["prediction"],
        }
    )

    # Drop any prediction row left over from a previous execution so the cell can be re-run.
    ratings_import_df = pd.concat(
        [
            ratings_import_df[ratings_import_df["set"] != "prediction"],
            prediction_row_df,
        ],
        ignore_index = True
    )

In [None]:
ratings_import_df

### Feature engineering

In [None]:
def get_era(year):
    """Return the release-era label of a movie given its release year.

    The first upper bound the year falls strictly below decides the label; anything that is
    below none of them is "90s and 2000s".
    """
    for upper_bound, era_label in ((1970, "<1970"), (1990, "70s and 80s")):
        if year < upper_bound:
            return era_label
    return "90s and 2000s"

# Bucket each movie's release year into an era.
ratings_import_df["release_era"] = ratings_import_df["release_year"].apply(get_era)

# Distinct movie ids; the per-movie loops below iterate over this set.
movie_id_list = set(ratings_import_df["movie_id"].tolist())

In [None]:
ratings_import_df

In [None]:
# For every movie, pair each customer who rated it with every *other* customer who rated it
# (columns of the first customer get the "_x" suffix, those of the second "_y"). The row to
# predict is only allowed on the "_x" side.
# Frames are collected in a list and concatenated once: concatenating inside the loop re-copies
# the growing frame at every iteration.
pair_frames = []
pair_row_count = 0
for movie_id in movie_id_list:
    ratings_import_movie_df = ratings_import_df[ratings_import_df["movie_id"] == movie_id]
    ratings_import_movie_df = ratings_import_movie_df.merge(ratings_import_movie_df, on=["movie_id"], how="inner")
    ratings_import_movie_df = ratings_import_movie_df[
        (ratings_import_movie_df["customer_id_x"] != ratings_import_movie_df["customer_id_y"])
        & (ratings_import_movie_df["set_y"] != "prediction")
    ]

    pair_frames.append(ratings_import_movie_df)
    pair_row_count += len(ratings_import_movie_df)
    print(f"Movie with id {movie_id} was added. Dataset has currently {pair_row_count} rows.")

if not pair_frames:
    raise ValueError("movie_id_list is empty: there is no rating to build customer pairs from")

ratings_feature_movie_df = pd.concat(pair_frames)

In [None]:
ratings_feature_movie_df["rating_gap"] = abs(ratings_feature_movie_df["rating_x"] - ratings_feature_movie_df["rating_y"])
ratings_feature_movie_df


In [None]:
movie_id_list

In [None]:
movie_id_list = set(ratings_import_df["movie_id"])

# Per-movie results are collected in lists and concatenated once after the loop.
avg_rating_of_similar_customers_frames = []
number_of_similar_customers_frames = []
rating_x_era_avg_frames = []

for movie_id in movie_id_list:
    # Statistics for a movie are always taken from the *other* movies, so that the rating being
    # explained never feeds its own features.
    is_current_movie = ratings_feature_movie_df["movie_id"] == movie_id
    current_movie_pairs_df = ratings_feature_movie_df[is_current_movie]
    other_movies_pairs_df = ratings_feature_movie_df[~is_current_movie]

    # Mean rating gap of each customer pair on the other movies (training ratings on the "_y" side).
    avg_rating_gap_on_other_movies = (
        other_movies_pairs_df[other_movies_pairs_df["set_y"] == "training"]
        .groupby(["customer_id_x", "customer_id_y"])["rating_gap"]
        .mean()
    )
    # The Series is named "rating_gap" like the left column, hence the "rating_gap_avg" result column.
    cross_join_df = pd.merge(
        current_movie_pairs_df,
        avg_rating_gap_on_other_movies,
        left_on=["customer_id_x", "customer_id_y"],
        right_index=True,
        how = "inner",
        suffixes=("", "_avg"),
    )

    # "Similar" customers: mean gap of at most half a star on the other movies.
    similar_customers_ratings = (
        cross_join_df[cross_join_df["rating_gap_avg"] <= .5]
        .groupby(["customer_id_x", "movie_id"])["rating_y"]
    )
    avg_rating_of_similar_customers_movie_df = similar_customers_ratings.mean().rename("avg_rating_of_similar_customers")
    number_of_similar_customers_movie_df = similar_customers_ratings.size().rename("number_of_similar_customers")

    # Customer's own mean training rating per release era, over the other movies; the right-hand
    # "rating_x" is renamed "rating_x_era_avg" by the suffix.
    rating_x_avg_by_era_on_other_movies = (
        other_movies_pairs_df[other_movies_pairs_df["set_x"] == "training"]
        .drop_duplicates(["customer_id_x", "movie_id"])
        .groupby(["customer_id_x", "release_era_x"])["rating_x"]
        .mean()
    )
    rating_x_era_avg_movie_df = pd.merge(
        current_movie_pairs_df.drop_duplicates(["customer_id_x", "movie_id"]),
        rating_x_avg_by_era_on_other_movies,
        left_on=["customer_id_x", "release_era_x"],
        right_index=True,
        how = "inner",
        suffixes=("", "_era_avg"),
    )[["customer_id_x", "movie_id", "rating_x_era_avg"]]

    avg_rating_of_similar_customers_frames.append(avg_rating_of_similar_customers_movie_df)
    number_of_similar_customers_frames.append(number_of_similar_customers_movie_df)
    rating_x_era_avg_frames.append(rating_x_era_avg_movie_df)

avg_rating_of_similar_customers_df = pd.concat(avg_rating_of_similar_customers_frames)
number_of_similar_customers_df = pd.concat(number_of_similar_customers_frames)
rating_x_era_avg_df = pd.concat(rating_x_era_avg_frames)

In [None]:
rating_x_era_avg_movie_df

In [None]:
# Attach the three per-(customer, movie) features to the ratings table. Left joins keep every
# rating; ratings without a computed feature get NaN, which is imputed in the next cell.
ratings_features_export_df = (
    ratings_import_df
    .merge(
        avg_rating_of_similar_customers_df,
        left_on=["customer_id", "movie_id"],
        right_index=True,
        how="left",
    )
    .merge(
        number_of_similar_customers_df,
        left_on=["customer_id", "movie_id"],
        right_index=True,
        how="left",
    )
    .merge(
        rating_x_era_avg_df,
        left_on=["customer_id", "movie_id"],
        right_on=["customer_id_x", "movie_id"],
        how="left",
    )
    .drop(columns = ["customer_id_x"])
)

In [None]:
# Fallback for both rating-based features: the mean rating over the training set
# (quick and dirty, a per-group mean would be better).
training_rating_mean = ratings_features_export_df.loc[ratings_features_export_df["set"] == "training", "rating"].mean(skipna=True)

avg_rating_of_similar_customers_imputed_value = training_rating_mean
number_of_similar_customers_imputed_value = 0 # technically not an imputed value
rating_x_era_avg_imputed_value = training_rating_mean

# Fill the gaps left by the left joins, column by column.
for feature_name, imputed_value in (
    ("avg_rating_of_similar_customers", avg_rating_of_similar_customers_imputed_value),
    ("number_of_similar_customers", number_of_similar_customers_imputed_value),
    ("rating_x_era_avg", rating_x_era_avg_imputed_value),
):
    ratings_features_export_df.loc[ratings_features_export_df[feature_name].isna(), feature_name] = imputed_value


In [None]:
ratings_features_export_df.to_csv(pathlib.Path(config["OUTPUT_DATA_DIRECTORY_PATH"], "netflix_prize_data_features.csv"), sep=",", index = False)

In [None]:
ratings_features_export_df

In [None]:
ratings_features_export_df[ratings_features_export_df["set"]== "prediction"]

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.ensemble import RandomForestRegressor

# import pickle
# import numpy as np

# #features = ["title", "release_era", "rating_year", "avg_rating_of_similar_customers", "number_of_similar_customers", "rating_x_era_avg"]
# categorical_features = ["title", "release_era"]
# numeric_features = ["rating_year", "avg_rating_of_similar_customers", "number_of_similar_customers", "rating_x_era_avg"]
# numeric_features_np = ratings_features_export_df[numeric_features].to_numpy()
# categorical_features_np = ratings_features_export_df[categorical_features].to_numpy()
# encoder = OneHotEncoder(handle_unknown='ignore')
# encoder.fit(toto)
# categorical_features_onehotencoded_np = encoder.transform(toto).toarray()

# y_np = ratings_features_export_df["rating"].to_numpy()
# #encoder.categories_

In [6]:
class SupervisedLearning:
    """Supervised-learning pipeline; only RandomForestRegressor is supported as of now.

    Typical call order: ``create_training_arrays()`` -> ``create_validation_arrays()`` ->
    ``train()`` -> ``validate()`` -> ``predict(...)``. The one-hot encoder is fitted on the
    training data only and reused for validation and prediction; categories unseen during
    training are encoded as all zeros (``handle_unknown='ignore'``).

    Design matrices are laid out as ``[numeric features | one-hot encoded categorical features]``.
    """
    def __init__(self,
                 training_df: pd.DataFrame,
                 validation_df: pd.DataFrame,
                 numeric_features_names: list[str],
                 categorical_features_names: list[str],
                 y_name: str,
                 seed: int = 0) -> None:
        """Store the datasets and the column roles.

        Args:
            training_df: Rows used to fit the encoder and the model.
            validation_df: Rows used to evaluate the fitted model.
            numeric_features_names: Columns used as-is.
            categorical_features_names: Columns that are one-hot encoded.
            y_name: Target column.
            seed: Random state of the forest.
        """
        self.training_df = training_df
        self.validation_df = validation_df
        self.numeric_features_names = numeric_features_names
        self.categorical_features_names = categorical_features_names
        self.y_name = y_name
        self.seed = seed

    @staticmethod
    def _root_mean_squared_error(y_true, y_pred) -> float:
        """Return the RMSE as a plain float.

        ``mean_squared_error`` returns the *squared* error, so the root is taken explicitly; the
        conversion to ``float`` keeps the value serialisable as a plain YAML number.
        """
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    def create_training_arrays(self) -> None:
        """Fit the one-hot encoder on the training set and build ``X_training`` / ``y_training``."""
        self.numeric_features_training = self.training_df[self.numeric_features_names].to_numpy()
        self.categorical_features_training = self.training_df[self.categorical_features_names].to_numpy()
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.encoder.fit(self.categorical_features_training)
        self.encoded_categorical_features_training = self.encoder.transform(self.categorical_features_training).toarray()
        self.X_training = np.concatenate((self.numeric_features_training, self.encoded_categorical_features_training), axis = 1)
        self.y_training = self.training_df[self.y_name].to_numpy()

    def create_validation_arrays(self) -> None:
        """Build ``X_validation`` / ``y_validation`` with the encoder fitted on the training set.

        Must be called after :meth:`create_training_arrays`.
        """
        self.numeric_features_validation = self.validation_df[self.numeric_features_names].to_numpy()
        self.categorical_features_validation = self.validation_df[self.categorical_features_names].to_numpy()
        self.encoded_categorical_features_validation = self.encoder.transform(self.categorical_features_validation).toarray()
        self.X_validation = np.concatenate((self.numeric_features_validation, self.encoded_categorical_features_validation), axis = 1)
        self.y_validation = self.validation_df[self.y_name].to_numpy()

    def train(self) -> None:
        """Fit the random forest and store the training RMSE in ``rmse_training``."""
        self.model = RandomForestRegressor(max_depth=1, max_features = 2, random_state=self.seed)
        self.model.fit(self.X_training, self.y_training)
        self.rmse_training = self._root_mean_squared_error(
            self.y_training,
            self.model.predict(self.X_training),
        )

    def validate(self) -> None:
        """Store the RMSE of the fitted model on the validation set in ``rmse_validation``."""
        self.rmse_validation = self._root_mean_squared_error(
            self.y_validation,
            self.model.predict(self.X_validation),
        )

    def predict(self, prediction_df: pd.DataFrame) -> np.ndarray:
        """Predict the target for each row of ``prediction_df``.

        Args:
            prediction_df: Frame containing the numeric and categorical feature columns given at
                construction time.

        Returns:
            One prediction per input row.
        """
        self.numeric_features_prediction = prediction_df[self.numeric_features_names].to_numpy()
        self.categorical_features_prediction = prediction_df[self.categorical_features_names].to_numpy()
        self.encoded_categorical_features_prediction = self.encoder.transform(self.categorical_features_prediction).toarray()
        self.X_prediction = np.concatenate((self.numeric_features_prediction, self.encoded_categorical_features_prediction), axis = 1)
        return self.model.predict(self.X_prediction)

In [None]:
# all_features_np = np.concatenate((categorical_features_onehotencoded_np, numeric_features_np), axis = 1)
# with open("./toto.pickle", "wb") as f:
#     pickle.dump(SupervisedLearning, f)

In [None]:
if MODE == "training":

    # Second-resolution Unix timestamp, used as identifier (and folder name) of this training run.
    MODEL_TIMESTAMP_ID_NEW = str(int(time.time()))

    # Every artefact of the run (features, model, summary) lives in this folder.
    model_directory_path = pathlib.Path(config["MODEL_DIRECTORY_PATH"], MODEL_TIMESTAMP_ID_NEW)
    model_directory_path.mkdir(parents=True)

    # Feature engineering; the feature table is also written next to the model.
    preprocessing_pipeline = Preprocessing(
        mode = "training",
        input_datafilepath = pathlib.Path(config["DATA_DIRECTORY_PATH"], "data_sample.csv"),
        output_datafilepath = model_directory_path / "training_dataset_features.csv",
    )
    df = preprocessing_pipeline.compute_features()

    is_training_row = df["set"] == "training"
    is_validation_row = df["set"] == "validation"

    learning_pipeline = SupervisedLearning(
        training_df = df[is_training_row],
        validation_df = df[is_validation_row],
        numeric_features_names = [
            "rating_year",
            "avg_rating_of_similar_customers",
            "number_of_similar_customers",
            "rating_x_era_avg",
        ],
        categorical_features_names = ["title", "release_era"],
        y_name = "rating",
        seed = config["SEED"],
    )

    # Arrays first (the encoder is fitted on the training set), then fit and evaluate.
    learning_pipeline.create_training_arrays()
    learning_pipeline.create_validation_arrays()
    learning_pipeline.train()
    learning_pipeline.validate()

    logger.info(f"RMSE on training set is: {learning_pipeline.rmse_training}")
    logger.info(f"RMSE on validation set is: {learning_pipeline.rmse_validation}")

    summary = dict(
        rmse_training = learning_pipeline.rmse_training,
        rmse_validation = learning_pipeline.rmse_validation,
    )

    # Persist the whole pipeline object (encoder + model), then the metrics.
    with open(model_directory_path / "model.pickle", "wb") as file:
        pickle.dump(learning_pipeline, file)

    with open(model_directory_path / "summary.yaml", "w") as file:
        yaml.dump(summary, file)



In [None]:
if MODE == "prediction":

    # NOTE: pickle.load can execute arbitrary code; only load model files produced by the
    # training cell of this project.
    with open(pathlib.Path(config["MODEL_DIRECTORY_PATH"], str(config["MODEL_TIMESTAMP_ID"]), "model.pickle"), "rb") as file:
        learning_pipeline = pickle.load(file)

    preprocessing_pipeline = Preprocessing(
        mode = "prediction",
        input_datafilepath = pathlib.Path(config["DATA_DIRECTORY_PATH"], "data_sample.csv"),
        prediction_dict=copy.deepcopy(PREDICTION_DICT), # deep copy to prevent the side-effect change of dict argument inside the function
    )

    # Features of the single row to score.
    prediction_features_df = preprocessing_pipeline.compute_features()

    # predict() must be called exactly once, on the feature frame: feeding its output (a numpy
    # array) back into predict() fails because the array cannot be indexed by column names.
    prediction = learning_pipeline.predict(prediction_df=prediction_features_df)

    logger.info(
        f"Prediction is: {prediction}"
    )

In [10]:
# Load the pickled pipeline object stored under the model id configured in MODEL_TIMESTAMP_ID.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load model files
# produced by this project's own training step.
with open(pathlib.Path(config["MODEL_DIRECTORY_PATH"], str(config["MODEL_TIMESTAMP_ID"]), "model.pickle"), "rb") as file:
    learning_pipeline = pickle.load(file)

In [11]:
learning_pipeline

<__main__.SupervisedLearning at 0x79b31677aeb0>

In [16]:
preprocessing_pipeline = Preprocessing(
    mode = "prediction",
    input_datafilepath = pathlib.Path(config["DATA_DIRECTORY_PATH"], "data_sample.csv"),
    prediction_dict=copy.deepcopy(PREDICTION_DICT), # deep copy to prevent the side-effect change of dict argument inside the function
)

In [17]:
toto = preprocessing_pipeline.compute_features()

Movie with id 14146 was added. Dataset has currently 8010 rows.
Movie with id 10403 was added. Dataset has currently 11670 rows.
Movie with id 16676 was added. Dataset has currently 16640 rows.
Movie with id 2505 was added. Dataset has currently 23446 rows.
Movie with id 586 was added. Dataset has currently 23866 rows.
Movie with id 6953 was added. Dataset has currently 24516 rows.
Movie with id 1132 was added. Dataset has currently 26586 rows.
Movie with id 16365 was added. Dataset has currently 31278 rows.
Movie with id 12942 was added. Dataset has currently 34034 rows.
Movie with id 5231 was added. Dataset has currently 35516 rows.
Movie with id 5488 was added. Dataset has currently 41996 rows.
Movie with id 17743 was added. Dataset has currently 44752 rows.
Movie with id 6162 was added. Dataset has currently 48658 rows.
Movie with id 12031 was added. Dataset has currently 54899 rows.
Movie with id 8469 was added. Dataset has currently 57551 rows.
Movie with id 10903 was added. Data

In [None]:
# learning_pipeline.prediction_df#[numeric_features_names

In [18]:
toto

Unnamed: 0,customer_id,rating,rating_date,movie_id,release_year,title,rating_year,set,release_era,avg_rating_of_similar_customers,number_of_similar_customers,rating_x_era_avg
1105,1044034,,2005-02-03,12031,2002,Scotland,2005,prediction,90s and 2000s,3.0,1.0,3.261954


In [19]:

learning_pipeline.predict(prediction_df=toto)

array([3.24585137])