### Import of libraries

In [None]:
import pathlib
import random
import datetime
import pandas as pd

### Global variables

In [None]:
# --- Paths ---------------------------------------------------------------
# The home directory is resolved in pure Python (instead of the IPython-only
# `! echo $HOME` shell escape) so that this cell is valid Python and does not
# depend on a POSIX shell being available.
# HOMEPATH is kept as a one-element list of str so `HOMEPATH[0]` keeps working.
HOMEPATH = [str(pathlib.Path.home())]
# Directory holding the raw input files (movie_titles.csv, combined_data_*.txt).
INPUT_DIRECTORY_PATH = pathlib.Path(HOMEPATH[0], "Downloads/netflix-prize-data/")
# Destination of the sampled ratings written by the export cell.
OUTPUT_FILE_PATH = pathlib.Path(HOMEPATH[0], "projects/public-showcase/netflix-recommender-system-microservice/data/netflix_prize_data_sample.csv")

# --- Sampling parameters -------------------------------------------------
# Number of movies drawn at random before any filtering.
MOVIE_INITIAL_SAMPLE_SIZE = 200
# Inclusive bounds on the number of ratings a customer must have to be kept.
MIN_MOVIES_BY_CUSTOMER = 3
MAX_MOVIES_BY_CUSTOMER = 5
# Number of customers drawn at random among the eligible ones.
CUSTOMER_SAMPLE_SIZE = 500
# Inclusive bounds on the number of ratings a movie must have in each period
# (training, validation, prediction) to be kept.
MIN_RATINGS_BY_MOVIE_BY_STEP = 500
MAX_RATINGS_BY_MOVIE_BY_STEP = 5000

### Randomness seed

In [None]:
# Seed Python's `random` module so that the random.sample() calls made in the
# following cells draw from a fixed pseudo-random sequence.
random.seed(1)

### Sampling of movies

In [None]:
# Load the movie catalogue and draw a uniform random sample of movies.
#
# NOTE: one non-UTF8 character was manually removed from the source file
# beforehand, because it caused an error on file load.
with open(pathlib.Path(INPUT_DIRECTORY_PATH, "movie_titles.csv"), mode="r") as file:
    # Each line is split into at most 3 fields (used downstream as movie_id,
    # release_year, title). maxsplit=2 keeps a title that itself contains
    # commas in a single field instead of scattering it over several fields.
    # rstrip("\n") removes the line return without eating a real character
    # when the last line of the file has no trailing newline.
    movies_all = [line.rstrip("\n").split(sep=",", maxsplit=2) for line in file]

# random.sample draws without replacement, every movie being equally likely
# (the draw is NOT weighted by the number of ratings of each movie).
movies_sample = random.sample(movies_all, MOVIE_INITIAL_SAMPLE_SIZE)
movies_sample

### Ratings of the sampled movies

In [None]:
# Collect every rating that belongs to one of the sampled movies.
#
# The ratings files are read as alternating sections: a header line
# "<movie_id>:" followed by one comma-separated rating line per rating of that
# movie. Each kept rating is stored as the rating fields followed by the
# fields of the matching entry of `movies_sample`.

# Index the sampled movies by id once, so that each header / rating line costs
# a dictionary lookup instead of a scan of `movies_sample`.
# setdefault keeps the first entry for a given id, like a first-match search.
movies_sample_by_id = {}
for movie in movies_sample:
    movies_sample_by_id.setdefault(movie[0], movie)

ratings_sample = []
# State of the section currently being read. Initialised so that rating lines
# appearing before any header are skipped instead of raising a NameError.
movie_id = None
movie_is_sampled = False
for ratings_file in [   "combined_data_1.txt",
                        "combined_data_2.txt",
                        "combined_data_3.txt",
                        "combined_data_4.txt",
]:
    with open(pathlib.Path(INPUT_DIRECTORY_PATH, ratings_file), mode="r") as file:
        for line in file:
            line = line.rstrip("\n") # remove the line return
            if line.endswith(":"):
                # Header line: a new movie section starts here.
                movie_id = line[:-1]
                movie_is_sampled = movie_id in movies_sample_by_id
            elif movie_is_sampled and line:
                # Rating line of a sampled movie (empty lines are ignored).
                ratings_sample.append(line.split(sep=",") + movies_sample_by_id[movie_id])
ratings_sample

In [None]:
# Report how many ratings were collected for the sampled movies.
print(f"We have now {len(ratings_sample)} ratings.")

### We only consider movies whose number of ratings falls within a given range in each of the 3 periods (training, validation, prediction)

In [None]:
# Build one row per rating. Only the first 6 fields of each rating are kept so
# that every row matches the 6 column names below.
ratings_sample_df = pd.DataFrame(
    [rating[:6] for rating in ratings_sample],
    columns=["customer_id", "rating", "rating_date", "movie_id", "release_year", "title"],
)

# Parse the rating dates once and reuse them for the three period masks
# (parsing the whole column again for every comparison is wasted work).
rating_dates = pd.DatetimeIndex(ratings_sample_df["rating_date"])
is_training_rating = rating_dates <= datetime.datetime(2003, 12, 31)
is_validation_rating = (rating_dates >= datetime.datetime(2004, 1, 1)) & (rating_dates <= datetime.datetime(2004, 12, 31))
is_prediction_rating = rating_dates >= datetime.datetime(2005, 1, 1)

# Number of ratings of each movie within each period.
number_of_ratings_by_movie_training = ratings_sample_df[is_training_rating].groupby("movie_id").size()
number_of_ratings_by_movie_validation = ratings_sample_df[is_validation_rating].groupby("movie_id").size()
number_of_ratings_by_movie_prediction = ratings_sample_df[is_prediction_rating].groupby("movie_id").size()


def _movie_ids_within_rating_range(number_of_ratings_by_movie):
    """Return the set of movie ids whose rating count is within the allowed range.

    Args:
        number_of_ratings_by_movie: Series of rating counts indexed by movie id.

    Returns:
        The ids whose count lies between MIN_RATINGS_BY_MOVIE_BY_STEP and
        MAX_RATINGS_BY_MOVIE_BY_STEP, both bounds included.
    """
    is_within_range = (
        (number_of_ratings_by_movie >= MIN_RATINGS_BY_MOVIE_BY_STEP)
        & (number_of_ratings_by_movie <= MAX_RATINGS_BY_MOVIE_BY_STEP)
    )
    return set(number_of_ratings_by_movie.index[is_within_range])


# A movie is kept only if its rating count is within range in ALL 3 periods.
# A movie without any rating in a period is absent from that period's counts
# and is therefore excluded.
movie_id_final_selection = (
    _movie_ids_within_rating_range(number_of_ratings_by_movie_training)
    .intersection(_movie_ids_within_rating_range(number_of_ratings_by_movie_validation))
    .intersection(_movie_ids_within_rating_range(number_of_ratings_by_movie_prediction))
)
print(movie_id_final_selection)
ratings_sample_df = ratings_sample_df[ratings_sample_df["movie_id"].isin(movie_id_final_selection)]
ratings_sample_df

### We only consider users that have rated enough (but not too many) movies

In [None]:
# Count, for every customer, the ratings left after the movie selection.
ratings_count_per_customer_df = ratings_sample_df.groupby("customer_id")["rating"].count()

# Keep the customers whose count lies between MIN_MOVIES_BY_CUSTOMER and
# MAX_MOVIES_BY_CUSTOMER, both bounds included.
has_at_least_min_ratings = ratings_count_per_customer_df >= MIN_MOVIES_BY_CUSTOMER
has_at_most_max_ratings = ratings_count_per_customer_df <= MAX_MOVIES_BY_CUSTOMER
eligible_counts = ratings_count_per_customer_df[has_at_least_min_ratings & has_at_most_max_ratings]
users_with_enough_movie_ratings = list(eligible_counts.index)

# Restrict the ratings to those given by the eligible customers.
is_from_eligible_customer = ratings_sample_df["customer_id"].isin(users_with_enough_movie_ratings)
ratings_sample_export_df = ratings_sample_df[is_from_eligible_customer]
ratings_sample_export_df

### Sampling of customers

In [None]:
# Draw CUSTOMER_SAMPLE_SIZE distinct customers among the eligible ones and keep
# only their ratings.
#
# The candidate ids are sorted before sampling: they are strings, and the
# iteration order of a set of strings changes from one interpreter run to the
# next (hash randomisation). Sampling straight from list(set(...)) would pick
# different customers after each kernel restart, even with a fixed random seed.
#
# random.sample raises ValueError if there are fewer candidates than
# CUSTOMER_SAMPLE_SIZE.
customer_candidates = sorted(set(ratings_sample_export_df.customer_id))
customer_sample = random.sample(customer_candidates, CUSTOMER_SAMPLE_SIZE)
ratings_sample_export_df = ratings_sample_export_df[ratings_sample_export_df["customer_id"].isin(customer_sample)]
ratings_sample_export_df

### File export

In [None]:
# Write the sampled ratings to OUTPUT_FILE_PATH as a comma-separated file,
# without the DataFrame index column.
ratings_sample_export_df.to_csv(OUTPUT_FILE_PATH, sep=",", index = False)