### Import of libraries

In [2]:
import pathlib
import random
import pandas as pd

### Global variables

In [5]:
HOMEPATH = ! echo $HOME
INPUT_DIRECTORY_PATH = pathlib.Path(HOMEPATH[0], "Downloads/netflix-prize-data/")
OUTPUT_DIRECTORY_PATH = pathlib.Path(HOMEPATH[0], "projects/public-showcase/netflix-recommender-system-microservice/data")
MOVIE_SAMPLE_SIZE = 20
MIN_MOVIES_BY_USER = 3
MAX_MOVIES_BY_USER = 5

### Randomness seed

In [6]:
random.seed(1)

### Sampling of movies

In [7]:
# I manually removed beforehand 1 non-UTF8 character from the source file that had caused an error on the file load
with open(pathlib.Path(INPUT_DIRECTORY_PATH, "movie_titles.csv"), mode="r") as file:
    movies_all = [line.split(sep=",") for line in file]
movies_sample = random.sample(movies_all, MOVIE_SAMPLE_SIZE)
movies_sample = [movie[:-1]+[movie[-1][:-1]] for movie in movies_sample]
movies_sample

[['4403', '2000', 'Hey Ram'],
 ['2068', '1994', 'Gargoyles: Season 1'],
 ['8360', '1999', 'Parting Shots'],
 ['3864', '2005', 'Batman Begins'],
 ['16236', '2000', 'The Gift'],
 ['14730', '1949', 'Young Man with a Horn'],
 ['15476', '1998', 'Secrets of War: Vietnam'],
 ['12441', '1980', 'The Martian Chronicles'],
 ['6881', '2000', 'MTV Cribs: Rock'],
 ['3076', '1998', 'Shadow of Doubt'],
 ['15988', '1985', 'Invasion U.S.A.'],
 ['929', '2002', 'Journeys with George'],
 ['12775', '1988', 'A Pup Named Scooby-Doo'],
 ['14182', '2002', 'Air Jaws 1 and 2: Sharks of South Africa'],
 ['70', '1999', 'Tai Chi: The 24 Forms'],
 ['14596', '2002', 'The Outsider'],
 ['8729', '1991', 'Bingo'],
 ['7498', '1989', 'Rhea Gall Force'],
 ['3350', '2000', 'Reindeer Games'],
 ['10403', '1993', 'Amos & Andrew']]

### Ratings of the sampled movies

In [10]:
ratings_sample = []
for ratings_file in [   "combined_data_1.txt",
                        "combined_data_2.txt",
                        "combined_data_3.txt",
                        "combined_data_4.txt",
]:
    with open(pathlib.Path(INPUT_DIRECTORY_PATH, ratings_file), mode="r") as file:
        for line in file:
            if line[-2] == ":":
                movie_id = line[:-2]
                if movie_id in [movie[0] for movie in movies_sample]:
                    movie_is_sampled = True
                    next
                else:
                    movie_is_sampled = False
                    next
            else:
                if not movie_is_sampled:
                    next
                else:
                    complete_line = line.split(sep=",") + [x for x in movies_sample if x[0] == movie_id][0]
                    complete_line[2] = complete_line[2][:-1]
                    ratings_sample.append(complete_line)
ratings_sample

[['846887', '1', '2003-06-30', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['780936', '3', '2004-11-01', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['745867', '5', '2005-05-07', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['1472421', '3', '2005-05-14', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['2296924', '2', '2005-11-23', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['1685273', '3', '2005-03-02', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['279376', '3', '2005-05-21', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['824480', '5', '2004-08-03', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['1222878', '3', '2005-08-15', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['2535515', '1', '2004-01-15', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['2154863', '1', '2005-02-16', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['1811857', '1', '2005-03-01', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['2604455', '3', '2005-05-28', '70', '1999', 'Tai Chi: The 24 Forms'],
 ['1478381', '4', '2005-11-20', '70', '1999', 'Tai Chi: The 24 Forms'

In [11]:
print(f"We have now {len(ratings_sample)} ratings.")

We have now 127432 ratings.


### We only consider users that have rated enough movies

In [18]:
ratings_sample_df = pd.DataFrame([rating[:6] for rating in ratings_sample], columns=["customer_id", "rating", "rating_date", "movie_id", "release_year", "title"])
ratings_count_per_customer_df = ratings_sample_df.groupby("customer_id").count()["rating"]
users_with_enough_movie_ratings = list(ratings_count_per_customer_df[   (ratings_count_per_customer_df >= MIN_MOVIES_BY_USER) & 
                                                                        (ratings_count_per_customer_df <= MAX_MOVIES_BY_USER)
                                        ].index
                                  )
ratings_sample_export_df = ratings_sample_df[ratings_sample_df["customer_id"].isin(users_with_enough_movie_ratings)]
ratings_sample_export_df

Unnamed: 0,customer_id,rating,rating_date,movie_id,release_year,title
0,846887,1,2003-06-30,70,1999,Tai Chi: The 24 Forms
1,780936,3,2004-11-01,70,1999,Tai Chi: The 24 Forms
13,1478381,4,2005-11-20,70,1999,Tai Chi: The 24 Forms
19,1466320,2,2005-04-12,70,1999,Tai Chi: The 24 Forms
38,2241623,3,2004-10-15,70,1999,Tai Chi: The 24 Forms
...,...,...,...,...,...,...
127401,255278,2,2005-06-07,16236,2000,The Gift
127403,1021583,4,2005-06-13,16236,2000,The Gift
127412,738725,3,2005-09-12,16236,2000,The Gift
127422,334005,4,2005-10-24,16236,2000,The Gift


### File export

In [17]:
ratings_sample_export_df.to_csv(pathlib.Path(OUTPUT_DIRECTORY_PATH, "netflix_prize_data_sample.csv"), sep=",", index = False)