In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('./src')

from util import PCAPlotter, SparkSessionFactory, read_csv, write_csv
from data import MovieLensDataSource

import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

In [21]:
session = SparkSessionFactory.create()
session

In [22]:
dataset_size = 'ml-latest' # 250MB
# dataset_size = 'ml-25m' # 25MB
# dataset_size = 'ml-latest-small' # 1MB

### Download dataset

Create a data source to download and read dataset files:

In [23]:
print('Dataset sizes:', MovieLensDataSource.sizes())

Dataset sizes: ['ml-latest-small', 'ml-25m', 'ml-latest']


Check: [Movie lens datasets](https://grouplens.org/datasets/movielens/)

In [24]:
ds = MovieLensDataSource(size = dataset_size)

In [25]:
ds.file_paths()

['/home/adrian/.keras/datasets/ml-latest/ratings.csv',
 '/home/adrian/.keras/datasets/ml-latest/genome-scores.csv',
 '/home/adrian/.keras/datasets/ml-latest/links.csv',
 '/home/adrian/.keras/datasets/ml-latest/movies.csv',
 '/home/adrian/.keras/datasets/ml-latest/genome-tags.csv',
 '/home/adrian/.keras/datasets/ml-latest/tags.csv']

In [None]:
ratings = read_csv(session, ds.file_paths()[0])
movies = read_csv(session, ds.file_paths()[3])

In [None]:
ratings.show(5, truncate=False)
ratings.printSchema()
movies.show(5, truncate=False)
movies.printSchema()

In [9]:
ratings = ratings \
    .join(movies, on='movieId', how="left") \
    .select(
        f.col('rating'),    
        f.col('userId').alias('user_id'), 
        f.col('movieId').alias('movie_id'), 
        f.col('genres')
    )

ratings.show(5, truncate=False)

+------+-------+--------+--------------+
|rating|user_id|movie_id|genres        |
+------+-------+--------+--------------+
|3.5   |1      |307     |Drama         |
|3.5   |1      |481     |Drama|Thriller|
|1.5   |1      |1091    |Comedy        |
|4.5   |1      |1257    |Comedy|Romance|
|4.5   |1      |1449    |Comedy        |
+------+-------+--------+--------------+
only showing top 5 rows



In [10]:
genres_count = ratings \
    .groupBy("genres") \
    .count()

genres_count.show(5, truncate=False)

+------------------------------+-----+
|genres                        |count|
+------------------------------+-----+
|Comedy|Horror|Thriller        |33187|
|Adventure|Sci-Fi|Thriller     |8778 |
|Action|Adventure|Drama|Fantasy|61724|
|Action|Drama|Horror           |4022 |
|Action|Animation|Comedy|Sci-Fi|1133 |
+------------------------------+-----+
only showing top 5 rows



In [11]:
def normalize(value): return value.lower().replace('-','_')

In [12]:
genres_columns = genres_count \
        .select('genres') \
        .rdd \
        .flatMap(lambda it: it[0].split('|')) \
        .distinct() \
        .map(lambda it: (it, 'gen_none' if '(' in it else f'gen_{normalize(it)}')) \
        .collect()

genres_columns

[('Fantasy', 'gen_fantasy'),
 ('Adventure', 'gen_adventure'),
 ('(no genres listed)', 'gen_none'),
 ('Comedy', 'gen_comedy'),
 ('Film-Noir', 'gen_film_noir'),
 ('Musical', 'gen_musical'),
 ('Thriller', 'gen_thriller'),
 ('Romance', 'gen_romance'),
 ('Western', 'gen_western'),
 ('Mystery', 'gen_mystery'),
 ('Sci-Fi', 'gen_sci_fi'),
 ('Action', 'gen_action'),
 ('Documentary', 'gen_documentary'),
 ('IMAX', 'gen_imax'),
 ('Drama', 'gen_drama'),
 ('Animation', 'gen_animation'),
 ('Horror', 'gen_horror'),
 ('Crime', 'gen_crime'),
 ('War', 'gen_war'),
 ('Children', 'gen_children')]

In [13]:
for (genre, col_name) in genres_columns:
    ratings = ratings.withColumn(col_name, ratings.genres.contains(genre).cast('integer'))

ratings = ratings.drop('genres')
    
ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|3.5   |1      |307     |0          |0            |0       |0         |0            |0          |0           |0          |0          |0          |0         |0         |0              |0       |1        |0            |0  

In [15]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

seq = f.udf(Sequencer(), t.IntegerType())

In [16]:
ratings = ratings \
    .withColumn('user_seq', seq(f.col('user_id'))) \
    .withColumn('movie_seq', seq(f.col('movie_id')))

ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|user_seq|movie_seq|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|3.5   |1      |307     |0          |0            |0       |0         |0            |0          |0           |0          |0          |0          |0         |0     

In [17]:
write_csv(ratings, './dataset')

In [18]:
session.stop()