In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [4]:
import sys
sys.path.append('./src')

from util import PCAPlotter, SparkSessionFactory, read_csv, write_csv
from data import MovieLensDataSource

import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

INFO:tensorflow:Using local port 22740
INFO:tensorflow:Using local port 17941
INFO:tensorflow:Using local port 18715
INFO:tensorflow:Using local port 23690
INFO:tensorflow:Using local port 20736
INFO:tensorflow:Using local port 19643
INFO:tensorflow:Using local port 16782
INFO:tensorflow:Using local port 23819
INFO:tensorflow:Using local port 15673
INFO:tensorflow:Using local port 20632


In [5]:
session = SparkSessionFactory.create()
session

In [6]:
# dataset_size = 'ml-latest' # 250MB
# dataset_size = 'ml-25m' # 25MB
dataset_size = 'ml-latest-small' # 1MB

### Download dataset

Create a data source to download and read dataset files:

In [7]:
print('Dataset sizes:', MovieLensDataSource.sizes())

Dataset sizes: ['ml-latest-small', 'ml-25m', 'ml-latest']


Check: [Movie lens datasets](https://grouplens.org/datasets/movielens/)

In [8]:
ds = MovieLensDataSource(size = dataset_size)

In [9]:
ds.file_paths()

['/home/adrian/.keras/datasets/ml-25m/ratings.csv',
 '/home/adrian/.keras/datasets/ml-25m/genome-scores.csv',
 '/home/adrian/.keras/datasets/ml-25m/links.csv',
 '/home/adrian/.keras/datasets/ml-25m/movies.csv',
 '/home/adrian/.keras/datasets/ml-25m/genome-tags.csv',
 '/home/adrian/.keras/datasets/ml-25m/tags.csv']

In [10]:
ratings = read_csv(session, ds.file_paths()[0])
movies = read_csv(session, ds.file_paths()[3])

In [11]:
ratings.show(5, truncate=False)
ratings.printSchema()
movies.show(5, truncate=False)
movies.printSchema()

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |296    |5.0   |1147880044|
|1     |306    |3.5   |1147868817|
|1     |307    |5.0   |1147868828|
|1     |665    |5.0   |1147878820|
|1     |899    |3.5   |1147868510|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy

In [12]:
ratings = ratings \
    .join(movies, on='movieId', how="left") \
    .select(
        f.col('rating'),    
        f.col('userId').alias('user_id'), 
        f.col('movieId').alias('movie_id'), 
        f.col('genres')
    )

ratings.show(5, truncate=False)

+------+-------+--------+---------------------------+
|rating|user_id|movie_id|genres                     |
+------+-------+--------+---------------------------+
|5.0   |1      |296     |Comedy|Crime|Drama|Thriller|
|3.5   |1      |306     |Drama                      |
|5.0   |1      |307     |Drama                      |
|5.0   |1      |665     |Comedy|Drama|War           |
|3.5   |1      |899     |Comedy|Musical|Romance     |
+------+-------+--------+---------------------------+
only showing top 5 rows



In [13]:
genres_count = ratings \
    .groupBy("genres") \
    .count()

genres_count.show(5, truncate=False)

+------------------------------+-----+
|genres                        |count|
+------------------------------+-----+
|Action|Adventure|Drama|Fantasy|55169|
|Adventure|Sci-Fi|Thriller     |8817 |
|Comedy|Horror|Thriller        |28816|
|Action|Drama|Horror           |3746 |
|Comedy|Drama|Horror|Thriller  |85   |
+------------------------------+-----+
only showing top 5 rows



In [14]:
def normalize(value): return value.lower().replace('-','_')

In [15]:
genres_columns = genres_count \
        .select('genres') \
        .rdd \
        .flatMap(lambda it: it[0].split('|')) \
        .distinct() \
        .map(lambda it: (it, 'gen_none' if '(' in it else f'gen_{normalize(it)}')) \
        .collect()

genres_columns

[('Fantasy', 'gen_fantasy'),
 ('Adventure', 'gen_adventure'),
 ('(no genres listed)', 'gen_none'),
 ('Comedy', 'gen_comedy'),
 ('Film-Noir', 'gen_film_noir'),
 ('Musical', 'gen_musical'),
 ('Thriller', 'gen_thriller'),
 ('Romance', 'gen_romance'),
 ('Western', 'gen_western'),
 ('Mystery', 'gen_mystery'),
 ('Sci-Fi', 'gen_sci_fi'),
 ('Action', 'gen_action'),
 ('Documentary', 'gen_documentary'),
 ('IMAX', 'gen_imax'),
 ('Drama', 'gen_drama'),
 ('Animation', 'gen_animation'),
 ('Horror', 'gen_horror'),
 ('Crime', 'gen_crime'),
 ('War', 'gen_war'),
 ('Children', 'gen_children')]

In [16]:
for (genre, col_name) in genres_columns:
    ratings = ratings.withColumn(col_name, ratings.genres.contains(genre).cast('integer'))

ratings = ratings.drop('genres')
    
ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|5.0   |1      |296     |0          |0            |0       |1         |0            |0          |1           |0          |0          |0          |0         |0         |0              |0       |1        |0            |0  

In [17]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

seq = f.udf(Sequencer(), t.IntegerType())

In [18]:
ratings = ratings \
    .withColumn('user_seq', seq(f.col('user_id'))) \
    .withColumn('movie_seq', seq(f.col('movie_id')))

ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|user_seq|movie_seq|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|5.0   |1      |296     |0          |0            |0       |1         |0            |0          |1           |0          |0          |0          |0         |0     

In [19]:
write_csv(ratings, './dataset')

In [21]:
session.stop()