In [29]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import sys
sys.path.append('./src')

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from util import PCAPlotter 
from data import Dataset, MovieLensDataSource

import findspark
import pyspark
import random

import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

from datetime import date, timedelta, datetime
import time

In [31]:
findspark.init()

In [32]:
session = SparkSession.builder \
    .master("local") \
    .config("spark.driver.extraClassPath", '/opt/apache-spark/jars/rapids-4-spark_2.12-0.2.0.jar;/opt/apache-spark/jars/cudf-0.15-cuda11.jar') \
    .config('spark.rapids.sql.incompatibleOps.enabled', 'true') \
    .config('spark.rapids.sql.enabled','true') \
    .config('spark.rapids.sql.explain', 'ALL') \
    .config('spark.executor.resource.gpu.amount', '1') \
    .config('spark.rapids.sql.batchSizeBytes', '4G') \
    .config('spark.rapids.sql.reader.batchSizeBytes', '4G') \
    .config("spark.driver.maxResultSize", "16G") \
    .config("spark.executor.memory", "16G") \
    .config('spark.driver.memory', '16G') \
    .getOrCreate()

session

http://localhost:4040

In [33]:
dataset_size = 'ml-latest' # 250MB
# dataset_size = 'ml-25m' # 25MB
# dataset_size = 'ml-latest-small' # 1MB

### Download dataset

Create a data source to download and read dataset files:

In [34]:
print('Dataset sizes:', MovieLensDataSource.sizes())

Dataset sizes: ['ml-latest-small', 'ml-25m', 'ml-latest']


Check: [Movie lens datasets](https://grouplens.org/datasets/movielens/)

In [35]:
ds = MovieLensDataSource(size = dataset_size)

In [36]:
ds.file_paths()

['/home/adrian/.keras/datasets/ml-latest/ratings.csv',
 '/home/adrian/.keras/datasets/ml-latest/genome-scores.csv',
 '/home/adrian/.keras/datasets/ml-latest/links.csv',
 '/home/adrian/.keras/datasets/ml-latest/movies.csv',
 '/home/adrian/.keras/datasets/ml-latest/genome-tags.csv',
 '/home/adrian/.keras/datasets/ml-latest/tags.csv']

In [37]:
ratings = session.read.csv(ds.file_paths()[0], inferSchema=True, header=True, mode="DROPMALFORMED").cache()
movies = session.read.csv(ds.file_paths()[3], inferSchema=True, header=True, mode="DROPMALFORMED").cache()

ratings.show(5, False)
movies.show(5, False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |307    |3.5   |1256677221|
|1     |481    |3.5   |1256677456|
|1     |1091   |1.5   |1256677471|
|1     |1257   |4.5   |1256677460|
|1     |1449   |4.5   |1256677264|
+------+-------+------+----------+
only showing top 5 rows

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II 

In [38]:
ratings = ratings \
    .join(movies, on='movieId', how="left") \
    .select(
        f.col('rating'),    
        f.col('userId').alias('user_id'), 
        f.col('movieId').alias('movie_id'), 
        f.col('genres')
    )

ratings.show(5, False)

+------+-------+--------+--------------+
|rating|user_id|movie_id|genres        |
+------+-------+--------+--------------+
|3.5   |1      |307     |Drama         |
|3.5   |1      |481     |Drama|Thriller|
|1.5   |1      |1091    |Comedy        |
|4.5   |1      |1257    |Comedy|Romance|
|4.5   |1      |1449    |Comedy        |
+------+-------+--------+--------------+
only showing top 5 rows



In [39]:
genres_count = ratings.groupBy("genres").count()
genres_count.show(5, truncate=False)

+------------------------------+-----+
|genres                        |count|
+------------------------------+-----+
|Comedy|Horror|Thriller        |33187|
|Adventure|Sci-Fi|Thriller     |8778 |
|Action|Adventure|Drama|Fantasy|61724|
|Action|Drama|Horror           |4022 |
|Action|Animation|Comedy|Sci-Fi|1133 |
+------------------------------+-----+
only showing top 5 rows



In [40]:
def normalize(value): return value.lower().replace('-','_')

In [41]:
genres = genres_count \
        .select('genres') \
        .rdd \
        .flatMap(lambda x: x[0].split('|')) \
        .distinct() \
        .collect()

genres_columns = ['gen_none' if '(' in genre else f'gen_{normalize(genre)}' for genre in genres ]
genres_columns

['gen_fantasy',
 'gen_adventure',
 'gen_none',
 'gen_comedy',
 'gen_film_noir',
 'gen_musical',
 'gen_thriller',
 'gen_romance',
 'gen_western',
 'gen_mystery',
 'gen_sci_fi',
 'gen_action',
 'gen_documentary',
 'gen_imax',
 'gen_drama',
 'gen_animation',
 'gen_horror',
 'gen_crime',
 'gen_war',
 'gen_children']

In [42]:
for col_name, genre in zip(genres_columns, genres):
    ratings = ratings.withColumn(col_name, ratings.genres.contains(genre).cast('integer'))

ratings = ratings.drop('genres')
    
ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|3.5   |1      |307     |0          |0            |0       |0         |0            |0          |0           |0          |0          |0          |0         |0         |0              |0       |1        |0            |0  

In [43]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

def seq(): return f.udf(Sequencer(), t.IntegerType())

In [44]:
ratings = ratings \
    .withColumn('user_seq', seq()(f.col('user_id'))) \
    .withColumn('movie_seq', seq()(f.col('movie_id')))

ratings.show(5)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|user_seq|movie_seq|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|   3.5|      1|     307|          0|            0|       0|         0|            0|          0|           0|          0|          0|          0|         0|      

In [45]:
ratings.write.format('csv').option('header',True).mode('overwrite').option('sep',',').save('dataset')

In [46]:
session.stop()