In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [12]:
import sys
sys.path.append('./src')

from spark import SparkSessionFactory, read_csv, column_values, train_test_split, PageSet, get_columns, get_rows

from data import MovieLensDataSource

import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

# Recommedations dataset building

This is the fisrt step of the process to train a model that predict an user rating to a given movie. Before all, we needs to build a table of features from a raw dataset. This table must be contain next columns:

**feature table columns**:
* **user_id**: This is a categorical column because this has discrete values one of for each user in raw dataset.
* **movie_id**: Like user_id column, this has discrete values one for each movie in raw dataset. 
* **one column for each movie genre**
  For example: accion_genre, drama_gender, etc... are possible columns. Each one takes a discrete value 1 or 0.
* **rating**: Score that a user(user_id) gives to a movie(movie_id).

Finally, we have a table with discreate value columns, they are also called categorical columns, because this columns only take one value from a finite list of possible values. An example of this table must be:

| user_id | movie_id | accion_gender | drama_gender | rating |
| --- | --- | --- | --- | --- |
| 1 |  1 | 1 | 0 | 5 |
| 1 |  2 | 0 | 1 | 1 |
| 1 |  3 | 1 | 1 | 3 |

* User 1 rated the movie 1 with a score of 5 points, because he loves accion movies.
* User 1 rated the movie 2 with a score of 1 points, because he is not interested to drama movies.
* User 1 rated the movie 3 with a score of 2 points, given that he loves accion but is not interested to drama.

## Building out features table

**Step 1**: Let's start create a new spark session.

In [19]:
session = SparkSessionFactory.create()
session

**Step 2**: Select a raw dataset size. For this recommendation examples we have 3 raw datasets each with distinct number of examples. Sometimes we need to begins with a tiny dataset to try to create a model that overfit to these data. This tell us that the model understand input patterns although this does not generalize yet but the model understand input patterns. The next step is maybe use more data and try to find the best hiperparameters so that the model generalize as well as possible.

In [22]:
print('Dataset sizes:', MovieLensDataSource.sizes())

Dataset sizes: ['ml-latest-small', 'ml-25m', 'ml-latest']


In [23]:
# dataset_size = 'ml-latest' # 250MB
# dataset_size = 'ml-25m' # 25MB
dataset_size = 'ml-latest-small' # 1MB

### Download dataset

**Step 3**: Fisrt let's check dataset files.

Check: [Movie lens datasets](https://grouplens.org/datasets/movielens/)

In [24]:
ds = MovieLensDataSource(size = dataset_size)

In [25]:
ds.file_paths()

['/home/adrian/.keras/datasets/ml-latest-small/ratings.csv',
 '/home/adrian/.keras/datasets/ml-latest-small/links.csv',
 '/home/adrian/.keras/datasets/ml-latest-small/movies.csv',
 '/home/adrian/.keras/datasets/ml-latest-small/tags.csv']

**Step 4**: Load ratings and movies files to a dataset for each one. 

In [28]:
ratings = read_csv(session, ds.file_paths()[0])
movies = read_csv(session, ds.file_paths()[3])

**Step 5**: Let's see ratins and movies schemas.

In [30]:
ratings.show(5, truncate=False)
ratings.printSchema()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|1     |1      |4.0   |964982703|
|1     |3      |4.0   |964981247|
|1     |6      |4.0   |964982224|
|1     |47     |5.0   |964983815|
|1     |50     |5.0   |964982931|
+------+-------+------+---------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [31]:
movies.show(5, truncate=False)
movies.printSchema()

+------+-------+---------------+----------+
|userId|movieId|tag            |timestamp |
+------+-------+---------------+----------+
|2     |60756  |funny          |1445714994|
|2     |60756  |Highly quotable|1445714996|
|2     |60756  |will ferrell   |1445714992|
|2     |89774  |Boxing story   |1445715207|
|2     |89774  |MMA            |1445715200|
+------+-------+---------------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)



**Step 6**: Left join bothn tables by movieId  adn select rating score, userId, movieId and genres columns.

In [12]:
ratings = ratings \
    .join(movies, on='movieId', how="left") \
    .select(
        f.col('rating'),    
        f.col('userId').alias('user_id'), 
        f.col('movieId').alias('movie_id'), 
        f.col('genres')
    )

ratings.show(5, truncate=False)

+------+-------+--------+---------------------------+
|rating|user_id|movie_id|genres                     |
+------+-------+--------+---------------------------+
|5.0   |1      |296     |Comedy|Crime|Drama|Thriller|
|3.5   |1      |306     |Drama                      |
|5.0   |1      |307     |Drama                      |
|5.0   |1      |665     |Comedy|Drama|War           |
|3.5   |1      |899     |Comedy|Musical|Romance     |
+------+-------+--------+---------------------------+
only showing top 5 rows



In [13]:
genres_count = ratings \
    .groupBy("genres") \
    .count()

genres_count.show(5, truncate=False)

+------------------------------+-----+
|genres                        |count|
+------------------------------+-----+
|Action|Adventure|Drama|Fantasy|55169|
|Adventure|Sci-Fi|Thriller     |8817 |
|Comedy|Horror|Thriller        |28816|
|Action|Drama|Horror           |3746 |
|Comedy|Drama|Horror|Thriller  |85   |
+------------------------------+-----+
only showing top 5 rows



In [14]:
def normalize(value): return value.lower().replace('-','_')

In [15]:
genres_columns = genres_count \
        .select('genres') \
        .rdd \
        .flatMap(lambda it: it[0].split('|')) \
        .distinct() \
        .map(lambda it: (it, 'gen_none' if '(' in it else f'gen_{normalize(it)}')) \
        .collect()

genres_columns

[('Fantasy', 'gen_fantasy'),
 ('Adventure', 'gen_adventure'),
 ('(no genres listed)', 'gen_none'),
 ('Comedy', 'gen_comedy'),
 ('Film-Noir', 'gen_film_noir'),
 ('Musical', 'gen_musical'),
 ('Thriller', 'gen_thriller'),
 ('Romance', 'gen_romance'),
 ('Western', 'gen_western'),
 ('Mystery', 'gen_mystery'),
 ('Sci-Fi', 'gen_sci_fi'),
 ('Action', 'gen_action'),
 ('Documentary', 'gen_documentary'),
 ('IMAX', 'gen_imax'),
 ('Drama', 'gen_drama'),
 ('Animation', 'gen_animation'),
 ('Horror', 'gen_horror'),
 ('Crime', 'gen_crime'),
 ('War', 'gen_war'),
 ('Children', 'gen_children')]

In [16]:
for (genre, col_name) in genres_columns:
    ratings = ratings.withColumn(col_name, ratings.genres.contains(genre).cast('integer'))

ratings = ratings.drop('genres')
    
ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+
|5.0   |1      |296     |0          |0            |0       |1         |0            |0          |1           |0          |0          |0          |0         |0         |0              |0       |1        |0            |0  

In [17]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

seq = f.udf(Sequencer(), t.IntegerType())

In [18]:
ratings = ratings \
    .withColumn('user_seq', seq(f.col('user_id'))) \
    .withColumn('movie_seq', seq(f.col('movie_id')))

ratings.show(5, truncate=False)

+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|rating|user_id|movie_id|gen_fantasy|gen_adventure|gen_none|gen_comedy|gen_film_noir|gen_musical|gen_thriller|gen_romance|gen_western|gen_mystery|gen_sci_fi|gen_action|gen_documentary|gen_imax|gen_drama|gen_animation|gen_horror|gen_crime|gen_war|gen_children|user_seq|movie_seq|
+------+-------+--------+-----------+-------------+--------+----------+-------------+-----------+------------+-----------+-----------+-----------+----------+----------+---------------+--------+---------+-------------+----------+---------+-------+------------+--------+---------+
|5.0   |1      |296     |0          |0            |0       |1         |0            |0          |1           |0          |0          |0          |0         |0     

In [19]:
write_csv(ratings, './dataset')

In [21]:
session.stop()