In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import tensorflow as tf

from spark import SparkSessionFactory, read_csv, write_csv, column_values, train_test_split, shuffle_df
from util import remove_dir


import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

# Prepare model input data

**Step 1**: Create a predefined spack session. this is used to create a pipeline that build the model input features. 

In [3]:
session = SparkSessionFactory.create()
session

In [4]:
session.sparkContext.getConf().getAll()

[('spark.driver.host', 'skynet.local'),
 ('spark.executor.instances', '12'),
 ('spark.executor.id', 'driver'),
 ('spark.executor.memory', '1G'),
 ('spark.driver.port', '40289'),
 ('spark.app.id', 'local-1613176573958'),
 ('spark.app.name', 'recommendations'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.memory', '4G'),
 ('spark.ui.showConsoleProgress', 'true')]

**Note**: When create a spark session this run a new spark cluster with one instance in localhost. You can monitor instance jobs clicking the **Spark UI** link. 

**Step 2**: Load raw dataset to a spark dataset. 

In [5]:
TEMP_PATH = './temp'
DATASET_PATH = f'{TEMP_PATH}/dataset'
TRAIN_PATH = f'{TEMP_PATH}/train'
VAL_PATH = f'{TEMP_PATH}/val'
TEST_PATH = f'{TEMP_PATH}/test'

In [6]:
dataset = read_csv(session, f'{DATASET_PATH}/*.csv')

In [7]:
dataset = shuffle_df(dataset)

**Step 2**: Get all users and movies ids and let's see how many elements has each.

In [8]:
n_users = dataset.select('user_seq').distinct().count()
n_movies = dataset.select('movie_seq').distinct().count()

print(f'Train users: {"{:,}".format(n_users)}.')
print(f'Train movies: {"{:,}".format(n_movies)}.')

Train users: 610.
Train movies: 9,724.


**Step 4:** Let's see all dataset columns.

In [9]:
dataset.columns

['rating',
 'user_id',
 'movie_id',
 'gen_romance',
 'gen_horror',
 'gen_sci_fi',
 'gen_comedy',
 'gen_children',
 'gen_documentary',
 'gen_film_noir',
 'gen_action',
 'gen_fantasy',
 'gen_drama',
 'gen_thriller',
 'gen_none',
 'gen_crime',
 'gen_war',
 'gen_musical',
 'gen_adventure',
 'gen_mystery',
 'gen_western',
 'gen_animation',
 'gen_imax',
 'user_seq',
 'movie_seq']

In [10]:
train_set, val_test_sets = train_test_split(dataset, test_size=0.1)
val_set, test_set = train_test_split(val_test_sets, test_size=0.3)

In [11]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

user_seq = f.udf(Sequencer(), t.IntegerType())
movie_seq = f.udf(Sequencer(), t.IntegerType())

In [12]:
def show_user_movie(data_set):
    return data_set \
        .select(f.col('user_seq'), f.col('movie_seq')) \
        .limit(5) \
        .toPandas()

In [13]:
show_user_movie(train_set)

Unnamed: 0,user_seq,movie_seq
0,110,1892
1,138,1295
2,297,6097
3,306,2500
4,417,1789


In [14]:
train_set = train_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(train_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,3,4


In [15]:
show_user_movie(val_set)

Unnamed: 0,user_seq,movie_seq
0,297,6097
1,598,5706
2,20,75
3,67,1817
4,86,3041


In [16]:
val_set = val_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(val_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [17]:
show_user_movie(test_set)

Unnamed: 0,user_seq,movie_seq
0,431,1036
1,94,3309
2,371,1407
3,27,2164
4,273,144


In [18]:
test_set = test_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(test_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


Get only test and val sampes for users an movies that appears in the train set:

In [19]:
user_movies_seqs = train_set.select('user_seq', 'movie_seq').distinct()

In [20]:
val_set_count_before = val_set.count()

In [21]:
val_set2 = val_set \
    .alias("d1") \
    .join(user_movies_seqs.alias("d2"), ['user_seq', 'movie_seq'], how='inner') \
    .select("d1.*")

In [22]:
val_set_count_after = val_set2.count()

Filter valication samples for movies and users that exist in train set:

In [23]:
val_set_count_before, val_set_count_after

(7045, 6871)

In [24]:
print('Filtered:', abs(val_set_count_before - val_set_count_after))

Filtered: 174


In [25]:
test_set_count_before = test_set.count()

In [26]:
test_set2 = test_set \
    .alias("d1") \
    .join(user_movies_seqs.alias("d2"), on=['user_seq', 'movie_seq'], how='inner') \
    .select("d1.*")

In [27]:
test_set_count_after = test_set2.count()

Filter test samples for movies and users that exist in train set:

In [28]:
test_set_count_before, test_set_count_after

(3043, 3003)

In [29]:
print('Filtered:', abs(test_set_count_before - test_set_count_after))

Filtered: 40


In [30]:
print(f'Train set size: {"{:,}".format(train_set.count())} samples.')
print(f'Validation set size: {"{:,}".format(val_set2.count())} samples.')
print(f'Test set size: {"{:,}".format(test_set2.count())} samples.', )

Train set size: 90,707 samples.
Validation set size: 6,886 samples.
Test set size: 3,027 samples.


In [31]:
remove_dir(TRAIN_PATH)
remove_dir(VAL_PATH)
remove_dir(TEST_PATH)

In [32]:
write_csv(train_set, TRAIN_PATH)
write_csv(val_set2, VAL_PATH)
write_csv(test_set2, TEST_PATH)

In [33]:
session.stop()