In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import tensorflow as tf

from spark import SparkSessionFactory, read_csv, write_csv, column_values, train_test_split, shuffle_df
from util import remove_dir


import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

# Prepare model input data

**Step 1**: Create a predefined spack session. this is used to create a pipeline that build the model input features. 

In [3]:
session = SparkSessionFactory.create()
session

**Note**: When create a spark session this run a new spark cluster with one instance in localhost. You can monitor instance jobs clicking the **Spark UI** link. 

**Step 2**: Load raw dataset to a spark dataset. 

In [4]:
TEMP_PATH = './temp'
DATASET_PATH = f'{TEMP_PATH}/dataset'
TRAIN_PATH = f'{TEMP_PATH}/train'
VAL_PATH = f'{TEMP_PATH}/val'
TEST_PATH = f'{TEMP_PATH}/test'

In [5]:
dataset = read_csv(session, f'{DATASET_PATH}/*.csv')

In [6]:
dataset = shuffle_df(dataset)

**Step 2**: Get all users and movies ids and let's see how many elements has each.

In [7]:
n_users = dataset.select('user_seq').distinct().count()
n_movies = dataset.select('movie_seq').distinct().count()

print(f'Train users: {"{:,}".format(n_users)}.')
print(f'Train movies: {"{:,}".format(n_movies)}.')

Train users: 7,474.
Train movies: 40,099.


**Step 4:** Let's see all dataset columns.

In [8]:
dataset.columns

['rating',
 'user_id',
 'movie_id',
 'gen_comedy',
 'gen_children',
 'gen_western',
 'gen_thriller',
 'gen_romance',
 'gen_action',
 'gen_musical',
 'gen_imax',
 'gen_animation',
 'gen_fantasy',
 'gen_horror',
 'gen_film_noir',
 'gen_war',
 'gen_drama',
 'gen_mystery',
 'gen_sci_fi',
 'gen_none',
 'gen_crime',
 'gen_documentary',
 'gen_adventure',
 'user_seq',
 'movie_seq']

In [9]:
train_set, val_test_sets = train_test_split(dataset, test_size=0.3)
val_set, test_set = train_test_split(val_test_sets, test_size=0.1)

In [10]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

user_seq = f.udf(Sequencer(), t.IntegerType())
movie_seq = f.udf(Sequencer(), t.IntegerType())

In [11]:
train_set = train_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

train_set.select(f.col('user_seq'), f.col('movie_seq')) \
    .show(5, truncate=False)

+--------+---------+
|user_seq|movie_seq|
+--------+---------+
|0       |0        |
|1       |1        |
|2       |2        |
|3       |3        |
|4       |4        |
+--------+---------+
only showing top 5 rows



In [12]:
val_set = val_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

val_set.select(f.col('user_seq'), f.col('movie_seq')) \
    .show(5, truncate=False)

+--------+---------+
|user_seq|movie_seq|
+--------+---------+
|0       |0        |
|1       |1        |
|2       |2        |
|3       |3        |
|4       |4        |
+--------+---------+
only showing top 5 rows



In [13]:
test_set = test_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

test_set.select(f.col('user_seq'), f.col('movie_seq')) \
    .show(5, truncate=False)

+--------+---------+
|user_seq|movie_seq|
+--------+---------+
|0       |0        |
|1       |1        |
|2       |2        |
|3       |3        |
|4       |4        |
+--------+---------+
only showing top 5 rows



Get only test and val sampes for users an movies that appears in the train set:

In [14]:
user_movies_seqs = train_set.select('user_seq', 'movie_seq').distinct()

In [15]:
val_set.count()

6750525

In [16]:
val_set = val_set \
    .alias("d1") \
    .join(user_movies_seqs.alias("d2"), on=['user_seq', 'movie_seq'], how="left") \
    .select("d1.*")

In [17]:
val_set.count()

6750585

In [18]:
test_set.count()

749329

In [19]:
test_set = test_set \
    .alias("d1") \
    .join(user_movies_seqs.alias("d2"), on=['user_seq', 'movie_seq'], how="left") \
    .select("d1.*")

In [20]:
test_set.count()

749421

In [21]:
print(f'Train set size: {"{:,}".format(train_set.count())} samples.')
print(f'Validation set size: {"{:,}".format(val_set.count())} samples.')
print(f'Test set size: {"{:,}".format(test_set.count())} samples.', )

Train set size: 17,500,695 samples.
Validation set size: 6,750,225 samples.
Test set size: 749,323 samples.


In [22]:
remove_dir(TRAIN_PATH)
remove_dir(VAL_PATH)
remove_dir(TEST_PATH)

In [23]:
write_csv(train_set, TRAIN_PATH)
write_csv(val_set, VAL_PATH)
write_csv(test_set, TEST_PATH)

In [24]:
session.stop()