In [60]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import tensorflow as tf

from spark import SparkSessionFactory, read_csv, write_csv, column_values, train_test_split, shuffle_df
from util import remove_dir


import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

# Prepare model input data

**Step 1**: Create a predefined spack session. this is used to create a pipeline that build the model input features. 

In [62]:
session = SparkSessionFactory.create()
session

In [63]:
session.sparkContext.getConf().getAll()

[('spark.driver.host', 'skynet.local'),
 ('spark.app.startTime', '1615035641975'),
 ('spark.executor.instances', '12'),
 ('spark.executor.id', 'driver'),
 ('spark.executor.memory', '1G'),
 ('spark.app.name', 'recommendations'),
 ('spark.app.id', 'local-1615035642143'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.memory', '4G'),
 ('spark.driver.port', '37783'),
 ('spark.ui.showConsoleProgress', 'true')]

**Note**: When create a spark session this run a new spark cluster with one instance in localhost. You can monitor instance jobs clicking the **Spark UI** link. 

**Step 2**: Load raw dataset to a spark dataset. 

In [64]:
TEMP_PATH = './temp'
DATASET_PATH = f'{TEMP_PATH}/dataset'
TRAIN_PATH = f'{TEMP_PATH}/train'
VAL_PATH = f'{TEMP_PATH}/val'
TEST_PATH = f'{TEMP_PATH}/test'

In [65]:
dataset = read_csv(session, f'{DATASET_PATH}/*.csv')

In [66]:
dataset = shuffle_df(dataset)

**Step 2**: Get all users and movies ids and let's see how many elements has each.

In [67]:
n_users = dataset.select('user_seq').distinct().count()
n_movies = dataset.select('movie_seq').distinct().count()

print(f'Train users: {"{:,}".format(n_users)}.')
print(f'Train movies: {"{:,}".format(n_movies)}.')

Train users: 610.
Train movies: 9,724.


**Step 4:** Let's see all dataset columns.

In [68]:
dataset.columns

['rating',
 'user_id',
 'movie_id',
 'gen_romance',
 'gen_horror',
 'gen_sci_fi',
 'gen_comedy',
 'gen_children',
 'gen_documentary',
 'gen_film_noir',
 'gen_action',
 'gen_fantasy',
 'gen_drama',
 'gen_thriller',
 'gen_none',
 'gen_crime',
 'gen_war',
 'gen_musical',
 'gen_adventure',
 'gen_mystery',
 'gen_western',
 'gen_animation',
 'gen_imax',
 'user_seq',
 'movie_seq']

In [69]:
# dataset_part, _ = train_test_split(dataset, test_size=0.8)
dataset_part = dataset
train_set, val_test_sets = train_test_split(dataset_part, test_size=0.3)
val_set, test_set = train_test_split(val_test_sets, test_size=0.3)

In [70]:
print(f'Train set size: {"{:,}".format(train_set.count())} samples.')
print(f'Validation set size: {"{:,}".format(val_set.count())} samples.')
print(f'Test set size: {"{:,}".format(test_set.count())} samples.')

Train set size: 70,441 samples.
Validation set size: 21,188 samples.
Test set size: 9,183 samples.


In [71]:
from sklearn.preprocessing import LabelEncoder

class Sequencer:
    def __init__(self):
        self.sequence = -1
        self.mapping = {}

    def __call__(self, value):
        if value in self.mapping:
            return self.mapping[value]

        self.sequence += 1
        self.mapping[value] = self.sequence
        return self.sequence

user_seq = f.udf(Sequencer(), t.IntegerType())
movie_seq = f.udf(Sequencer(), t.IntegerType())

In [72]:
def show_user_movie(data_set):
    return data_set \
        .select(f.col('user_seq'), f.col('movie_seq')) \
        .limit(5) \
        .toPandas()

In [73]:
show_user_movie(train_set)

Unnamed: 0,user_seq,movie_seq
0,27,2349
1,110,1848
2,209,16
3,225,5376
4,489,483


In [74]:
train_set = train_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(train_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [75]:
show_user_movie(val_set)

Unnamed: 0,user_seq,movie_seq
0,111,461
1,488,1486
2,18,1603
3,41,161
4,159,1538


In [76]:
val_set = val_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(val_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [77]:
show_user_movie(test_set)

Unnamed: 0,user_seq,movie_seq
0,489,483
1,159,1538
2,599,2087
3,129,19
4,447,4929


In [78]:
test_set = test_set \
    .withColumn('user_seq', user_seq(f.col('user_id'))) \
    .withColumn('movie_seq', movie_seq(f.col('movie_id')))

show_user_movie(test_set)

Unnamed: 0,user_seq,movie_seq
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


Get only test and val sampes for users an movies that appears in the train set:

In [79]:
user_movies_seqs = train_set.select('user_seq', 'movie_seq').distinct()

In [80]:
val_set_count_before = val_set.count()

In [81]:
#val_set2 = val_set \
#    .alias("d1") \
#    .join(user_movies_seqs.alias("d2"), ['user_seq'], how='inner') \
#    .select("d1.*")

#val_set2 = val_set2 \
#    .alias("d1") \
#    .join(user_movies_seqs.alias("d2"), ['movie_seq'], how='inner') \
#    .select("d1.*")
val_set2 = val_set

In [82]:
val_set_count_after = val_set2.count()

Filter valication samples for movies and users that exist in train set:

In [83]:
val_set_count_before, val_set_count_after

(21189, 21199)

In [84]:
print('Filtered:', abs(val_set_count_before - val_set_count_after))

Filtered: 10


In [85]:
test_set_count_before = test_set.count()

In [86]:
#test_set2 = test_set \
#    .alias("d1") \
#    .join(user_movies_seqs.alias("d2"), on=['user_seq'], how='inner') \
#    .select("d1.*")

#test_set2 = test_set2 \
#    .alias("d1") \
#    .join(user_movies_seqs.alias("d2"), on=['movie_seq'], how='inner') \
#    .select("d1.*")
test_set2 = test_set

In [87]:
test_set_count_after = test_set2.count()

Filter test samples for movies and users that exist in train set:

In [88]:
test_set_count_before, test_set_count_after

(9163, 9184)

In [89]:
print('Filtered:', abs(test_set_count_before - test_set_count_after))

Filtered: 21


In [90]:
print(f'Train set size: {"{:,}".format(train_set.count())} samples.')
print(f'Validation set size: {"{:,}".format(val_set2.count())} samples.')
print(f'Test set size: {"{:,}".format(test_set2.count())} samples.', )

Train set size: 70,449 samples.
Validation set size: 21,178 samples.
Test set size: 9,166 samples.


In [91]:
remove_dir(TRAIN_PATH)
remove_dir(VAL_PATH)
remove_dir(TEST_PATH)

'./temp/test'

In [92]:
write_csv(train_set, TRAIN_PATH)
write_csv(val_set2, VAL_PATH)
write_csv(test_set2, TEST_PATH)

In [93]:
session.stop()