In [20]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd

from spark import SparkSessionFactory, read_csv, write_csv, column_values, train_test_split, shuffle_df, Column
from util import remove_dir, Config, LoggerFactory

import pyspark.sql.types as t
import pyspark.sql as s
import pyspark.sql.functions as f

import logging

## Helpers

In [22]:
def log_counts(train_set, val_set, test_set):
    """Log the number of observations in each of the three splits.

    Each argument only needs a ``count()`` method (e.g. a Spark DataFrame).
    One INFO record is written per split, in train / validation / test order.
    """
    labelled_splits = (
        ('Train', train_set),
        ('Validation', val_set),
        ('Test', test_set),
    )
    for label, split in labelled_splits:
        logging.info(f'{label} set count = {split.count()} observations.')

def show(df, limit=5):
    """Return the first ``limit`` rows of ``df`` converted with ``toPandas()``.

    Handy as the last expression of a cell to get a rich table preview.
    """
    preview = df.limit(limit)
    return preview.toPandas()

def show_columns(df, columns=[], limit=5):
    """Preview only the named ``columns`` of ``df``.

    The columns are selected first, then the first ``limit`` rows are
    returned through ``show``. The list default is never mutated here.
    """
    selected = [f.col(name) for name in columns]
    return show(df.select(*selected), limit)

def show_counts(df, columns=[]):
    """Log the number of distinct values found in each of ``columns``.

    Writes a ``Count:`` header followed by one INFO line per column, in the
    order given. Each column triggers its own ``select/distinct/count``.
    """
    logging.info('Count:')
    for name in columns:
        distinct_total = df.select(name).distinct().count()
        logging.info(f'- {name}: {distinct_total}')
        
class TrainUserMovieFilter:
    """Keep only observations whose user and movie both occur in a train set.

    The distinct ``user_seq`` and ``movie_seq`` values of the train set are
    collected once, at construction time, and reused by every ``perform`` call.
    """

    def __init__(self, train_set):
        """Collect the distinct ``user_seq`` / ``movie_seq`` values of ``train_set``."""
        self.__train_user_seqs = self._distinct_values(train_set, 'user_seq')
        self.__train_movie_seqs = self._distinct_values(train_set, 'movie_seq')

    @staticmethod
    def _distinct_values(df, column):
        """Return the distinct values of ``column`` in ``df`` as a Python list."""
        return [row[column] for row in df.select(column).distinct().collect()]

    def perform(self, obs_set):
        """Return the rows of ``obs_set`` whose user and movie are known to the train set.

        Two INFO lines report how many rows each filtering step removed. The
        labels say "users" / "movies", but the numbers are dropped
        observations (rows), not distinct users or movies.
        """
        known_users = obs_set.filter(obs_set['user_seq'].isin(self.__train_user_seqs))
        known_users_and_movies = known_users.filter(
            known_users['movie_seq'].isin(self.__train_movie_seqs)
        )

        # Count each stage exactly once: every count() is a separate job, and
        # reusing the intermediate count keeps the two logged numbers consistent.
        total = obs_set.count()
        after_users = known_users.count()
        after_movies = known_users_and_movies.count()

        logging.info(f'Excluded users: {abs(total - after_users)}')
        logging.info(f'Excluded movies: {abs(after_users - after_movies)}')

        return known_users_and_movies

## Logger config

In [23]:
# Load the project settings from ../config/config.yaml.
config = Config(path='../config/config.yaml')
# Build the logger from the 'logger' section; being the last expression of
# the cell, the created logger object is echoed as the cell output.
LoggerFactory(config['logger']).create()

<RootLogger root (INFO)>

# Prepare model input data

**Step 1**: Create a preconfigured Spark session. It is used to build the pipeline that produces the model input features.

In [24]:
# Create the Spark session through the project's SparkSessionFactory.
session = SparkSessionFactory.create()
# Echo the session as the cell output.
session

In [25]:
# List every (key, value) pair of the running Spark configuration for inspection.
session.sparkContext.getConf().getAll()

[('spark.app.id', 'local-1646082088382'),
 ('spark.driver.memory', '16G'),
 ('spark.executor.instances', '12'),
 ('spark.sql.warehouse.dir',
  'file:/home/adrian/development/machine-learning/recommendations/user-movie-genres-model/spark-warehouse'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'recommendations'),
 ('spark.driver.port', '42073'),
 ('spark.driver.host', 'skynet'),
 ('spark.executor.memory', '16G'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.startTime', '1646082087965'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

**Note**: Creating a Spark session starts a new local Spark cluster with a single instance on localhost. You can monitor its jobs by clicking the **Spark UI** link.

**Step 2**: Load the raw dataset into a Spark DataFrame.

In [26]:
# Working directory for this notebook's input and output files (relative path).
TEMP_PATH = './temp'
# Input: the raw dataset CSV files are read from this directory.
DATASET_PATH = f'{TEMP_PATH}/dataset'
# Outputs: one directory per split, written at the end of the notebook.
TRAIN_PATH = f'{TEMP_PATH}/train'
VAL_PATH = f'{TEMP_PATH}/val'
TEST_PATH = f'{TEMP_PATH}/test'

In [27]:
# Read every CSV file under DATASET_PATH and pass the result through shuffle_df
# (presumably a random row shuffle — confirm in src/spark).
dataset = shuffle_df(read_csv(session, f'{DATASET_PATH}/*.csv'))

**Step 2 (cont.)**: Count the distinct user and movie ids to see how many of each the dataset contains.

In [28]:
# Log the number of distinct values in each id column.
show_counts(dataset, ['user_id', 'movie_id'])

2022-02-28 18:01:32 INFO Count:
2022-02-28 18:01:32 INFO - user_id: 610
2022-02-28 18:01:33 INFO - movie_id: 9724


**Step 3:** Let's see all dataset columns.

In [29]:
# Echo the column names of the loaded dataset.
dataset.columns

['rating',
 'user_id',
 'movie_id',
 'gen_comedy',
 'gen_drama',
 'gen_romance',
 'gen_action',
 'gen_adventure',
 'gen_sci_fi',
 'gen_crime',
 'gen_thriller',
 'gen_war',
 'gen_documentary',
 'gen_mystery',
 'gen_imax',
 'gen_horror',
 'gen_children',
 'gen_fantasy',
 'gen_animation',
 'gen_musical',
 'gen_film_noir',
 'gen_western',
 'gen_none']

**Step 4**: Add sequence index columns (`user_seq`, `movie_seq`) that act as new ids for users and movies.

In [30]:
# Optional down-sampling for quick experiments (currently disabled):
# dataset_part, _ = train_test_split(dataset, test_size=0.8)
# Use the full dataset.
dataset_part = dataset

In [31]:
# Add a 'user_seq' column derived from 'user_id'
# (see Column.sequence in src/spark for the exact numbering scheme).
dataset_part2 = Column.sequence(session, dataset_part, 'user_id', 'user_seq')

In [32]:
# Add a 'movie_seq' column derived from 'movie_id', on top of the frame that already has 'user_seq'.
dataset_part3 = Column.sequence(session, dataset_part2, 'movie_id', 'movie_seq')

In [33]:
# Sanity check: each *_seq column should have as many distinct values as its source id column.
show_counts(dataset_part3, ['user_id', 'user_seq', 'movie_id', 'movie_seq'])

2022-02-28 18:01:44 INFO Count:
2022-02-28 18:01:44 INFO - user_id: 610
2022-02-28 18:01:44 INFO - user_seq: 610
2022-02-28 18:01:46 INFO - movie_id: 9724
2022-02-28 18:01:46 INFO - movie_seq: 9724


**Step 5**: Split the data into train, validation and test sets.

In [34]:
# First split: the first result is the train set, the second is held out for validation + test.
# NOTE(review): no seed is passed here — confirm whether train_test_split is reproducible across runs.
train_set, val_test_sets = train_test_split(dataset_part3, test_size=0.3)
# Second split of the held-out part; per the counts logged below, the first
# result (validation) receives the larger share and the second (test) the smaller.
val_set, test_set = train_test_split(val_test_sets, test_size=0.3)

log_counts(train_set, val_set, test_set)

2022-02-28 18:01:47 INFO Train set count = 70691 observations.
2022-02-28 18:01:47 INFO Validation set count = 21115 observations.
2022-02-28 18:01:47 INFO Test set count = 9030 observations.


**Step 6**: Keep only the validation and test samples whose users and movies appear in the train set.

In [35]:
# Named `train_filter` rather than `filter` so the Python builtin `filter`
# is not shadowed in the notebook namespace for every later cell.
train_filter = TrainUserMovieFilter(train_set)

# Drop validation/test rows whose user or movie never occurs in the train set.
val_set2 = train_filter.perform(val_set)
test_set2 = train_filter.perform(test_set)

log_counts(train_set, val_set2, test_set2)

2022-02-28 18:01:53 INFO Excluded users: 0
2022-02-28 18:01:54 INFO Excluded movies: 985
2022-02-28 18:01:59 INFO Excluded users: 0
2022-02-28 18:02:00 INFO Excluded movies: 407
2022-02-28 18:02:01 INFO Train set count = 70691 observations.
2022-02-28 18:02:01 INFO Validation set count = 20130 observations.
2022-02-28 18:02:02 INFO Test set count = 8623 observations.


In [36]:
# Clear the three output directories before writing
# (presumably removes leftovers from a previous run — confirm remove_dir in src/util).
remove_dir(TRAIN_PATH)
remove_dir(VAL_PATH)
# Last expression of the cell: its return value is echoed as the cell output.
remove_dir(TEST_PATH)

'./temp/test'

In [37]:
# Persist each split as CSV. The train set is written as-is; validation and
# test use the versions restricted to users/movies present in the train set.
write_csv(train_set, TRAIN_PATH)
write_csv(val_set2, VAL_PATH)
write_csv(test_set2, TEST_PATH)

In [38]:
# Stop the Spark session now that all outputs have been written.
session.stop()