In [1]:
import numpy as np
import pandas as pd
import datetime
import recommendations as rec
import dataset
import sys
from sklearn.model_selection import train_test_split
import random

In [2]:
def round_of_rating(number):
    """
    Snap a number to the nearest multiple of 0.5.

    The value is doubled, rounded with the built-in ``round`` and halved
    again, so exact ties are resolved by the built-in's tie-breaking rule.
    """
    doubled = number * 2
    nearest_whole = round(doubled)
    return nearest_whole / 2

In [3]:
def dictOfQuantities(prefs):
    """
    Map every key of ``prefs`` to the length of the value stored under it.

    A new dict is returned; ``prefs`` itself is not modified.
    """
    return {name: len(prefs[name]) for name in prefs}

In [4]:
# Which pre-filtered dataset should be split? Both thresholds are interpolated
# into the dataset file name (the `u<...>` and `m<...>` parts).
# Presumably: minimum ratings per user / per movie used by the upstream filter
# -- TODO confirm against the notebook that produced the filtered file.
min_user_ratings = 6
min_movie_ratings = 69

In [5]:
# Base name (no directory, no extension) of the filtered-preferences dataset.
data_file = 'prefs_filtered_u{min_user_ratings}_m{min_movie_ratings}'.format(
    min_user_ratings=min_user_ratings,
    min_movie_ratings=min_movie_ratings,
)

In [6]:
# Load the filtered preferences from datasets/<data_file>.json through the
# project's `dataset` helper module.
prefs_f = dataset.openJson('datasets/{data_file}.json'.format(data_file=data_file))

In [7]:
# Re-key the preferences with dataset.transformPrefs.
# Presumably this turns the loaded mapping into a per-user mapping (the result
# is used as the user index below) -- TODO confirm in dataset.transformPrefs.
users_f = dataset.transformPrefs(prefs_f)

In [8]:
# Number of top-level entries in users_f.
len(users_f)

248781

In [9]:
# For every key of users_f, how many items are stored under it.
quan = dictOfQuantities(users_f)

In [10]:
# One row per key of `quan`; the single 'count' column holds that key's
# quantity as an unsigned 16-bit integer (values up to 65535).
users_pd = pd.DataFrame.from_dict(
    data=quan,
    orient='index',
    dtype='uint16',
    columns=['count'],
)

In [11]:
# Size of users_pd as reported by sys.getsizeof, converted from bytes to MB
# (divided by 1024**2) and rounded to four decimals.
print(round(sys.getsizeof(users_pd)/float(1024**2), 4), 'MB')

15.3245 MB


In [12]:
# (rows, columns) of users_pd.
users_pd.shape

(248781, 1)

In [13]:
# Column labels of users_pd.
users_pd.columns

Index(['count'], dtype='object')

In [14]:
# Confirm the container type of users_pd.
type(users_pd)

pandas.core.frame.DataFrame

#### Calculating label for stratified train-test split

In [15]:
# Column whose values define the strata, and the inclusive upper bounds of the
# first three strata (q1..q3); values above the last bound are labelled q4.
stratify_by = 'count'
levels = [16, 35, 102]

In [16]:
# Assign every row to one of four strata (q1..q4) based on the value in the
# `stratify_by` column. `levels` holds the inclusive upper bounds of q1..q3;
# everything above levels[2] is q4.
#
# `Series.between` is inclusive on both ends by default, so the `inclusive`
# argument is left out on purpose: the boolean form (`inclusive=True`) was
# deprecated in pandas 1.3 and is rejected by pandas 2.x, while the default
# behaves the same in every version.
#
# The top stratum uses an open-ended comparison instead of an arbitrary
# 1000000 ceiling, so no value can fall through to 'unknown' just because it
# is large. For the uint16 'count' column the result is unchanged.
users_pd['label'] = np.select(
    [
        users_pd[stratify_by].between(0, levels[0]),
        users_pd[stratify_by].between(levels[0] + 1, levels[1]),
        users_pd[stratify_by].between(levels[1] + 1, levels[2]),
        users_pd[stratify_by] > levels[2],
    ],
    [
        'q1',
        'q2',
        'q3',
        'q4',
    ],
    default='unknown'
)

In [17]:
# Stratification target: the 'label' column, one value per row of users_pd.
y = users_pd['label']

In [18]:
# Number of labels.
len(y)

248781

In [19]:
# Peek at the first ten labels.
y.head(10)

100004    q4
100008    q4
10001     q1
100015    q1
100018    q1
100021    q4
100022    q4
100039    q3
100044    q3
100046    q4
Name: label, dtype: object

In [20]:
# Spot-check the assigned labels against the counts for the first ten rows.
users_pd[['count', 'label']].head(10)

Unnamed: 0,count,label
100004,282,q4
100008,307,q4
10001,11,q1
100015,13,q1
100018,16,q1
100021,2205,q4
100022,292,q4
100039,50,q3
100044,40,q3
100046,230,q4


In [21]:
# users_pd.drop(labels=['count', 'label'], axis='columns', inplace=True)

In [22]:
# y.hist(bins=4)

In [23]:
# Number of rows per label, to see how balanced the strata are.
y.groupby(by=y).count()

label
q1    65527
q2    59217
q3    61974
q4    62063
Name: label, dtype: int64

In [24]:
# Sanity check: the number of labels ...
len(y)

248781

In [25]:
# ... should equal the number of rows in users_pd.
len(users_pd)

248781

### Performing train-test split

In [26]:
# Settings forwarded to train_test_split: the share of rows held out for the
# test part, and the seed that makes the split reproducible.
test_size = 0.2
random_state = 42

In [27]:
# Hold out `test_size` of the rows; `stratify=y` keeps the q1..q4 proportions
# the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    users_pd, y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

In [28]:
# Rows per label in the training part.
y_train.groupby(y_train).count()

label
q1    52421
q2    47374
q3    49579
q4    49650
Name: label, dtype: int64

In [29]:
# Rows per label in the test part.
y_test.groupby(y_test).count()

label
q1    13106
q2    11843
q3    12395
q4    12413
Name: label, dtype: int64

### Dump train and test sets to files

In [30]:
# Write the test part to JSON. The file name encodes every setting that
# produced it: filter thresholds, stratification column and levels, test share
# (as a percentage) and seed.
# NOTE(review): int(test_size*100) truncates, so a share such as 0.29 would be
# written as 28; the same expression is used wherever this name is rebuilt.
dataset.savePrefsToJson(
    target_file='datasets/train_test_sets/test_set_dict_u{min_user_ratings}_m{min_movie_ratings}_str_{stratify_by}_{levels}_ts{test_size}_ran{random_state}.json'.format(
        min_user_ratings=min_user_ratings,
        min_movie_ratings=min_movie_ratings,
        stratify_by=stratify_by,
        levels='_'.join([str(x) for x in levels]),
        test_size=int(test_size*100),
        random_state=random_state,
    ),
    prefs=X_test.to_dict(orient='index'),
)

In [31]:
# Write the training part to JSON. The file name encodes every setting that
# produced it: filter thresholds, stratification column and levels, test share
# (as a percentage) and seed.
# NOTE(review): int(test_size*100) truncates, so a share such as 0.29 would be
# written as 28; the same expression is used wherever this name is rebuilt.
dataset.savePrefsToJson(
    target_file='datasets/train_test_sets/train_set_dict_u{min_user_ratings}_m{min_movie_ratings}_str_{stratify_by}_{levels}_ts{test_size}_ran{random_state}.json'.format(
        min_user_ratings=min_user_ratings,
        min_movie_ratings=min_movie_ratings,
        stratify_by=stratify_by,
        levels='_'.join([str(x) for x in levels]),
        test_size=int(test_size*100),
        random_state=random_state,
    ),
    prefs=X_train.to_dict(orient='index'),
)

### Inner split of test / train set

In [44]:
# Which outer set should be split again within each user: 'train' or 'test'?
# The value becomes the prefix of the set file name that is loaded.
part_type = 'test'

In [45]:
# Base name (no directory, no extension) of the previously saved train/test
# set selected by `part_type`; rebuilt from the same settings used when saving.
set_file = '{train_test}_set_dict_u{min_user_ratings}_m{min_movie_ratings}_str_{stratify_by}_{levels}_ts{test_size}_ran{random_state}'.format(
    train_test=part_type,
    min_user_ratings=min_user_ratings,
    min_movie_ratings=min_movie_ratings,
    stratify_by=stratify_by,
    levels='_'.join([str(x) for x in levels]),
    test_size=int(test_size*100),
    random_state=random_state,
)

In [46]:
# Read the chosen set back from datasets/train_test_sets/<set_file>.json.
X = dataset.openJson(
    file='datasets/train_test_sets/{set_file}.json'.format(set_file=set_file)
)

In [47]:
# Load the filtered preferences again so this section does not depend on the
# variables created in the first half of the notebook.
prefs_f = dataset.openJson(
    'datasets/prefs_filtered_u{min_user_ratings}_m{min_movie_ratings}.json'.format(
        min_movie_ratings=min_movie_ratings,
        min_user_ratings=min_user_ratings,
    )
)

In [48]:
# Re-key the preferences with dataset.transformPrefs; the result is passed to
# the inner split as `users_prefs`.
# Presumably a per-user mapping -- TODO confirm in dataset.transformPrefs.
users_f = dataset.transformPrefs(prefs_f)

In [49]:
# Inner-split settings, passed to dataset.usersInnerSplit as `perc` and
# `random_state`. Presumably `perc` is the share of each user's entries that is
# held out -- TODO confirm in dataset.usersInnerSplit.
# NOTE(review): this rebinds `random_state`, the same name the set_file name
# was built from; run the cells in order so the loaded file name stays correct.
is_perc = 0.2
random_state = 42

In [50]:
# Split the loaded set a second time, inside the users it contains, using the
# full per-user preferences; yields a train portion and a test portion.
users_train, users_test = dataset.usersInnerSplit(
    set_dict=X,
    users_prefs=users_f,
    perc=is_perc,
    random_state=random_state,
)

start ----- 2019-04-26 17:23:28.913202
5000/49757 2019-04-26 17:23:33.710426
10000/49757 2019-04-26 17:23:38.413894
15000/49757 2019-04-26 17:23:42.398811
20000/49757 2019-04-26 17:23:43.945570
25000/49757 2019-04-26 17:23:44.570615
30000/49757 2019-04-26 17:23:45.008147
35000/49757 2019-04-26 17:23:45.336296
40000/49757 2019-04-26 17:23:45.555062
45000/49757 2019-04-26 17:23:45.711322
end ----- 2019-04-26 17:23:45.898818


In [51]:
# Persist the train portion of the inner split in the same directory as its
# source set; the source name plus the inner-split settings form the file name.
dataset.savePrefsToJson(
    target_file='datasets/train_test_sets/{body}_train_is{inner_split_perc}_ran{random_state}.json'.format(
        body=set_file,
        inner_split_perc=int(is_perc*100),
        random_state=random_state,
    ),
    prefs=users_train,
)

In [52]:
# Persist the test portion of the inner split in the same directory as its
# source set; the source name plus the inner-split settings form the file name.
dataset.savePrefsToJson(
    target_file='datasets/train_test_sets/{body}_test_is{inner_split_perc}_ran{random_state}.json'.format(
        body=set_file,
        inner_split_perc=int(is_perc*100),
        random_state=random_state,
    ),
    prefs=users_test,
)