In [1]:
import numpy as np
import pandas as pd
import datetime
import recommendations as rec
import dataset
import sys
from sklearn.model_selection import train_test_split

In [2]:
def round_of_rating(number):
    """
    Snap ``number`` to the nearest multiple of 0.5.

    The value is doubled, rounded to a whole number with the built-in
    ``round`` and then halved again, so exact ties are resolved the way
    ``round`` resolves them on the doubled value.
    """
    doubled = number * 2
    nearest_whole = round(doubled)
    return nearest_whole / 2

In [3]:
def dictOfQuantities(prefs):
    """
    Map every key of ``prefs`` to the length of the value stored under it.

    :param prefs: mapping whose values support ``len``.
    :return: a new dict ``{key: len(prefs[key])}``.
    """
    return {name: len(prefs[name]) for name in prefs}

In [4]:
prefs_f = dataset.openJson('datasets/prefs_filtered.json')

In [5]:
users_f = dataset.transformPrefs(prefs_f)

In [6]:
len(users_f)

248781

In [7]:
quan = dictOfQuantities(users_f)

In [9]:
# One row per key of `quan`; the single 'count' column stores the mapped
# value as an unsigned 16-bit integer.
users_pd = pd.DataFrame.from_dict(
    quan,
    orient='index',
    columns=['count'],
    dtype='uint16',
)

In [23]:
print(round(sys.getsizeof(users_pd)/float(1024**2), 4), 'MB')

25.3245 MB


In [24]:
users_pd.shape

(248781, 1)

In [26]:
users_pd.columns

RangeIndex(start=0, stop=1, step=1)

In [31]:
type(users_pd)

pandas.core.frame.DataFrame

#### Calculating label for stratified train-test split

In [10]:
# Bucket every row by its 'count' value; the resulting label is what the
# train/test split is stratified on. The thresholds look like quartile cut
# points of 'count' (the four groups come out roughly equal in size) -- confirm.
#
# Both bounds of each range are inclusive, which is the default behaviour of
# Series.between. The former `inclusive=True` flag is intentionally omitted:
# pandas 2.x only accepts 'both' / 'neither' / 'left' / 'right' for that
# argument and raises ValueError when given a bool.
users_pd['label'] = np.select(
    [
        users_pd['count'].between(0, 16),
        users_pd['count'].between(17, 35),
        users_pd['count'].between(36, 102),
        users_pd['count'].between(103, 1000000),
    ],
    ['q1', 'q2', 'q3', 'q4'],
    default='unknown',  # assigned when a count falls outside all four ranges
)

In [11]:
y = users_pd['label']

In [12]:
len(y)

248781

In [13]:
y.head(10)

100004    q4
100008    q4
10001     q1
100015    q1
100018    q1
100021    q4
100022    q4
100039    q3
100044    q3
100046    q4
Name: label, dtype: object

In [14]:
users_pd[['count', 'label']].head(10)

Unnamed: 0,count,label
100004,282,q4
100008,307,q4
10001,11,q1
100015,13,q1
100018,16,q1
100021,2205,q4
100022,292,q4
100039,50,q3
100044,40,q3
100046,230,q4


In [15]:
# Strip both columns so only the index is left to be split; the labels were
# already copied into `y`. Rebinding with errors='ignore' (instead of an
# in-place drop) lets this cell be re-run without a KeyError once the columns
# are gone.
users_pd = users_pd.drop(columns=['count', 'label'], errors='ignore')

In [18]:
# y.hist(bins=4)

In [16]:
y.groupby(by=y).count()

label
q1    65527
q2    59217
q3    61974
q4    62063
Name: label, dtype: int64

In [17]:
len(y)

248781

In [18]:
len(users_pd)

248781

### Performing train-test split

In [19]:
# Hold out 20% of the rows; `stratify=y` keeps the label proportions the same
# in both parts and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    users_pd,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [20]:
y_train.groupby(y_train).count()

label
q1    52421
q2    47374
q3    49579
q4    49650
Name: label, dtype: int64

In [21]:
y_test.groupby(y_test).count()

label
q1    13106
q2    11843
q3    12395
q4    12413
Name: label, dtype: int64

### Dump train and test sets to files

In [24]:
# Write the test split through the project's JSON helper as a dict keyed by
# row index ({index: {column: value}}).
dataset.savePrefsToJson(
    prefs=X_test.to_dict(orient='index'),
    target_file='datasets/test_set_dict.json',
)

In [25]:
# Write the train split through the project's JSON helper as a dict keyed by
# row index ({index: {column: value}}).
dataset.savePrefsToJson(
    prefs=X_train.to_dict(orient='index'),
    target_file='datasets/train_set_dict.json',
)

In [None]:
del X_train

### Load files again and convert data to dict

In [None]:
def createNonNullsDict(df):
    """
    Unimplemented placeholder.

    Only the signature of this function was ever written, which left the
    cell as a syntax error. It now fails loudly when called instead.

    TODO: implement. Presumably meant to turn ``df`` into a dict that leaves
    out null entries -- inferred from the name only, confirm before relying
    on it.

    :param df: input object (expected type not established by this notebook).
    :raises NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError('createNonNullsDict is not implemented yet')

In [None]:
# Rebuild the test frame from JSON: outer keys become the row index and all
# values are requested as float16.
# NOTE(review): the cells visible here only write test_set_dict.json;
# test_set_matrix.json must be produced elsewhere -- confirm the path.
X_test = pd.DataFrame.from_dict(
    dataset.openJson('datasets/test_set_matrix.json'),
    orient='index',
    dtype='float16',
)

In [None]:
X_test.head(5)

In [None]:
# `X_test_dict` is only assigned in a later cell, and the plain dict returned
# by DataFrame.to_dict has no .to_json() method. Convert inline and write with
# the same project helper the earlier dump cells use, so this cell runs on its
# own.
dataset.savePrefsToJson(target_file='datasets/test_set_dict.json',
                        prefs=X_test.to_dict(orient='index'))

In [None]:
X_test_dict = X_test.to_dict(orient='index')

In [None]:
# `X_test_dict` is a plain dict (result of DataFrame.to_dict) and therefore has
# no .to_json() method; write it with the project's JSON helper instead.
dataset.savePrefsToJson(target_file='datasets/test_set_dict.json',
                        prefs=X_test_dict)

#### dict to ndarray conversion

In [25]:
# # create a simple n-dimensional array
# test_array = np.zeros([nr_rows, nr_cols], float)

In [26]:
# # i=0
# print(datetime.datetime.now())
# for movie_pos, popular_movie in enumerate(popular_movies):
# #     i+=1
#     # if i%100 == 0:
#     for user_pos, (user, ratings) in enumerate(users_dict.items()):
#         for movie, rating in ratings.items():
#             if popular_movie == movie:
#                 test_array.itemset((user_pos, movie_pos), rating)
# #                 print(datetime.datetime.now(), '----- user:', user, '--- movie:', popular_movie, '--- rating:', rating)
                
# test_array[test_array == 0] = np.NaN
# print(datetime.datetime.now())

In [27]:
# np.save(file='datasets/users_filtered_ratings.npy')
# print(datetime.datetime.now())

In [28]:
# print('max of whole array:', np.nanmax(test_array))
# print('max among users:', [x for x in np.nanmax(test_array, axis=0).tolist() if ~np.isnan(x)])
# print('min among users:', [x for x in np.nanmin(test_array, axis=0).tolist() if ~np.isnan(x)])