# Conformal prediction for Recommenders

In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip -o ml-1m.zip

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Add, Dense, Concatenate, Embedding, Input, Lambda, Layer, Multiply, Reshape, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [3]:
users = pd.read_csv('data/ml-1m/users.dat', sep='::', engine='python', names=['userID', 'gender', 'age', 'occupation', 'zipCode'])
movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', engine='python', names=['movieID', 'title', 'genres'], encoding='latin-1')
ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', engine='python', names=['userID', 'movieID', 'rating', 'timestamp'])

In [4]:
print("rows:", users.shape[0])
users.head()

rows: 6040


Unnamed: 0,userID,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
print("rows:", movies.shape[0])
movies.head()

rows: 3883


Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
print("rows:", ratings.shape[0])
ratings.head()

rows: 1000209


Unnamed: 0,userID,movieID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
genres = movies["genres"].apply(lambda x: x.split("|"))
genres = chain.from_iterable(genres)
count_genres = dict(Counter(genres))
num_genres = len(count_genres)
print(f"num_genres: {num_genres}\n", dict(sorted(count_genres.items(), key=lambda item: item[1], reverse=True)))

num_genres: 18
 {'Drama': 1603, 'Comedy': 1200, 'Action': 503, 'Thriller': 492, 'Romance': 471, 'Horror': 343, 'Adventure': 283, 'Sci-Fi': 276, "Children's": 251, 'Crime': 211, 'War': 143, 'Documentary': 127, 'Musical': 114, 'Mystery': 106, 'Animation': 105, 'Fantasy': 68, 'Western': 68, 'Film-Noir': 44}


In [8]:
# lower=False?
tokenizer = Tokenizer(split='|', filters='')
tokenizer.fit_on_texts(movies["genres"].values)
tokenizer.word_index

{'drama': 1,
 'comedy': 2,
 'action': 3,
 'thriller': 4,
 'romance': 5,
 'horror': 6,
 'adventure': 7,
 'sci-fi': 8,
 "children's": 9,
 'crime': 10,
 'war': 11,
 'documentary': 12,
 'musical': 13,
 'mystery': 14,
 'animation': 15,
 'fantasy': 16,
 'western': 17,
 'film-noir': 18}

In [9]:
# is this the best way?? why not three-hot? with encoding_initializer='eye'

In [10]:
seqs = tokenizer.texts_to_sequences(movies["genres"].values)
seqs = pad_sequences(seqs, maxlen=3,padding='post').tolist()

In [11]:
idx = 2
print(movies["genres"].iloc[idx])
print(seqs[idx])

Comedy|Romance
[2, 5, 0]


In [12]:
movies["genres"] = seqs

In [13]:
matrix = pd.merge(pd.merge(ratings, users), movies)
matrix

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106,Weekend at Bernie's (1989),"[2, 0, 0]"
1000205,6040,1094,5,956704887,M,25,6,11106,"Crying Game, The (1992)","[1, 5, 11]"
1000206,6040,562,5,956704746,M,25,6,11106,Welcome to the Dollhouse (1995),"[2, 1, 0]"
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),"[1, 0, 0]"


In [14]:
#es podria separar deixant a test els mes recents

In [15]:
train, test = train_test_split(matrix, test_size=0.2, random_state=7)

In [16]:
def define_input_layers():
    # numerical features
    age = Input((1,))

    # single-level categorical features
    userID = Input((1,))
    movieID = Input((1,))

    # multi-level categorical features
    genres = Input((3,))

    return age, userID, movieID, genres

In [17]:
def Tensor_Mean_Pooling2(keepdims=False):
    def f(x, mask):
        mean = K.mean(x, axis=1, keepdims=keepdims)
        #tf.print("inputs2:", x)
        #tf.print("mask2:", mask)
        #tf.print("division2:", mean.shape)
        return mean
    return Lambda(f)

class Tensor_Mean_Pooling(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs, mask):
        #tf.print("inputs:", inputs.shape)
        #PROVAR AMB 2 UNICS INPUTS A L'ULTIMA CEL·LA I ANAR MIRANT
        tensor = tf.reshape(inputs, [-1, 3])
        mask = tf.cast(mask, inputs.dtype)
        #tf.print("mask:", mask.shape)
        masked_inputs = tf.multiply(tensor, mask)
        #tf.print("masked_tensor:", masked_inputs)
        masked_sum = K.sum(masked_inputs, axis=1)
        valid_count = K.sum(mask, axis=1)
        division = tf.math.divide(masked_sum, valid_count)
        division = tf.reshape(division, [-1, 1])
        #tf.print("division:", division.shape)
        return division
        #tf.print("TMP:", division.shape)
        #return tf.reshape(division, [-1, 1])

    def compute_mask(self, inputs, mask=None):
        return mask

def first_order_interactions(inputs, max_uid, max_mid, num_genres):
    age, userID, movieID, genres = inputs

    # all tensors are reshaped to (None, 1)
    dense_age = Dense(1)(age)

    embedded_uid = Embedding(max_uid+1, 1)(userID)
    reshaped_uid = Reshape((1,))(embedded_uid)

    embedded_mid = Embedding(max_mid+1, 1)(movieID)
    reshaped_mid = Reshape((1,))(embedded_mid)

    embedded_genres = Embedding(num_genres+1, 1, mask_zero=True)(genres)
    genres_mean = Tensor_Mean_Pooling()(embedded_genres, embedded_genres._keras_mask)

    y_1st_order = Add()([dense_age, reshaped_uid, reshaped_mid, genres_mean])
    return y_1st_order

In [18]:
#TODO: CHANGE AGE TO OCCUPATION

In [19]:
genres = Input((3,))
embedded_genres = Embedding(19, 1, mask_zero=True)(genres)
mask = embedded_genres._keras_mask

In [20]:
tmp = Tensor_Mean_Pooling()(embedded_genres, mask)
model = Model(inputs=genres, outputs=tmp)




In [21]:
model.predict(np.array([[18, 18, 18]]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step


array([[0.00989604]], dtype=float32)

In [22]:
model.predict(np.array([[18, 0, 0]]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


array([[0.00989604]], dtype=float32)

In [23]:
def second_order_interactions(inputs, max_uid, max_mid, num_genres, k):
    age, userID, movieID, genres = inputs

    dense_age = Dense(k)(age)  # shape (None, k)
    reshaped_age = Reshape((1,k))(dense_age)  # shape (None, 1, k)

    embedded_uid = Embedding(max_uid+1, k)(userID)

    embedded_mid = Embedding(max_mid+1, k)(movieID)  # shape (None, 1, k)

    embedded_genres = Embedding(num_genres+1, k)(genres)  # shape (None, 3, k)
    genres_mean = Tensor_Mean_Pooling2(keepdims=True)(embedded_genres)  # shape (None, 1, k)

    # concatenate all 2d embedded layers => (None, ?, k)
    embedded_2d = Concatenate(axis=1)([reshaped_age, embedded_uid, embedded_mid, genres_mean])

    # calcuate the interactions by simplication
    # sum of (x1*x2) = 0.5*[sum of (xi)^2 - sum of (xi^2)]
    tensor_sum = Lambda(lambda x: K.sum(x, axis=1))
    tensor_square = Lambda(lambda x: K.square(x))

    sum_of_embedded = tensor_sum(embedded_2d)
    square_of_embedded = tensor_square(embedded_2d)

    square_of_sum = Multiply()([sum_of_embedded, sum_of_embedded])
    sum_of_square = tensor_sum(square_of_embedded)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    y_2nd_order = Reshape((1,))(tensor_sum(sub))
    
    return y_2nd_order, embedded_2d

## Put together

In [24]:
def fm_model(max_uid, max_mid, num_genres, k, dnn_dr):
    inputs = define_input_layers()

    y_1st_order = first_order_interactions(inputs, max_uid, max_mid, num_genres)
    y_2nd_order, embedded_2d = second_order_interactions(inputs, max_uid, max_mid, num_genres, k)

    # combined deep and fm parts
    y = Concatenate()([y_1st_order, y_2nd_order])
    y = Dense(1)(y)

    fm_model_1d = Model(inputs, y_1st_order)
    fm_model_2d = Model(inputs, y_2nd_order)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [25]:
params = {
    'max_uid': ratings["userID"].max(),
    'max_mid': ratings["movieID"].max(),
    'num_genres': 18,
    'k':20,
    'dnn_dr': 0.5
}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [26]:
params

{'max_uid': 6040, 'max_mid': 3952, 'num_genres': 18, 'k': 20, 'dnn_dr': 0.5}

In [27]:
def df2xy(ratings):
    x = [ratings["occupation"].values, 
         ratings["userID"].values, 
         ratings["movieID"].values, 
         np.concatenate(ratings["genres"].values).reshape(-1,3)]
    y = ratings["rating"].values
    return x,y

train_x, train_y = df2xy(train)
test_x, test_y = df2xy(test)

In [28]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
# train  model
fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm.weights.h5', 
                            monitor='val_loss',
                            save_weights_only=True, 
                            save_best_only=True)
#callbacks = [model_ckp,early_stop]
callbacks = [model_ckp]
train_history = fm_model.fit(train_x, train_y, 
                                  epochs=30, batch_size=2048, 
                                  validation_data=(test_x, test_y),
                                  callbacks = callbacks)

Epoch 1/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 4.0240 - val_loss: 1.0288
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.9738 - val_loss: 0.9057
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8839 - val_loss: 0.8669
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8518 - val_loss: 0.8494
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8335 - val_loss: 0.8391
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.8214 - val_loss: 0.8316
Epoch 7/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8139 - val_loss: 0.8245
Epoch 8/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8051 - val_loss: 0.8185
Epoch 9/30
[1m391/391[0m [32m━━━━━━━━

In [29]:
df_users = train.head(1)
user_id = df_users["userID"].iloc[0]

In [30]:
df_users

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
325594,1926,34,4,974692704,F,1,0,95404,Babe (1995),"[9, 2, 1]"


In [31]:
#aixo es per quedar-se nomes amb les que tenen 1 rating com a minim
all_movies = matrix.drop_duplicates(subset=["movieID"])

In [32]:
all_movies

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
919876,5556,2198,3,959445515,M,45,6,92103,Modulations (1998),"[12, 0, 0]"
940262,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),"[1, 0, 0]"
957826,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),"[1, 0, 0]"
970914,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),"[2, 1, 17]"


In [33]:
df_user = train[train["userID"] == user_id]

In [34]:
movies_watched = set(df_user["movieID"].unique())

In [35]:
movies_not_watched = all_movies[~all_movies["movieID"].isin(movies_watched)]

In [36]:
movies_not_watched

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
919876,5556,2198,3,959445515,M,45,6,92103,Modulations (1998),"[12, 0, 0]"
940262,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),"[1, 0, 0]"
957826,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),"[1, 0, 0]"
970914,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),"[2, 1, 17]"


In [37]:
predict_users = test["userID"].unique()

In [38]:
len(predict_users)

6037

In [39]:
def dcg(scores):
    return np.sum((2**scores - 1) / np.log2(np.arange(1, scores.size + 1) + 1))

def ndcg(pred, true):
    return dcg(pred)/dcg(true)

In [47]:
ndcg_sum = 0
for user in tqdm(predict_users):
    test_movies = test[test["userID"] == user]
    pred, true = df2xy(test_movies)
    pred = fm_model.predict(pred, verbose=0).flatten()

    data = pd.DataFrame({
        "movieID": test_movies["movieID"].values,
        "pred": pred,
        "true": true
    })

    data.sort_values(by="pred", ascending=False, inplace=True)
    data = data[:60]
    data.sort_values(by="true", ascending=False, inplace=True)
    true_rel = np.arange(data.shape[0], 0, -1)
    data["relevance"] = true_rel
    data.sort_values(by="pred", ascending=False, inplace=True)

    ndcg_score = ndcg(data["relevance"].values, true_rel)
    #print(count, ndcg_score)
    #print(data)
    #print(f"p:{data['relevance'].values.tolist()}\na:{true_rel.tolist()}")
    ndcg_sum += ndcg_score

  0%|          | 0/6037 [00:00<?, ?it/s]

  1%|▏         | 85/6037 [00:08<09:52, 10.04it/s]


KeyboardInterrupt: 

In [None]:
tmp2_score = ndcg_sum/len(predict_users)
tmp2_score

0.0

In [None]:
tmp_score = ndcg_sum/len(predict_users)
tmp_score

In [49]:
ndcg_sum = 0
for user in tqdm(predict_users):
    test_movies = test[test["userID"] == user]
    pred, true = df2xy(test_movies)
    pred = fm_model.predict(pred, verbose=0).flatten()

    data = pd.DataFrame({
        "movieID": test_movies["movieID"].values,
        "pred": pred,
        "true": true
    })

    data.sort_values(by="pred", ascending=False, inplace=True)
    data = data[:5]
    data.sort_values(by="true", ascending=False, inplace=True)
    true_rel = np.arange(data.shape[0], 0, -1)
    data["relevance"] = true_rel
    data.sort_values(by="pred", ascending=False, inplace=True)

    ndcg_score = ndcg(data["relevance"].values, true_rel)
    #print(count, ndcg_score)
    #print(data)
    #print(f"p:{data['relevance'].values.tolist()}\na:{true_rel.tolist()}")
    ndcg_sum += ndcg_score

100%|██████████| 6037/6037 [07:42<00:00, 13.05it/s]


In [50]:
tmp_score5 = ndcg_sum/len(predict_users)
tmp_score5

0.8930048422033964

In [None]:
#calcular l'score també amb el mean TMP2 dolent
#el paper indi no serveix perque es amb feedback unicament de precedencia

In [40]:
calib_users, test_users = predict_users[:100], predict_users[100:]

In [41]:
calib = test[test["userID"].isin(calib_users)]

In [42]:
calib

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
989001,5972,593,5,956952291,F,25,20,55428,"Silence of the Lambs, The (1991)","[1, 4, 0]"
984978,5952,2401,4,957145342,F,45,1,78231,Pale Rider (1985),"[17, 0, 0]"
820569,4933,1805,2,1011684938,M,25,15,94040,Wild Things (1998),"[1, 14, 4]"
373691,2181,587,2,975634724,M,25,0,45245,Ghost (1990),"[2, 5, 4]"
417291,2513,1641,5,974072036,M,56,1,37922,"Full Monty, The (1997)","[2, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...
971854,5854,547,4,958275349,M,45,7,33135,Surviving the Game (1994),"[3, 7, 4]"
951815,5749,2682,4,962843316,M,25,2,94117,Limbo (1999),"[1, 0, 0]"
552681,3401,524,4,980118995,M,35,7,76109,Rudy (1993),"[1, 0, 0]"
813592,4884,3697,2,962745564,M,35,14,90266,Predator 2 (1990),"[3, 8, 4]"


In [43]:
s = calib.groupby("userID").size()

In [95]:
s[s >= 4]

userID
13       18
48      113
69       11
178      27
272      94
       ... 
5854     90
5857     36
5878    109
5952     22
5972     87
Length: 100, dtype: int64

In [44]:
def softmax(x):
    # Subtracting the maximum value for numerical stability
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

# Example usage
x = np.array([1.0, 2.0, 3.0])
softmax_values = softmax(x)
print(softmax_values)

[0.09003057 0.24472847 0.66524096]


In [57]:
def normalize(x):
    return 2**x

In [58]:
def sum_until_rating(pred, true, rating):
    sum_value = 0
    for p, t in zip(pred, true):
        sum_value += p
        if t == rating:
            break
    return sum_value

list_sum_ratings = []
for user in calib_users:
    calib_movies = test[test["userID"] == user]
    pred, true = df2xy(calib_movies)
    pred = fm_model.predict(pred, verbose=0).flatten()

    data = pd.DataFrame({
        "movieID": calib_movies["movieID"].values,
        "pred": pred,
        "true": true
    })

    #data.sort_values(by="pred", ascending=False, inplace=True)
    data = data.sample(min(5, data.shape[0]))
    #data.sort_values(by="true", ascending=False, inplace=True)
    #true_rel = np.arange(data.shape[0], 0, -1)
    #data["relevance"] = true_rel
    data.sort_values(by="pred", ascending=False, inplace=True)
    data["pred"] = normalize(data["pred"].values)
    sur = sum_until_rating(data["pred"].values, data["true"].values, data["true"].max())
    list_sum_ratings.append(sur)
    print(sur)
    #print(count, ndcg_score)
    print(data)
    #print(f"p:{data['relevance'].values.tolist()}\na:{true_rel.tolist()}")

36.1640567779541
    movieID       pred  true
77     2529  18.360205     4
50     1610  17.803852     5
32     2396  16.634554     5
2      2502  14.698814     4
43     2088   5.461722     3
31.005905151367188
    movieID       pred  true
4       951  31.005905     5
16      928  28.883408     5
12     3365  19.843204     5
15     2571  19.019938     4
18     1129  11.537521     4
20.3731689453125
    movieID       pred  true
10     3627  20.373169     5
25     2064  14.199003     4
52      592  13.340383     4
9      3633  11.020936     2
46     2145   6.164054     3
40.858612060546875
     movieID       pred  true
12      2019  24.085711     4
73      1950  16.772902     5
171     3081  15.424977     4
132     3104   9.051175     4
65      2091   5.897467     3
27.363346099853516
    movieID       pred  true
6      2858  27.363346     5
11     1285  18.783113     4
12     2065  15.356855     4
10     3893  13.334956     4
7      3809   9.886120     3
17.374113082885742
    movieID   

In [59]:
q_hat = np.quantile(list_sum_ratings, 0.91, method='higher')

In [60]:
q_hat

43.25739860534668

In [50]:
#q_hat = 5 - q_hat

In [61]:
def recommend(movies, pred, q_hat):
    recom = []
    sum_value = 0
    for m, p in zip(movies, pred):
        if sum_value >= q_hat:
            break
        recom.append(m)
        sum_value += p
    return recom

count = 0
for user in test_users:
    if count == 100:
        break
    test_movies = test[test["userID"] == user]
    pred, true = df2xy(test_movies)
    pred = fm_model.predict(pred, verbose=0).flatten()

    data = pd.DataFrame({
        "movieID": test_movies["movieID"].values,
        "pred": pred,
        "true": true
    })

    #data.sort_values(by="pred", ascending=False, inplace=True)
    data = data.sample(min(5, data.shape[0]))
    #data.sort_values(by="true", ascending=False, inplace=True)
    #true_rel = np.arange(data.shape[0], 0, -1)
    #data["relevance"] = true_rel
    data.sort_values(by="pred", ascending=False, inplace=True)
    data["pred"] = normalize(data["pred"].values)
    recom = recommend(data["movieID"].values, data["pred"].values, q_hat)
    print(recom)
    #print(count, ndcg_score)
    print(data)
    #print(f"p:{data['relevance'].values.tolist()}\na:{true_rel.tolist()}")
    count += 1

[39, 3739, 2463]
    movieID       pred  true
36       39  19.955257     5
53     3739  19.117046     5
38     2463  14.424256     3
29     1393  10.569013     4
52     3362  10.477839     4
[3361, 2167, 2770, 1410, 3392]
     movieID       pred  true
138     3361  12.303308     3
90      2167   7.552934     4
134     2770   6.242207     3
175     1410   5.756388     3
73      3392   3.286511     2
[3556, 3566, 3578]
    movieID       pred  true
12     3556  22.752636     3
11     3566  15.293009     4
9      3578  13.914979     5
14     1036   7.096591     5
4      3717   5.351755     3
[1387, 2058, 2193, 329]
    movieID       pred  true
33     1387  16.768446     3
22     2058  12.165657     3
32     2193  11.272675     3
2       329   7.231174     3
10     2808   6.827260     3
[1208, 3052, 2105]
    movieID       pred  true
6      1208  25.725849     5
13     3052  13.565973     4
46     2105   9.497556     4
25     1801   9.307976     3
29      653   8.639700     3
[2797, 73, 200