# Conformal prediction for Recommenders

In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip -o ml-1m.zip

In [40]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Add, Dense, Concatenate, Embedding, Input, Lambda, Layer, Multiply, Reshape, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [2]:
users = pd.read_csv('data/ml-1m/users.dat', sep='::', engine='python', names=['userID', 'gender', 'age', 'occupation', 'zipCode'])
movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', engine='python', names=['movieID', 'title', 'genres'], encoding='latin-1')
ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', engine='python', names=['userID', 'movieID', 'rating', 'timestamp'])

In [3]:
print("rows:", users.shape[0])
users.head()

rows: 6040


Unnamed: 0,userID,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
print("rows:", movies.shape[0])
movies.head()

rows: 3883


Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
print("rows:", ratings.shape[0])
ratings.head()

rows: 1000209


Unnamed: 0,userID,movieID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
genres = movies["genres"].apply(lambda x: x.split("|"))
genres = chain.from_iterable(genres)
count_genres = dict(Counter(genres))
num_genres = len(count_genres)
print(f"num_genres: {num_genres}\n", dict(sorted(count_genres.items(), key=lambda item: item[1], reverse=True)))

num_genres: 18
 {'Drama': 1603, 'Comedy': 1200, 'Action': 503, 'Thriller': 492, 'Romance': 471, 'Horror': 343, 'Adventure': 283, 'Sci-Fi': 276, "Children's": 251, 'Crime': 211, 'War': 143, 'Documentary': 127, 'Musical': 114, 'Mystery': 106, 'Animation': 105, 'Fantasy': 68, 'Western': 68, 'Film-Noir': 44}


In [7]:
# lower=False?
tokenizer = Tokenizer(split='|', filters='')
tokenizer.fit_on_texts(movies["genres"].values)
tokenizer.word_index

{'drama': 1,
 'comedy': 2,
 'action': 3,
 'thriller': 4,
 'romance': 5,
 'horror': 6,
 'adventure': 7,
 'sci-fi': 8,
 "children's": 9,
 'crime': 10,
 'war': 11,
 'documentary': 12,
 'musical': 13,
 'mystery': 14,
 'animation': 15,
 'fantasy': 16,
 'western': 17,
 'film-noir': 18}

In [8]:
# is this the best way?? why not three-hot? with encoding_initializer='eye'

In [9]:
seqs = tokenizer.texts_to_sequences(movies["genres"].values)
seqs = pad_sequences(seqs, maxlen=3,padding='post').tolist()

In [10]:
idx = 2
print(movies["genres"].iloc[idx])
print(seqs[idx])

Comedy|Romance
[2, 5, 0]


In [11]:
movies["genres"] = seqs

In [12]:
matrix = pd.merge(pd.merge(ratings, users), movies)
matrix

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106,Weekend at Bernie's (1989),"[2, 0, 0]"
1000205,6040,1094,5,956704887,M,25,6,11106,"Crying Game, The (1992)","[1, 5, 11]"
1000206,6040,562,5,956704746,M,25,6,11106,Welcome to the Dollhouse (1995),"[2, 1, 0]"
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),"[1, 0, 0]"


In [13]:
#es podria separar deixant a test els mes recents

In [14]:
train, test = train_test_split(matrix, test_size=0.2, random_state=7)

In [15]:
def define_input_layers():
    # numerical features
    age = Input((1,))

    # single-level categorical features
    userID = Input((1,))
    movieID = Input((1,))

    # multi-level categorical features
    genres = Input((3,))

    return age, userID, movieID, genres

In [16]:
def Tensor_Mean_Pooling2(keepdims=False):
    def f(x, mask):
        mean = K.mean(x, axis=1, keepdims=keepdims)
        #tf.print("inputs2:", x)
        #tf.print("mask2:", mask)
        #tf.print("division2:", mean.shape)
        return mean
    return Lambda(f)

class Tensor_Mean_Pooling(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs, mask):
        #tf.print("inputs:", inputs.shape)
        #PROVAR AMB 2 UNICS INPUTS A L'ULTIMA CEL·LA I ANAR MIRANT
        tensor = tf.reshape(inputs, [-1, 3])
        mask = tf.cast(mask, inputs.dtype)
        #tf.print("mask:", mask.shape)
        masked_inputs = tf.multiply(tensor, mask)
        #tf.print("masked_tensor:", masked_inputs)
        masked_sum = K.sum(masked_inputs, axis=1)
        valid_count = K.sum(mask, axis=1)
        division = tf.math.divide(masked_sum, valid_count)
        division = tf.reshape(division, [-1, 1])
        #tf.print("division:", division.shape)
        return division
        #tf.print("TMP:", division.shape)
        #return tf.reshape(division, [-1, 1])

    def compute_mask(self, inputs, mask=None):
        return mask

def first_order_interactions(inputs, max_uid, max_mid, num_genres):
    age, userID, movieID, genres = inputs

    # all tensors are reshaped to (None, 1)
    dense_age = Dense(1)(age)

    embedded_uid = Embedding(max_uid+1, 1)(userID)
    reshaped_uid = Reshape((1,))(embedded_uid)

    embedded_mid = Embedding(max_mid+1, 1)(movieID)
    reshaped_mid = Reshape((1,))(embedded_mid)

    embedded_genres = Embedding(num_genres+1, 1, mask_zero=True)(genres)
    genres_mean = Tensor_Mean_Pooling()(embedded_genres, embedded_genres._keras_mask)

    y_1st_order = Add()([dense_age, reshaped_uid, reshaped_mid, genres_mean])
    return y_1st_order

In [17]:
#TODO: CHANGE AGE TO OCCUPATION

In [18]:
genres = Input((3,))
embedded_genres = Embedding(19, 1, mask_zero=True)(genres)
mask = embedded_genres._keras_mask

In [19]:
tmp = Tensor_Mean_Pooling()(embedded_genres, mask)
model = Model(inputs=genres, outputs=tmp)

In [20]:
model.predict(np.array([[18, 18, 18]]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step


array([[0.03914554]], dtype=float32)

In [21]:
model.predict(np.array([[18, 0, 0]]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


array([[0.03914554]], dtype=float32)

In [22]:
def second_order_interactions(inputs, max_uid, max_mid, num_genres, k):
    age, userID, movieID, genres = inputs

    dense_age = Dense(k)(age)  # shape (None, k)
    reshaped_age = Reshape((1,k))(dense_age)  # shape (None, 1, k)

    embedded_uid = Embedding(max_uid+1, k)(userID)

    embedded_mid = Embedding(max_mid+1, k)(movieID)  # shape (None, 1, k)

    embedded_genres = Embedding(num_genres+1, k)(genres)  # shape (None, 3, k)
    genres_mean = Tensor_Mean_Pooling2(keepdims=True)(embedded_genres)  # shape (None, 1, k)

    # concatenate all 2d embedded layers => (None, ?, k)
    embedded_2d = Concatenate(axis=1)([reshaped_age, embedded_uid, embedded_mid, genres_mean])

    # calcuate the interactions by simplication
    # sum of (x1*x2) = 0.5*[sum of (xi)^2 - sum of (xi^2)]
    tensor_sum = Lambda(lambda x: K.sum(x, axis=1))
    tensor_square = Lambda(lambda x: K.square(x))

    sum_of_embedded = tensor_sum(embedded_2d)
    square_of_embedded = tensor_square(embedded_2d)

    square_of_sum = Multiply()([sum_of_embedded, sum_of_embedded])
    sum_of_square = tensor_sum(square_of_embedded)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    y_2nd_order = Reshape((1,))(tensor_sum(sub))
    
    return y_2nd_order, embedded_2d

## Put together

In [23]:
def fm_model(max_uid, max_mid, num_genres, k, dnn_dr):
    inputs = define_input_layers()

    y_1st_order = first_order_interactions(inputs, max_uid, max_mid, num_genres)
    y_2nd_order, embedded_2d = second_order_interactions(inputs, max_uid, max_mid, num_genres, k)

    # combined deep and fm parts
    y = Concatenate()([y_1st_order, y_2nd_order])
    y = Dense(1)(y)

    fm_model_1d = Model(inputs, y_1st_order)
    fm_model_2d = Model(inputs, y_2nd_order)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [24]:
params = {
    'max_uid': ratings["userID"].max(),
    'max_mid': ratings["movieID"].max(),
    'num_genres': 18,
    'k':20,
    'dnn_dr': 0.5
}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [25]:
params

{'max_uid': 6040, 'max_mid': 3952, 'num_genres': 18, 'k': 20, 'dnn_dr': 0.5}

In [26]:
def df2xy(ratings):
    x = [ratings["occupation"].values, 
         ratings["userID"].values, 
         ratings["movieID"].values, 
         np.concatenate(ratings["genres"].values).reshape(-1,3)]
    y = ratings["rating"].values
    return x,y

train_x, train_y = df2xy(train)
test_x, test_y = df2xy(test)

In [27]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
# train  model
fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm.weights.h5', 
                            monitor='val_loss',
                            save_weights_only=True, 
                            save_best_only=True)
#callbacks = [model_ckp,early_stop]
callbacks = [model_ckp]
train_history = fm_model.fit(train_x, train_y, 
                                  epochs=30, batch_size=2048, 
                                  validation_data=(test_x, test_y),
                                  callbacks = callbacks)

Epoch 1/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 25.7981 - val_loss: 0.9034
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.8840 - val_loss: 0.8744
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.8657 - val_loss: 0.8776
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.8612 - val_loss: 0.8740
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.8615 - val_loss: 0.8764
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.8619 - val_loss: 0.8788
Epoch 7/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.8638 - val_loss: 0.8751
Epoch 8/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.8613 - val_loss: 0.8743
Epoch 9/30
[1m391/391[0m [32m━━━━━━━

In [28]:
df_users = train.head(1)
user_id = df_users["userID"].iloc[0]

In [29]:
df_users

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
325594,1926,34,4,974692704,F,1,0,95404,Babe (1995),"[9, 2, 1]"


In [30]:
#aixo es per quedar-se nomes amb les que tenen 1 rating com a minim
all_movies = matrix.drop_duplicates(subset=["movieID"])

In [31]:
all_movies

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
919876,5556,2198,3,959445515,M,45,6,92103,Modulations (1998),"[12, 0, 0]"
940262,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),"[1, 0, 0]"
957826,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),"[1, 0, 0]"
970914,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),"[2, 1, 17]"


In [32]:
df_user = train[train["userID"] == user_id]

In [33]:
movies_watched = set(df_user["movieID"].unique())

In [34]:
movies_not_watched = all_movies[~all_movies["movieID"].isin(movies_watched)]

In [35]:
movies_not_watched

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
919876,5556,2198,3,959445515,M,45,6,92103,Modulations (1998),"[12, 0, 0]"
940262,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),"[1, 0, 0]"
957826,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),"[1, 0, 0]"
970914,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),"[2, 1, 17]"


In [36]:
predict_users = test["userID"].unique()

In [37]:
len(predict_users)

6037

In [42]:
def dcg(scores):
    return np.sum((2**scores - 1) / np.log2(np.arange(1, scores.size + 1) + 1))

def ndcg(pred, true):
    return dcg(pred)/dcg(true)

ndcg_sum = 0
for user in tqdm(predict_users):
    test_movies = test[test["userID"] == user]
    pred, true = df2xy(test_movies)
    pred = fm_model.predict(pred, verbose=0).flatten()

    data = pd.DataFrame({
        "movieID": test_movies["movieID"].values,
        "pred": pred,
        "true": true
    })

    data.sort_values(by="pred", ascending=False, inplace=True)
    data = data[:60]
    data.sort_values(by="true", ascending=False, inplace=True)
    true_rel = np.arange(data.shape[0], 0, -1)
    data["relevance"] = true_rel
    data.sort_values(by="pred", ascending=False, inplace=True)

    ndcg_score = ndcg(data["relevance"].values, true_rel)
    #print(count, ndcg_score)
    #print(data)
    #print(f"p:{data['relevance'].values.tolist()}\na:{true_rel.tolist()}")
    ndcg_sum += ndcg_score

100%|███████████████████████████████████████| 6037/6037 [07:30<00:00, 13.40it/s]


In [44]:
ndcg_sum/len(predict_users)

0.7722454718097682

In [None]:
#calcular l'score també amb el mean TMP2 dolent

In [None]:
a = np.array([56, 61, 58, 59, 47, 70, 55, 51, 73, 44, 38, 54, 62, 57, 71, 49, 40, 50, 43, 76, 68, 64, 69, 39, 75, 74, 66, 60, 35, 41, 48, 52, 15, 36, 28, 72, 33, 11, 63, 16, 1, 32, 67, 37, 10, 24, 4, 34, 53, 26, 46, 42, 20, 27, 65, 29, 3, 19, 21, 14, 17, 18, 30, 13, 45, 9, 25, 31, 22, 2, 12, 5, 8, 23, 6, 7])
b = np.array([76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [None]:
ar = dcg(a)
br = dcg(b)

In [None]:
print(ar)
print(br)

In [142]:
a == b

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [145]:
dcg(np.array([2,1,3]))

7.130929753571458

In [163]:
dcg(np.arange(60, 0, -1))

1.774982893370761e+18

In [150]:
dcg(b)

4.62142049923247e+18

In [174]:
2**np.array([60])

array([1152921504606846976])

In [46]:
for user in predict_users:
    a, b = train[train['userID'] == user].shape[0], test[test['userID'] == user].shape[0]
    print(f"user: {user}, ({a}, {b})")
    if a - b <= 0:
        print("UY")
        break
    if b == 0:
        print("OPSIE")
        #break

user: 5972, (350, 87)
user: 5952, (107, 22)
user: 4933, (247, 55)
user: 2181, (731, 184)
user: 2513, (51, 15)
user: 2146, (84, 18)
user: 5111, (628, 172)
user: 801, (233, 52)
user: 929, (283, 72)
user: 3152, (23, 8)
user: 2907, (691, 158)
user: 2968, (142, 28)
user: 13, (90, 18)
user: 1883, (344, 91)
user: 5749, (331, 76)
user: 4735, (94, 12)
user: 5333, (659, 180)
user: 1155, (141, 41)
user: 2340, (318, 81)
user: 1131, (78, 25)
user: 534, (79, 24)
user: 5857, (164, 36)
user: 1335, (248, 59)
user: 3904, (88, 15)
user: 4884, (310, 78)
user: 3426, (93, 27)
user: 5493, (482, 91)
user: 2794, (111, 32)
user: 3907, (41, 11)
user: 4562, (238, 64)
user: 2472, (234, 56)
user: 5482, (296, 79)
user: 2205, (400, 109)
user: 2996, (396, 81)
user: 4014, (185, 40)
user: 2097, (136, 38)
user: 2457, (404, 95)
user: 1105, (120, 41)
user: 3957, (285, 66)
user: 302, (631, 157)
user: 1812, (535, 163)
user: 5455, (113, 22)
user: 4575, (91, 34)
user: 178, (88, 27)
user: 5393, (161, 49)
user: 1639, (441, 110)
