# Conformal prediction for Recommenders

In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip -o ml-1m.zip

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Add, Dense, Concatenate, Embedding, Input, Lambda, Layer, Multiply, Reshape, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

2024-05-16 03:54:06.650303: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
users = pd.read_csv('data/ml-1m/users.dat', sep='::', engine='python', names=['userID', 'gender', 'age', 'occupation', 'zipCode'])
movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', engine='python', names=['movieID', 'title', 'genres'], encoding='latin-1')
ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', engine='python', names=['userID', 'movieID', 'rating', 'timestamp'])

In [4]:
print("rows:", users.shape[0])
users.head()

rows: 6040


Unnamed: 0,userID,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
print("rows:", movies.shape[0])
movies.head()

rows: 3883


Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
print("rows:", ratings.shape[0])
ratings.head()

rows: 1000209


Unnamed: 0,userID,movieID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
genres = movies["genres"].apply(lambda x: x.split("|"))
genres = chain.from_iterable(genres)
count_genres = dict(Counter(genres))
num_genres = len(count_genres)
print(f"num_genres: {num_genres}\n", dict(sorted(count_genres.items(), key=lambda item: item[1], reverse=True)))

num_genres: 18
 {'Drama': 1603, 'Comedy': 1200, 'Action': 503, 'Thriller': 492, 'Romance': 471, 'Horror': 343, 'Adventure': 283, 'Sci-Fi': 276, "Children's": 251, 'Crime': 211, 'War': 143, 'Documentary': 127, 'Musical': 114, 'Mystery': 106, 'Animation': 105, 'Fantasy': 68, 'Western': 68, 'Film-Noir': 44}


In [8]:
# lower=False?
tokenizer = Tokenizer(split='|', filters='')
tokenizer.fit_on_texts(movies["genres"].values)
tokenizer.word_index

{'drama': 1,
 'comedy': 2,
 'action': 3,
 'thriller': 4,
 'romance': 5,
 'horror': 6,
 'adventure': 7,
 'sci-fi': 8,
 "children's": 9,
 'crime': 10,
 'war': 11,
 'documentary': 12,
 'musical': 13,
 'mystery': 14,
 'animation': 15,
 'fantasy': 16,
 'western': 17,
 'film-noir': 18}

In [9]:
# is this the best way?? why not three-hot? with encoding_initializer='eye'

In [10]:
seqs = tokenizer.texts_to_sequences(movies["genres"].values)
seqs = pad_sequences(seqs, maxlen=3,padding='post').tolist()

In [11]:
idx = 2
print(movies["genres"].iloc[idx])
print(seqs[idx])

Comedy|Romance
[2, 5, 0]


In [12]:
movies["genres"] = seqs

In [13]:
matrix = pd.merge(pd.merge(ratings, users), movies)
matrix

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106,Weekend at Bernie's (1989),"[2, 0, 0]"
1000205,6040,1094,5,956704887,M,25,6,11106,"Crying Game, The (1992)","[1, 5, 11]"
1000206,6040,562,5,956704746,M,25,6,11106,Welcome to the Dollhouse (1995),"[2, 1, 0]"
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),"[1, 0, 0]"


In [14]:
train, test = train_test_split(matrix, test_size=0.2, random_state=7)

In [15]:
def define_input_layers():
    # numerical features
    age = Input((1,))

    # single-level categorical features
    userID = Input((1,))
    movieID = Input((1,))

    # multi-level categorical features
    genres = Input((3,))

    return age, userID, movieID, genres

In [146]:
class Tensor_Mean_Pooling(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs, mask):
        tf.print("inputs:", inputs)
        #tensor = tf.reshape(inputs, [1, 3])
        mask = tf.cast(mask, inputs.dtype)
        tf.print("mask:", mask)
        masked_inputs = tf.multiply(inputs, mask)
        tf.print("masked_tensor:", masked_inputs)
        masked_sum = K.sum(masked_inputs, axis=1)
        valid_count = K.sum(mask, axis=1)
        division = tf.math.divide(masked_sum, valid_count)
        return division
        #tf.print("TMP:", division.shape)
        return tf.reshape(division, [1, 1])

    def compute_mask(self, inputs, mask=None):
        return mask

def first_order_interactions(inputs, max_uid, max_mid, num_genres):
    age, userID, movieID, genres = inputs

    # all tensors are reshaped to (None, 1)
    dense_age = Dense(1)(age)

    embedded_uid = Embedding(max_uid+1, 1)(userID)
    reshaped_uid = Reshape((1,))(embedded_uid)

    embedded_mid = Embedding(max_mid+1, 1)(movieID)
    reshaped_mid = Reshape((1,))(embedded_mid)

    embedded_genres = Embedding(num_genres+1, 1, mask_zero=True)(genres)
    genres_mean = Tensor_Mean_Pooling()(embedded_genres, embedded_genres._keras_mask)

    y_1st_order = Add()([dense_age, reshaped_uid, reshaped_mid, genres_mean])
    return y_1st_order

In [147]:
genres = Input((3,))
embedded_genres = Embedding(19, 1, mask_zero=True)(genres)
mask = embedded_genres._keras_mask

In [148]:
tmp = Tensor_Mean_Pooling()(embedded_genres, mask)
model = Model(inputs=genres, outputs=tmp)

In [149]:
model.predict(np.array([[18, 18, 18]]))

inputs: [[[-0.0340298414]
  [-0.0340298414]
  [-0.0340298414]]]
mask: [[1 1 1]]
masked_tensor: [[[-0.0340298414 -0.0340298414 -0.0340298414]
  [-0.0340298414 -0.0340298414 -0.0340298414]
  [-0.0340298414 -0.0340298414 -0.0340298414]]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step


array([[-0.03402984, -0.03402984, -0.03402984]], dtype=float32)

In [150]:
model.predict(np.array([[18, 0, 0]]))

inputs: [[[-0.0340298414]
  [0.0233243]
  [0.0233243]]]
mask: [[1 0 0]]
masked_tensor: [[[-0.0340298414 -0 -0]
  [0.0233243 0 0]
  [0.0233243 0 0]]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


array([[0.01261876, 0.        , 0.        ]], dtype=float32)

In [108]:
def Tensor_Mean_Pooling2(keepdims=False):
    def f(x):
        mean = K.mean(x, axis=1, keepdims=keepdims)
        tf.print("TMP2:", x.shape)
        return mean
    return Lambda(f)

def second_order_interactions(inputs, max_uid, max_mid, num_genres, k):
    age, userID, movieID, genres = inputs

    dense_age = Dense(k)(age)  # shape (None, k)
    reshaped_age = Reshape((1,k))(dense_age)  # shape (None, 1, k)

    embedded_uid = Embedding(max_uid+1, k)(userID)

    embedded_mid = Embedding(max_mid+1, k)(movieID)  # shape (None, 1, k)

    embedded_genres = Embedding(num_genres+1, k)(genres)  # shape (None, 3, k)
    genres_mean = Tensor_Mean_Pooling2(keepdims=True)(embedded_genres)  # shape (None, 1, k)

    # concatenate all 2d embedded layers => (None, ?, k)
    embedded_2d = Concatenate(axis=1)([reshaped_age, embedded_uid, embedded_mid, genres_mean])

    # calcuate the interactions by simplication
    # sum of (x1*x2) = 0.5*[sum of (xi)^2 - sum of (xi^2)]
    tensor_sum = Lambda(lambda x: K.sum(x, axis=1))
    tensor_square = Lambda(lambda x: K.square(x))

    sum_of_embedded = tensor_sum(embedded_2d)
    square_of_embedded = tensor_square(embedded_2d)

    square_of_sum = Multiply()([sum_of_embedded, sum_of_embedded])
    sum_of_square = tensor_sum(square_of_embedded)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    y_2nd_order = Reshape((1,))(tensor_sum(sub))
    
    return y_2nd_order, embedded_2d

## Put together

In [97]:
def fm_model(max_uid, max_mid, num_genres, k, dnn_dr):
    inputs = define_input_layers()

    y_1st_order = first_order_interactions(inputs, max_uid, max_mid, num_genres)
    y_2nd_order, embedded_2d = second_order_interactions(inputs, max_uid, max_mid, num_genres, k)

    # combined deep and fm parts
    y = Concatenate()([y_1st_order, y_2nd_order])
    y = Dense(1)(y)

    fm_model_1d = Model(inputs, y_1st_order)
    fm_model_2d = Model(inputs, y_2nd_order)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [98]:
params = {
    'max_uid': ratings["userID"].max(),
    'max_mid': ratings["movieID"].max(),
    'num_genres': 18,
    'k':20,
    'dnn_dr': 0.5
}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [99]:
params

{'max_uid': 6040, 'max_mid': 3952, 'num_genres': 18, 'k': 20, 'dnn_dr': 0.5}

In [100]:
def df2xy(ratings):
    x = [ratings["age"].values, 
         ratings["userID"].values, 
         ratings["movieID"].values, 
         np.concatenate(ratings["genres"].values).reshape(-1,3)]
    y = ratings["rating"].values
    return x,y

train_x, train_y = df2xy(train)
valid_x, valid_y = df2xy(test)

In [101]:
len(valid_x[0])

200042

In [102]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
# train  model
fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm.weights.h5', 
                            monitor='val_loss',
                            save_weights_only=True, 
                            save_best_only=True)
callbacks = [model_ckp,early_stop]
train_history = fm_model.fit(train_x, train_y, 
                                  epochs=30, batch_size=2048, 
                                  validation_data=(valid_x, valid_y),
                                  callbacks = callbacks)

Epoch 1/30
TMP2: TensorShape([None, 1, 20])
TMP: TensorShape([])


2024-05-16 04:02:03.114794: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Input to reshape is a tensor with 2048 values, but the requested shape has 1
	 [[{{function_node __inference_one_step_on_data_29043}}{{node gradient_tape/functional_59_1/embedding_48_1/Reshape_1}}]]


InvalidArgumentError: Graph execution error:

Detected at node gradient_tape/functional_59_1/embedding_48_1/Reshape_1 defined at (most recent call last):
  File "/usr/local/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/local/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/rl/lb7rvxps5tq8t99c2_f297h00000gn/T/ipykernel_83253/1233095245.py", line 12, in <module>

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 314, in fit

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 117, in one_step_on_iterator

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 104, in one_step_on_data

  File "/Users/bluraygun/Documents/tfg/tflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 66, in train_step

Input to reshape is a tensor with 2048 values, but the requested shape has 1
	 [[{{node gradient_tape/functional_59_1/embedding_48_1/Reshape_1}}]] [Op:__inference_one_step_on_iterator_29126]