# Conformal prediction for Recommenders

In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip -o ml-1m.zip

In [28]:
import numpy as np
import pandas as pd
import tensorflow.keras.backend as K
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
users = pd.read_csv('data/ml-1m/users.dat', sep='::', engine='python', names=['userID', 'gender', 'age', 'occupation', 'zipCode'])
movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', engine='python', names=['movieID', 'title', 'genres'], encoding='latin-1')
ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', engine='python', names=['userID', 'movieID', 'rating', 'timestamp'])

In [4]:
print("rows:", users.shape[0])
users.head()

rows: 6040


Unnamed: 0,userID,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
print("rows:", movies.shape[0])
movies.head()

rows: 3883


Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
print("rows:", ratings.shape[0])
ratings.head()

rows: 1000209


Unnamed: 0,userID,movieID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
genres = movies["genres"].apply(lambda x: x.split("|"))
genres = chain.from_iterable(genres)
count_genres = dict(Counter(genres))
print(f"num_genres: {len(count_genres)}\n", dict(sorted(count_genres.items(), key=lambda item: item[1], reverse=True)))

num_genres: 18
 {'Drama': 1603, 'Comedy': 1200, 'Action': 503, 'Thriller': 492, 'Romance': 471, 'Horror': 343, 'Adventure': 283, 'Sci-Fi': 276, "Children's": 251, 'Crime': 211, 'War': 143, 'Documentary': 127, 'Musical': 114, 'Mystery': 106, 'Animation': 105, 'Fantasy': 68, 'Western': 68, 'Film-Noir': 44}


In [8]:
# lower=False?
tokenizer = Tokenizer(split='|', filters='')
tokenizer.fit_on_texts(movies["genres"].values)
tokenizer.word_index

{'drama': 1,
 'comedy': 2,
 'action': 3,
 'thriller': 4,
 'romance': 5,
 'horror': 6,
 'adventure': 7,
 'sci-fi': 8,
 "children's": 9,
 'crime': 10,
 'war': 11,
 'documentary': 12,
 'musical': 13,
 'mystery': 14,
 'animation': 15,
 'fantasy': 16,
 'western': 17,
 'film-noir': 18}

In [26]:
# is this the best way?? why not three-hot?

In [9]:
seqs = tokenizer.texts_to_sequences(movies["genres"].values)
seqs = pad_sequences(seqs, maxlen=3,padding='post').tolist()

In [10]:
idx = 2
print(movies["genres"].iloc[idx])
print(seqs[idx])

Comedy|Romance
[2, 5, 0]


In [11]:
movies["genres"] = seqs

In [12]:
matrix = pd.merge(pd.merge(ratings, users), movies)
matrix

Unnamed: 0,userID,movieID,rating,timestamp,gender,age,occupation,zipCode,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[15, 9, 13]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[13, 5, 0]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),"[1, 0, 0]"
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[15, 9, 2]"
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106,Weekend at Bernie's (1989),"[2, 0, 0]"
1000205,6040,1094,5,956704887,M,25,6,11106,"Crying Game, The (1992)","[1, 5, 11]"
1000206,6040,562,5,956704746,M,25,6,11106,Welcome to the Dollhouse (1995),"[2, 1, 0]"
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),"[1, 0, 0]"


In [16]:
train, test = train_test_split(matrix, test_size=0.2, random_state=7)

In [23]:
def define_input_layers():
    # numerical features
    age = Input((1,), name = 'input_age')

    # single-level categorical features
    userID = Input((1,), name = 'input_uid')
    movieID = Input((1,), name= 'input_mid')

    # multi-level categorical features
    genres = Input((3,), name = 'input_genres')

    return age, userID, movieID, genres

In [30]:
def fm_1st(inputs, n_uid, n_mid, n_genre):
    age, userID, movieID, genres = define_input_layers()

    # all tensors are reshaped to (None, 1)
    num_dense_1d = [Dense(1, name = 'num_dense_1d_fea4')(fea3_input)]
    cat_sl_embed_1d = [Embedding(n_uid + 1, 1, name = 'cat_embed_1d_uid')(uid_input),
                        Embedding(n_mid + 1, 1, name = 'cat_embed_1d_mid')(mid_input)]
    cat_ml_embed_1d = [Embedding(n_genre + 1, 1, mask_zero=True, name = 'cat_embed_1d_genre')(genre_input)]

    cat_sl_embed_1d = [Reshape((1,))(i) for i in cat_sl_embed_1d]
    cat_ml_embed_1d = [Tensor_Mean_Pooling(name = 'embed_1d_mean')(i) for i in cat_ml_embed_1d]

    # add all tensors
    y_fm_1d = Add(name = 'fm_1d_output')(num_dense_1d + cat_sl_embed_1d + cat_ml_embed_1d)

    return y_fm_1d

y_1d = fm_1st(inputs, 10, 10, 10)

SyntaxError: can't use starred expression here (791117344.py, line 2)