In [1]:
from pathlib import Path
# Display the absolute path of the directory the kernel is running in.
Path.cwd()

PosixPath('/home/aubustou/git/recommend')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import tensorflow_recommenders as tfrs
import tensorflow as tf
from tensorflow import keras

2023-03-12 13:11:27.984522: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-12 13:11:28.438871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-12 13:11:28.438910: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
class personalisedSearcher:
    """Free-text movie search whose hits are re-ranked for a given user.

    A query is embedded with the LaBSE encoder, the nearest items are
    retrieved from a ScaNN index built over the embeddings file, and the
    hits the user has not rated yet are scored by the Keras model loaded
    from ``CF``.
    """

    def __init__(self):
        """Load the data files, build the ScaNN index and load both models.

        Reads ``ml-25m/movies.csv``, ``ml-25m/ratings.csv`` and
        ``embeddings/data.csv`` relative to the working directory and prints
        one progress line after each step.
        """
        self.movies = pd.read_csv("ml-25m/movies.csv")
        print("Movies loaded")
        self.ratings = pd.read_csv("ml-25m/ratings.csv")
        print("Ratings loaded")
        # NOTE(review): get_candidate_movies uses ScaNN results as row
        # positions into self.movies, so this presumably has one row per
        # movie in the same order as movies.csv -- TODO confirm.
        self.embeddings = pd.read_csv("embeddings/data.csv", index_col=0)
        print("Embeddings loaded")
        self.item_tensor = tf.convert_to_tensor(self.embeddings, dtype=tf.float32)
        print("Item tensor generated")
        # k (number of neighbours returned per query) is the rounded square
        # root of the number of indexed items.
        self.scann = tfrs.layers.factorized_top_k.ScaNN(num_leaves=1000,
                                                        num_leaves_to_search = 100,
                                                        k = round(np.sqrt(len(self.item_tensor))))
        self.scann.index(self.item_tensor)
        print("Scann initialized")
        self.model = AutoModel.from_pretrained("sentence-transformers/LaBSE")
        print("Model loaded")
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
        print("Tokenizer loaded")
        self.recommender = keras.models.load_model('CF')
        print("CF loaded")

    def get_user_encodings(self):
        """Return ``(userId -> index, index -> userId)`` dictionaries.

        Indices are assigned in order of first appearance of each ``userId``
        in ``self.ratings``.
        """
        user_ids = self.ratings["userId"].unique().tolist()
        user2user_encoded = {x: i for i, x in enumerate(user_ids)}
        userencoded2user = {i: x for i, x in enumerate(user_ids)}

        return user2user_encoded, userencoded2user

    def get_movie_encodings(self):
        """Return ``(movieId -> index, index -> movieId)`` dictionaries.

        Indices are assigned in order of first appearance of each
        ``movieId`` in ``self.ratings``; movies without any rating have no
        index.
        """
        movie_ids = self.ratings["movieId"].unique().tolist()
        movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
        movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

        return movie2movie_encoded, movie_encoded2movie

    def update_ratings(self):
        """Add encoded ``user`` and ``movie`` columns to ``self.ratings``.

        The ratings frame is modified in place and also returned.
        """
        user2user_encoded, _ = self.get_user_encodings()
        movie2movie_encoded, _ = self.get_movie_encodings()
        self.ratings["user"] = self.ratings["userId"].map(user2user_encoded)
        self.ratings["movie"] = self.ratings["movieId"].map(movie2movie_encoded)

        return self.ratings

    def get_user_history(self, user_id):
        """Return the rating rows of ``user_id``, including encoded columns.

        Calls ``update_ratings`` first, so ``self.ratings`` gains the
        ``user`` and ``movie`` columns as a side effect.
        """
        df = self.update_ratings()
        watched_movies = df[df.userId == user_id]

        # The selection used to be computed and then dropped, so callers
        # always received None.
        return watched_movies

    def get_candidate_movies(self, query):
        """Return the movie rows closest to ``query`` in embedding space.

        The query is tokenised (padded, truncated to 64 tokens), encoded
        with the model under ``torch.no_grad()``, and the normalised pooler
        output is passed to the ScaNN index.

        Returns:
            pandas.DataFrame: at most 11 rows of ``self.movies``, in the
            order produced by the ScaNN lookup.
        """
        encoded_input = self.tokenizer(query,
                                  padding=True,
                                  truncation=True,
                                  max_length=64,
                                  return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        query_embeddings = model_output.pooler_output
        query_embeddings = torch.nn.functional.normalize(query_embeddings)
        scann_result = self.scann(np.array(query_embeddings))
        # Element 1 of the ScaNN result is used as row positions into
        # self.movies; only the first query of the batch is looked at.
        neighbour_rows = scann_result[1].numpy()[0]
        return self.movies.iloc[neighbour_rows][0:11]

    def filter_candidates(self, user_id, query):
        """Build the (user, movie) pairs the CF model has to score.

        Candidates for ``query`` that the user already rated, or that have
        no movie encoding (never rated by anyone), are removed. The
        remaining candidates keep the order returned by
        ``get_candidate_movies``.

        Returns:
            tuple: ``(movie_array, movies_not_watched,
            movies_watched_by_user)`` where ``movie_array`` is an ``(n, 2)``
            integer array of ``[encoded_user, encoded_movie]`` rows (``n``
            may be 0), ``movies_not_watched`` is the matching list of
            one-element ``[encoded_movie]`` lists, and
            ``movies_watched_by_user`` is the user's slice of the ratings.

        Raises:
            ValueError: if ``user_id`` does not occur in the ratings and
                therefore has no encoding to feed to the model.
        """
        user2user_encoded, _ = self.get_user_encodings()
        user_encoder = user2user_encoded.get(user_id)
        if user_encoder is None:
            # Fail early with a clear message instead of passing None
            # through to the model input.
            raise ValueError(
                "Unknown user_id {!r}: no ratings found for this user".format(user_id)
            )

        movies_watched_by_user = self.ratings[self.ratings.userId == user_id]
        candidates = self.get_candidate_movies(query)
        unseen_ids = candidates[
            ~candidates["movieId"].isin(movies_watched_by_user.movieId.values)
        ]["movieId"]

        movie2movie_encoded, _ = self.get_movie_encodings()
        # dict.fromkeys de-duplicates while keeping the retrieval order.
        movies_not_watched = [
            [movie2movie_encoded[movie_id]]
            for movie_id in dict.fromkeys(unseen_ids)
            if movie_id in movie2movie_encoded
        ]

        # reshape keeps the array two-dimensional even when nothing is left.
        movie_array = np.array(
            [[user_encoder, encoded[0]] for encoded in movies_not_watched],
            dtype=int,
        ).reshape(-1, 2)

        return movie_array, movies_not_watched, movies_watched_by_user

    def personalised_search(self, user_id, query):
        """Return up to ten unseen movie ids for ``query``, best score first.

        Returns:
            tuple: ``(recommended_movie_ids, movies_watched_by_user)``. The
            id list is empty when every candidate was filtered out.

        Raises:
            ValueError: propagated from ``filter_candidates`` for an unknown
                ``user_id``.
        """
        movie_array, movies_not_watched, movies_watched_by_user = self.filter_candidates(user_id, query)
        if len(movies_not_watched) == 0:
            # Nothing to score; skip the model call on an empty batch.
            return [], movies_watched_by_user

        scored_items = self.recommender.predict(movie_array).flatten()
        top_rated = scored_items.argsort()[-10:][::-1]
        _, movie_encoded2movie = self.get_movie_encodings()
        recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_rated]

        return recommended_movie_ids, movies_watched_by_user

    def _movies_in_order(self, movie_ids):
        """Return the rows of ``self.movies`` for ``movie_ids``, in that order."""
        rank = {movie_id: position for position, movie_id in enumerate(movie_ids)}
        if not rank:
            return self.movies.iloc[0:0]
        rows = self.movies[self.movies["movieId"].isin(list(rank))]
        # A boolean mask alone would keep the file order of self.movies, so
        # reorder the selected rows by their position in movie_ids.
        order = rows["movieId"].map(rank).to_numpy().argsort(kind="stable")
        return rows.iloc[order]

    def print_recs(self, user_id, query):
        """Print the user's five top-rated movies and the recommendations.

        Both lists are printed best first: the user's movies by descending
        rating, the recommendations by descending model score.
        """
        recommendations, movies_watched_by_user = self.personalised_search(user_id, query)

        print("Showing recommendations for user: {}".format(user_id))
        print("====" * 9)
        print("Movies with high ratings from user")
        print("----" * 8)
        top_movies_user = (
            movies_watched_by_user.sort_values(by="rating", ascending=False)
            .head(5)
            .movieId.values
        )
        for row in self._movies_in_order(top_movies_user).itertuples():
            print(row.title, ":", row.genres)
        print("----" * 8)
        print("Top movie recommendations")
        print("----" * 8)
        for row in self._movies_in_order(recommendations).itertuples():
            print(row.title, ":", row.genres)

In [4]:
# Create the searcher. Construction takes a while: __init__ reads the movies,
# ratings and embeddings CSV files into memory, builds the ScaNN index and
# loads the LaBSE encoder and the CF model.
personalisedRecommender = personalisedSearcher()

Movies loaded
Ratings loaded
Embeddings loaded


2023-03-12 13:11:36.012450: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-12 13:11:36.027888: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-12 13:11:36.028055: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-12 13:11:36.028656: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Item tensor generated


2023-03-12 13:11:39.512485: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 62423
2023-03-12 13:11:42.046129: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 2.53359462s.


Scann initialized
Model loaded
Tokenizer loaded
CF loaded


In [14]:
# Print up to five of user 4232's highest-rated movies, followed by the
# recommendations for the free-text query "toto".
personalisedRecommender.print_recs(4232, "toto")

Showing recommendations for user: 4232
Movies with high ratings from user
--------------------------------
Dead Man Walking (1995) : Crime|Drama
Usual Suspects, The (1995) : Crime|Mystery|Thriller
Shawshank Redemption, The (1994) : Crime|Drama
Fargo (1996) : Comedy|Crime|Drama|Thriller
Insider, The (1999) : Drama|Thriller
--------------------------------
Top movie recommendations
--------------------------------
Tut (2015) : (no genres listed)
Fant : (no genres listed)
Thithi (2015) : (no genres listed)
Tibetana : (no genres listed)
Оно (1990) : (no genres listed)
The OA : (no genres listed)
Zero : (no genres listed)
Since : (no genres listed)
The Body : (no genres listed)
Twice : (no genres listed)
