In [1]:
!pip install -q tensorflow-recommenders

[0m

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque
#from surprise import Dataset, NormalPredictor, Reader
#from surprise.model_selection import cross_validate
import os
from sklearn.model_selection import train_test_split
import pickle
#from surprise import accuracy
#from surprise import accuracy, Dataset, Reader, SVD, KNNBaseline
#from surprise.model_selection import PredefinedKFold
import pprint
import tempfile
from typing import Dict, Text
import tensorflow as tf
import tensorflow_recommenders as tfrs
from collections import defaultdict
import joblib

In [21]:
# Load the train/test halves of hit-rate fold 0. The CSVs carry no header row,
# so pandas assigns the positional column labels 0, 1, 2.
train_hr0, test_hr0 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        'rmse/hit_rate_folds_actual/train_hr0.csv',
        'rmse/hit_rate_folds_actual/test_hr0.csv',
    )
)

In [23]:
# Preview the first five rows of the raw fold-0 training frame.
train_hr0.head(n=5)

Unnamed: 0,0,1,2
0,826574,9729,4.0
1,2200645,1719,4.0
2,2150434,5356,5.0
3,2142065,8753,3.0
4,867086,6721,4.0


In [24]:
# Give both fold-0 frames the same semantic column labels
# (they were loaded with positional labels 0, 1, 2).
train_hr0.columns = test_hr0.columns = ['user_id', 'movie_id', 'rating']

In [25]:
# Distinct ids seen in the fold-0 training frame; DeepRecRmseModel uses these
# lists as the vocabularies of its IntegerLookup layers.
train_movie_ids = [movie for movie in train_hr0['movie_id'].unique()]
train_user_ids = [user for user in train_hr0['user_id'].unique()]

In [26]:
class DeepRecRmseModel(tf.keras.Model):
    """Rating-regression network over learned user and movie embeddings.

    Each raw id goes through an ``IntegerLookup`` followed by an ``Embedding``;
    the two resulting vectors are concatenated and fed to a dense stack
    (32 -> 16 -> 1) that emits one value per (user, movie) pair.

    Args:
        user_ids: Vocabulary of known user ids. When omitted, the
            notebook-level ``train_user_ids`` list is used.
        movie_ids: Vocabulary of known movie ids. When omitted, the
            notebook-level ``train_movie_ids`` list is used.
        embedding_dimension: Width of both embedding tables (default 16).
    """

    def __init__(self, user_ids=None, movie_ids=None, embedding_dimension=16):
        super().__init__()
        # Resolve the defaults at construction time so existing zero-argument
        # callers keep using the notebook globals, while per-fold callers can
        # pass the vocabulary that matches their own training data.
        if user_ids is None:
            user_ids = train_user_ids
        if movie_ids is None:
            movie_ids = train_movie_ids

        # The "+ 1" reserves one embedding row for the lookup's
        # out-of-vocabulary index (mask_token=None, so there is no mask slot).
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.IntegerLookup(vocabulary=user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(user_ids) + 1, embedding_dimension)
        ])

        self.movie_embeddings = tf.keras.Sequential([
            tf.keras.layers.IntegerLookup(vocabulary=movie_ids, mask_token=None),
            tf.keras.layers.Embedding(len(movie_ids) + 1, embedding_dimension)
        ])

        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Score a batch of (user, movie) pairs.

        Args:
            inputs: A ``(user_id, movie_id)`` pair of id batches.

        Returns:
            The output of the dense stack applied to the concatenated
            user and movie embeddings.
        """
        user_id, movie_id = inputs
        user_embedding = self.user_embeddings(user_id)
        movie_embedding = self.movie_embeddings(movie_id)
        # Concatenate along the feature axis: one joint vector per pair.
        return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))
    
    

In [27]:
class NetflixRmseModel(tfrs.models.Model):
    """TFRS wrapper that trains ``DeepRecRmseModel`` as a ranking task.

    The task uses mean squared error as the loss and tracks root mean squared
    error as a metric.
    """

    def __init__(self):
        super().__init__()
        self.rmse_model: tf.keras.Model = DeepRecRmseModel()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()])

    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        """Predict ratings from a feature dict holding 'user_id' and 'movie_id'."""
        return self.rmse_model((features['user_id'], features['movie_id']))

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the ranking-task loss for one batch.

        Args:
            features: Batch dict with 'user_id', 'movie_id' and 'rating'.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The loss produced by the ranking task.
        """
        # Read the label without popping it: mutating the caller's dict would
        # make a second call with the same batch fail with KeyError('rating').
        labels = features['rating']
        model_inputs = {name: value for name, value in features.items()
                        if name != 'rating'}
        predicted_ratings = self(model_inputs)
        return self.task(labels=labels, predictions=predicted_ratings)

In [8]:
#deep_baseline_model = NetflixRmseModel()
#deep_baseline_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [10]:
#train_hr0_tf = tf.data.Dataset.from_tensor_slices(dict(train_hr0))
#test_hr0_tf = tf.data.Dataset.from_tensor_slices(dict(test_hr0))

In [11]:
# The training pipeline is disabled in this notebook state. Its mapping is
# kept below for reference, now commented out in full: previously only the
# continuation lines were commented, which left the cell syntactically invalid.
#train_hr0_tf = train_hr0_tf.map(lambda x: {'user_id': x['user_id'],
  #  'movie_id': x['movie_id'],
  #  'rating': x['rating']
#})

# Build the test pipeline straight from the DataFrame so this cell does not
# depend on the commented-out dataset-construction cell above.
test_hr0_tf = tf.data.Dataset.from_tensor_slices(dict(test_hr0))
test_hr0_tf = test_hr0_tf.map(lambda x: {'user_id': x['user_id'],
    'movie_id': x['movie_id'],
    'rating': x['rating']
})

In [12]:
# Group the test examples into batches of 8192 and cache the batched dataset.
# (The equivalent step for the training set is currently disabled.)
#train_hr0_tf = train_hr0_tf.batch(8192).cache()
test_hr0_tf = test_hr0_tf.batch(batch_size=8192)
test_hr0_tf = test_hr0_tf.cache()

In [13]:
#history_baseline = deep_baseline_model.fit(train_hr0_tf, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
#deep_baseline_model2 = NetflixRmseModel()
#deep_baseline_model2.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [15]:
#history_baseline2 = deep_baseline_model2.fit(test_hr0_tf, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
#predictions_1a = defaultdict(list)

#for i, row in test_hr0.iterrows():
    #user = row['user_id']
   # movie = row['movie_id']
   # predicted_rating = deep_baseline_model({'user_id': np.array([user]),
                                           #'movie_id': np.array([movie])})
   # predictions_1a[user].append((movie, predicted_rating[0][0]))

In [None]:
# Score every (user, movie) pair of the fold-0 training frame with the model
# trained on the test half, collecting (movie, predicted_rating) per user.
#
# NOTE(review): `deep_baseline_model2` is only created in a cell that is
# currently commented out; re-enable it before running this cell.
#
# Inference runs in batches: one forward pass per row (as with iterrows) is
# orders of magnitude slower and also upcasts the integer ids to float.
predictions_1b = defaultdict(list)

user_column = train_hr0['user_id'].to_numpy()
movie_column = train_hr0['movie_id'].to_numpy()

for start in range(0, len(train_hr0), 8192):
    batch_users = user_column[start:start + 8192]
    batch_movies = movie_column[start:start + 8192]
    batch_ratings = deep_baseline_model2({'user_id': batch_users,
                                          'movie_id': batch_movies})
    # The model emits one value per pair; flatten to a 1-D list of floats.
    flat_ratings = np.asarray(batch_ratings).reshape(-1).tolist()
    for user, movie, predicted_rating in zip(batch_users.tolist(),
                                             batch_movies.tolist(),
                                             flat_ratings):
        predictions_1b[user].append((movie, predicted_rating))

In [None]:
# For every user keep only the 100 (movie, prediction) pairs with the highest
# predicted rating, ordered from best to worst.
for user_id in predictions_1a:
    ranked = sorted(predictions_1a[user_id], key=lambda rec: rec[1], reverse=True)
    predictions_1a[user_id] = ranked[:100]

In [None]:
# For every user keep only the 100 (movie, prediction) pairs with the highest
# predicted rating, ordered from best to worst.
for user_id in predictions_1b:
    ranked = sorted(predictions_1b[user_id], key=lambda rec: rec[1], reverse=True)
    predictions_1b[user_id] = ranked[:100]

In [29]:
# Load the per-user "top rating" table (user_id, movie_id, rating) and preview
# its first five rows.
top_ratings_df = pd.read_csv(filepath_or_buffer='rmse/top_ratings_df.csv')
top_ratings_df.head(n=5)

Unnamed: 0,user_id,movie_id,rating
0,841872,14454,5.0
1,689085,299,5.0
2,2529854,11165,5.0
3,2597445,10886,4.0
4,2336656,3282,5.0


In [None]:
# Hit rate for model 1a: the share of users whose held-out top movie is scored
# above the prediction sitting at the 25% position of their ranked list.
#
# NOTE(review): `predictions_1a` and `deep_baseline_model` are only created in
# cells that are currently commented out; re-enable them before running this.

# One scalar top movie per user (first row wins if a user appears more than
# once). A plain dict avoids re-filtering the whole frame for every user.
first_top_rows = top_ratings_df.drop_duplicates(subset='user_id')
top_movie_by_user = dict(zip(first_top_rows['user_id'].tolist(),
                             first_top_rows['movie_id'].tolist()))

num_users = 0
hits = 0

for user_id, user_recs in predictions_1a.items():
    top_movie = top_movie_by_user.get(user_id)
    # Users without a top movie (or without any prediction) cannot be
    # evaluated, so they are left out of both numerator and denominator.
    if top_movie is None or not user_recs:
        continue
    num_users += 1
    num_recs = len(user_recs)
    top_number = int(num_recs*0.25)
    boundary_for_user = user_recs[top_number][1]
    # Both inputs must be 1-D id batches of length one; passing a pandas
    # Series here would add a dimension and break the embedding concat.
    pred_top_movie = deep_baseline_model({'user_id': np.array([user_id], dtype=np.int64),
                                          'movie_id': np.array([top_movie], dtype=np.int64)})
    if pred_top_movie[0][0] > boundary_for_user:
        hits += 1

# Guard against an empty evaluation set instead of raising ZeroDivisionError.
hit_rate_1a = hits / num_users if num_users else float('nan')
print(hit_rate_1a)

In [None]:
# Hit rate for model 1b: the share of users whose held-out top movie is scored
# above the prediction sitting at the 25% position of their ranked list.
#
# NOTE(review): `deep_baseline_model2` is only created in a cell that is
# currently commented out; re-enable it before running this cell.

# One scalar top movie per user (first row wins if a user appears more than
# once). A plain dict avoids re-filtering the whole frame for every user.
first_top_rows = top_ratings_df.drop_duplicates(subset='user_id')
top_movie_by_user = dict(zip(first_top_rows['user_id'].tolist(),
                             first_top_rows['movie_id'].tolist()))

num_users = 0
hits = 0

for user_id, user_recs in predictions_1b.items():
    top_movie = top_movie_by_user.get(user_id)
    # Users without a top movie (or without any prediction) cannot be
    # evaluated, so they are left out of both numerator and denominator.
    if top_movie is None or not user_recs:
        continue
    num_users += 1
    num_recs = len(user_recs)
    top_number = int(num_recs*0.25)
    boundary_for_user = user_recs[top_number][1]
    # Both inputs must be 1-D id batches of length one; passing a pandas
    # Series here would add a dimension and break the embedding concat.
    pred_top_movie = deep_baseline_model2({'user_id': np.array([user_id], dtype=np.int64),
                                           'movie_id': np.array([top_movie], dtype=np.int64)})
    if pred_top_movie[0][0] > boundary_for_user:
        hits += 1

# Guard against an empty evaluation set instead of raising ZeroDivisionError.
hit_rate_1b = hits / num_users if num_users else float('nan')
print(hit_rate_1b)

In [32]:
PATH = 'rmse/hit_rate_folds_actual/'

hit_rates_nn = []


def _to_batched_dataset(ratings_df, batch_size=8192):
    """Turn a (user_id, movie_id, rating) frame into a batched, cached tf.data.Dataset."""
    dataset = tf.data.Dataset.from_tensor_slices(dict(ratings_df))
    dataset = dataset.map(lambda x: {'user_id': x['user_id'],
                                     'movie_id': x['movie_id'],
                                     'rating': x['rating']})
    return dataset.batch(batch_size).cache()


def _predict_ratings(model, ratings_df, batch_size=8192):
    """Return the model's prediction for every row of ``ratings_df`` as a 1-D array.

    Rows are scored in chunks of ``batch_size``; calling the model once per row
    is far too slow for frames of this size.
    """
    users = ratings_df['user_id'].to_numpy()
    movies = ratings_df['movie_id'].to_numpy()
    chunks = []
    for start in range(0, len(ratings_df), batch_size):
        stop = start + batch_size
        batch_pred = model({'user_id': users[start:stop],
                            'movie_id': movies[start:stop]})
        # One output per pair; flatten so chunks concatenate into a vector.
        chunks.append(np.asarray(batch_pred).reshape(-1))
    if not chunks:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(chunks)


def _top_predictions(model, ratings_df, top_k=100):
    """Map each user to their ``top_k`` best-scored (movie, prediction) pairs, best first."""
    scored = ratings_df[['user_id', 'movie_id']].assign(
        predicted=_predict_ratings(model, ratings_df))
    best = (scored.sort_values('predicted', ascending=False, kind='mergesort')
                  .groupby('user_id', sort=False)
                  .head(top_k))
    predictions = defaultdict(list)
    for user, movie, predicted in zip(best['user_id'].tolist(),
                                      best['movie_id'].tolist(),
                                      best['predicted'].tolist()):
        predictions[user].append((movie, predicted))
    return predictions


def _hit_rate(model, predictions, top_movie_by_user, top_fraction=0.25):
    """Share of users whose top movie is scored above their ``top_fraction`` boundary.

    Users with no entry in ``top_movie_by_user`` are skipped. Returns NaN when
    no user can be evaluated.
    """
    eligible = [
        (user_id, top_movie_by_user[user_id], recs[int(len(recs) * top_fraction)][1])
        for user_id, recs in predictions.items()
        if recs and user_id in top_movie_by_user
    ]
    if not eligible:
        return float('nan')
    eligible_df = pd.DataFrame(eligible, columns=['user_id', 'movie_id', 'boundary'])
    predicted_top = _predict_ratings(model, eligible_df)
    return float(np.mean(predicted_top > eligible_df['boundary'].to_numpy()))


# One scalar top movie per user (first row wins if a user appears more than once).
first_top_rows = top_ratings_df.drop_duplicates(subset='user_id')
top_movie_by_user = dict(zip(first_top_rows['user_id'].tolist(),
                             first_top_rows['movie_id'].tolist()))

for i in range(5):
    train_file1 = PATH + f'train_hr{i}.csv'
    test_file1 = PATH + f'test_hr{i}.csv'
    df_train1 = pd.read_csv(train_file1, header=None)
    df_test1 = pd.read_csv(test_file1, header=None)
    df_train1.columns = ['user_id', 'movie_id', 'rating']
    df_test1.columns = ['user_id', 'movie_id', 'rating']

    # NOTE(review): NetflixRmseModel() builds its id vocabularies from the
    # notebook-level train_user_ids / train_movie_ids (fold 0), not from this
    # fold's frames - confirm that is intended for folds 1-4 and for model "b".
    deep_model1 = NetflixRmseModel()
    deep_model1.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

    deep_model2 = NetflixRmseModel()
    deep_model2.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

    df_train_tf1 = _to_batched_dataset(df_train1)
    df_test_tf1 = _to_batched_dataset(df_test1)

    # Two-way swap: model "a" learns from the train half, model "b" from the
    # test half; each is then evaluated on the half it has not seen.
    print(f'Training fold {i}a')
    history_baseline1 = deep_model1.fit(df_train_tf1, epochs=10)

    print(f'Training fold {i}b')
    history_baseline2 = deep_model2.fit(df_test_tf1, epochs=10)

    print('Iterating through first dataframe')
    predictions_1a = _top_predictions(deep_model1, df_test1)

    print('Iterating through second dataframe')
    # Ranked and truncated the same way as predictions_1a, so the boundary
    # below is taken from a sorted list for both models.
    predictions_1b = _top_predictions(deep_model2, df_train1)

    hit_rates_nn.append(_hit_rate(deep_model1, predictions_1a, top_movie_by_user))
    hit_rates_nn.append(_hit_rate(deep_model2, predictions_1b, top_movie_by_user))
    
    

Training fold 0a
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training fold 0b
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Iterating through first dataframe


KeyboardInterrupt: 

In [34]:
def _select_rating_features(x):
    """Keep exactly the three features the rating model consumes."""
    return {'user_id': x['user_id'],
    'movie_id': x['movie_id'],
    'rating': x['rating']}


# For each of the five folds, train one model on the train half ("a") and one
# on the test half ("b"), writing each fitted model to disk right after its fit.
# NOTE(review): the models are persisted by pickling through joblib - confirm
# the resulting .pkl files reload correctly in the consuming notebook.
for i in range(5):
    train_file1 = PATH + f'train_hr{i}.csv'
    test_file1 = PATH + f'test_hr{i}.csv'
    df_train1 = pd.read_csv(train_file1, header=None)
    df_test1 = pd.read_csv(test_file1, header=None)
    df_train1.columns = df_test1.columns = ['user_id', 'movie_id', 'rating']

    deep_model1, deep_model2 = NetflixRmseModel(), NetflixRmseModel()
    deep_model1.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
    deep_model2.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

    # DataFrame -> per-row feature dicts -> batches of 8192, cached.
    df_train_tf1 = (tf.data.Dataset.from_tensor_slices(dict(df_train1))
                    .map(_select_rating_features)
                    .batch(8192)
                    .cache())
    df_test_tf1 = (tf.data.Dataset.from_tensor_slices(dict(df_test1))
                   .map(_select_rating_features)
                   .batch(8192)
                   .cache())

    history_baseline1 = deep_model1.fit(df_train_tf1, epochs=10)
    joblib.dump(deep_model1, f'model{i}a.pkl')

    history_baseline2 = deep_model2.fit(df_test_tf1, epochs=10)
    joblib.dump(deep_model2, f'model{i}b.pkl')
    
 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://64cafc07-d2a4-4730-8afd-760a7e532178/assets


INFO:tensorflow:Assets written to: ram://64cafc07-d2a4-4730-8afd-760a7e532178/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://51a3ecac-d4e7-4839-9469-3cccf2018f04/assets


INFO:tensorflow:Assets written to: ram://51a3ecac-d4e7-4839-9469-3cccf2018f04/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://7d61a8da-a866-4033-943d-1a2a9e813bba/assets


INFO:tensorflow:Assets written to: ram://7d61a8da-a866-4033-943d-1a2a9e813bba/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://7e11df5b-152e-4959-b64c-ea82fd1a9f8f/assets


INFO:tensorflow:Assets written to: ram://7e11df5b-152e-4959-b64c-ea82fd1a9f8f/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://980855ca-068f-4ea4-bca4-ce51c671c555/assets


INFO:tensorflow:Assets written to: ram://980855ca-068f-4ea4-bca4-ce51c671c555/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://0620fb4c-50a2-41c9-8a33-6afa1f680a33/assets


INFO:tensorflow:Assets written to: ram://0620fb4c-50a2-41c9-8a33-6afa1f680a33/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://82ec3ea8-014e-4f74-9fbe-e7cc4f43b749/assets


INFO:tensorflow:Assets written to: ram://82ec3ea8-014e-4f74-9fbe-e7cc4f43b749/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://f8c0262e-ad70-49aa-98fb-80bf51451ec8/assets


INFO:tensorflow:Assets written to: ram://f8c0262e-ad70-49aa-98fb-80bf51451ec8/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://6539461c-7eb0-477d-b35a-2621fc75a129/assets


INFO:tensorflow:Assets written to: ram://6539461c-7eb0-477d-b35a-2621fc75a129/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ram://77e9fef5-8ff4-412a-a0ee-035534dfe2be/assets


INFO:tensorflow:Assets written to: ram://77e9fef5-8ff4-412a-a0ee-035534dfe2be/assets
