In [None]:
!pip install tensorflow_recommenders scann

Collecting tensorflow_recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m860.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting scann
  Downloading scann-1.3.1-cp310-cp310-manylinux_2_27_x86_64.whl (10.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow>=2.9.0 (from tensorflow_recommenders)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow>=2.9.0->tensorflow_recommenders)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m51.3 MB/s[0m eta [

# Loading the Data

In [None]:
# Mount Google Drive inside the Colab VM so the project folder can be read.
from google.colab import drive
drive.mount('/content/drive')

# Copy the project folder to local disk, then unpack the ratings dataset
# (data.zip) and the two archives that together make up the LaBSE checkpoint.
!cp /content/drive/MyDrive/Ahana /content/ -r
!unzip /content/Ahana/data.zip -d /content/ml-25m/
!unzip /content/Ahana/1.zip -d /content/Labse/
!unzip /content/Ahana/2.zip -d /content/Labse/

Mounted at /content/drive
Archive:  /content/Ahana/data.zip
   creating: /content/ml-25m/ml-25m/
   creating: /content/ml-25m/ml-25m/ml-25m/
  inflating: /content/ml-25m/ml-25m/ml-25m/genome-scores.csv  
  inflating: /content/ml-25m/ml-25m/ml-25m/genome-tags.csv  
  inflating: /content/ml-25m/ml-25m/ml-25m/links.csv  
  inflating: /content/ml-25m/ml-25m/ml-25m/movies.csv  
  inflating: /content/ml-25m/ml-25m/ml-25m/ratings.csv  
  inflating: /content/ml-25m/ml-25m/ml-25m/README.txt  
  inflating: /content/ml-25m/ml-25m/ml-25m/tags.csv  
Archive:  /content/Ahana/1.zip
  inflating: /content/Labse/model.safetensors  
  inflating: /content/Labse/pytorch_model.bin  
  inflating: /content/Labse/tf_model.h5  
Archive:  /content/Ahana/2.zip
  inflating: /content/Labse/.gitattributes  
   creating: /content/Labse/1_Pooling/
  inflating: /content/Labse/1_Pooling/config.json  
   creating: /content/Labse/2_Dense/
  inflating: /content/Labse/2_Dense/config.json  
  inflating: /content/Labse/2_Dens

# Necessary Imports

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re
import numpy as np


# Creating Embeddings using LaBSE

In [None]:
# Load the tokenizer and the encoder from the locally unpacked checkpoint;
# each "done" marks one of the two loads finishing.
tokenizer = AutoTokenizer.from_pretrained("/content/Labse/")
print("done")
model = AutoModel.from_pretrained("/content/Labse/")
print("done")

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
model = model.to(device)


# Movie catalogue; the cells below read its `title` and `genres` columns.
data = pd.read_csv("/content/ml-25m/ml-25m/ml-25m/movies.csv")

def remove_pars(x):
    """Return ``str(x)`` with every '(' and ')' character deleted.

    Args:
        x: Any value; it is converted with ``str`` before cleaning.

    Returns:
        The string form of ``x`` without parentheses.
    """
    text = str(x)
    # A translation table that maps both parenthesis characters to "deleted".
    drop_parentheses = str.maketrans("", "", "()")
    return text.translate(drop_parentheses)

def remove_pipes(x):
    """Return ``str(x)`` with every '|' replaced by a single space.

    Args:
        x: Any value; it is converted with ``str`` before cleaning.

    Returns:
        The string form of ``x`` with pipe separators turned into spaces.
    """
    # A plain literal replacement: no regex is needed, which also avoids the
    # invalid "\|" escape sequence that a non-raw pattern string would contain.
    return str(x).replace("|", " ")

def remove_nulls(a, b, i):
    """Join ``a[i]`` and ``b[i]`` with a space and drop the "no genres" marker.

    Args:
        a: Indexable collection of strings (used for the first half).
        b: Indexable collection of strings (used for the second half).
        i: Position to read from both collections.

    Returns:
        ``a[i] + " " + b[i]`` with every literal "(no genres listed)" removed.
        The separating space is kept, so a row whose second half was only the
        marker ends with a trailing space.
    """
    string_m = a[i] + " " + b[i]
    # Literal replacement instead of a regex: the marker contains parentheses,
    # and escaping them in a non-raw pattern string is an invalid escape sequence.
    return string_m.replace("(no genres listed)", "")

# Clean the two text columns and build one "<title> <genres>" string per movie.
titles = [remove_pars(title) for title in data['title']]
genres = [remove_pipes(genre) for genre in data['genres']]

input_string = [remove_nulls(titles, genres, i) for i in range(len(genres))]

# Encode one string at a time and move each vector to host memory right away,
# so nothing accumulates on the device and no second extraction pass is needed.
embeddings_list_tensors = []
for row_number, text in enumerate(input_string):
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Normalise the pooled output so each stored vector has unit L2 norm.
    sentence_embedding = torch.nn.functional.normalize(model_output.pooler_output)
    # The batch holds a single string, so row 0 is the whole result.
    embeddings_list_tensors.append(sentence_embedding.cpu()[0].numpy())
    # Progress marker every 10,000 rows.
    if row_number % 10000 == 0:
        print(str(row_number))

# One row per movie, in the same order as `data`; persisted for later reuse.
embeddings = pd.DataFrame(np.vstack(embeddings_list_tensors))
embeddings.to_csv("/content/data.csv")

done
done
0
10000
20000
30000
40000
50000
60000


# Testing ScaNN

In [None]:
import tensorflow_recommenders as tfrs
import tensorflow as tf
# Index the movie embeddings with ScaNN; `k` (results per query) is set to the
# rounded square root of the number of indexed rows.
item_tensor = tf.convert_to_tensor(embeddings, dtype=tf.float32)
scann = tfrs.layers.factorized_top_k.ScaNN(
    num_leaves=1000,
    num_leaves_to_search=100,
    k=round(np.sqrt(len(item_tensor))),
)
scann.index(item_tensor)
print("ScANN is Ready!")

# Embed a free-text query with the same tokenizer settings, pooling and
# normalisation that were used for the movie strings.
test = "Horror films with zombies"
encoded_input = tokenizer(
    test, padding=True, truncation=True, max_length=64, return_tensors='pt'
).to(device)
with torch.no_grad():
    model_output = model(**encoded_input)
query = torch.nn.functional.normalize(model_output.pooler_output)

# Element 1 of the lookup result is used as row positions into `data`;
# show the first ten matches for the single query.
test_case = scann(np.array(query.cpu()))
data.iloc[test_case[1].numpy()[0]][0:10]

ScANN is Ready!


Unnamed: 0,movieId,title,genres
11068,47980,Bio Zombie (Sun faa sau si) (1998),Comedy|Horror
13822,71535,Zombieland (2009),Action|Comedy|Horror
23643,118810,Zombie Women of Satan (2009),Comedy|Horror
23642,118808,Zombie Reanimation (2009),Action|Comedy|Horror
42985,165039,The Zombie Diaries (2006),Action|Horror|Thriller
55180,191327,Teenage Zombies (1960),Horror|Sci-Fi
46049,171651,Redneck Zombies (1989),Horror
41540,161912,Zombie Night (2003),Comedy|Horror|Sci-Fi
45144,169720,Zombies! Zombies! Zombies! (2008),Comedy|Horror
14427,75404,ZMD: Zombies of Mass Destruction (2009),Comedy|Horror


# Create Class RecommenderNet for NCF

In [None]:
EMBEDDING_SIZE = 128
from tensorflow import keras


@keras.utils.register_keras_serializable(package="MyLayers")
class RecommenderNet(keras.Model):
    """Embedding-based collaborative-filtering model.

    Each user index and each movie index is looked up in its own embedding
    table plus a one-dimensional bias table. The score for a pair is
    ``sigmoid(dot(user_vector, movie_vector) + user_bias + movie_bias)``.

    Args:
        num_users: Number of rows in the user embedding and bias tables.
        num_movies: Number of rows in the movie embedding and bias tables.
        embedding_size: Width of the user and movie embedding vectors.
        **kwargs: Forwarded unchanged to ``keras.Model``.
    """

    def __init__(self, num_users=200000, num_movies=200000,
                 embedding_size=EMBEDDING_SIZE, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        # `keras.layers` is used directly so the class does not depend on a
        # `layers` name that is only imported by a later cell.
        self.user_embedding = keras.layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = keras.layers.Embedding(num_users, 1)
        self.movie_embedding = keras.layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = keras.layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Score a batch of (user index, movie index) pairs.

        Args:
            inputs: 2-D integer tensor; column 0 is looked up in the user
                tables and column 1 in the movie tables.

        Returns:
            A ``(batch, 1)`` tensor of scores in the open interval (0, 1).
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        # Per-row dot product. `tf.tensordot(..., 2)` would contract BOTH axes
        # of the two (batch, dim) tensors and yield one scalar shared by every
        # row, making each prediction depend on the rest of the batch.
        dot_user_movie = tf.reduce_sum(
            user_vector * movie_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt."""
        config = super().get_config()
        config.update(
            {
                "num_users": self.num_users,
                "num_movies": self.num_movies,
                "embedding_size": self.embedding_size,
            }
        )
        return config

# Collaborative Filtering Training

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

# read in the ratings data
df = pd.read_csv("/content/ml-25m/ml-25m/ml-25m/ratings.csv")

# create unique lists of users and movies
user_ids = df["userId"].unique().tolist()
movie_ids = df["movieId"].unique().tolist()
min_rating=0
max_rating=5
# create mappings between the raw IDs and dense 0..N-1 indices (both directions)
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)

# shuffle the data (fixed seed) and rescale the ratings from
# [min_rating, max_rating] to [0, 1]
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
# Vectorised arithmetic on the whole column; same values as a per-row lambda
# but without one Python call per rating.
y = ((df["rating"] - min_rating) / (max_rating - min_rating)).values

# split into train/validation: first 90% of the shuffled rows vs. the rest.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

# instantiate the model
# NOTE: this rebinds `model`, which the embedding cells above use for the text
# encoder; re-run the encoder-loading cell before embedding new text.
model = RecommenderNet()

# compile the model
# The targets are continuous values in [0, 1], so error metrics are reported.
# Classification metrics (accuracy / precision / recall) treat the target as a
# binary label and are not meaningful here. MAE and RMSE are on the rescaled
# [0, 1] rating scale.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer = keras.optimizers.Adam(learning_rate=0.0001),
    metrics=[keras.metrics.MeanAbsoluteError(), keras.metrics.RootMeanSquaredError()],
)

# train the model
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=4096,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

# save the model after training.
model.save('CF_Final.keras')

# Plotting Necessary Metrics

In [None]:
import matplotlib.pyplot as plt
import pickle

# Load a previously stored training history; the code below reads its
# 'loss' and 'val_loss' entries.
# NOTE(review): pickle.load can execute arbitrary code -- only open a history
# file that this project produced itself.
with open('/content/Ahana/history', "rb") as file_pi:
    history = pickle.load(file_pi)

# Labels are attached to the lines themselves so the legend cannot drift out of
# sync with plotting order; the second curve is the validation loss.
fig, ax = plt.subplots()
ax.plot(history['loss'], label='train')
ax.plot(history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

# Collaborative Filtering Evaluation

In [None]:
movie_df = pd.read_csv("/content/ml-25m/ml-25m/ml-25m/movies.csv")


def _in_given_order(frame, ordered_ids):
    """Rows of `frame` whose movieId is in `ordered_ids`, sorted to follow that order."""
    position = {movie_id: pos for pos, movie_id in enumerate(ordered_ids)}
    selected = frame[frame["movieId"].isin(ordered_ids)]
    return selected.sort_values(by="movieId", key=lambda ids: ids.map(position))


# Let us get a user and see the top recommendations.
user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
# Candidates: catalogue movies this user has no rating for ...
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]
# ... restricted to movies that have an encoded index.
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
# One (user index, movie index) row per candidate movie.
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)
ratings = model.predict(user_movie_array).flatten()
# Positions of the ten highest predictions, best first.
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
# Keep the rating order instead of falling back to catalogue order.
movie_df_rows = _in_given_order(movie_df, top_movies_user)
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top movie recommendations")
print("----" * 8)
# Keep the predicted-score ranking (best first) instead of catalogue order.
recommended_movies = _in_given_order(movie_df, recommended_movie_ids)
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)
# Writes the model to the same file name used after training.
model.save('CF_Final.keras')


In [None]:
# Print the Keras summary of whatever `model` currently refers to.
model.summary()