In [1]:
import marimo as mo
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import os

In [2]:
from pathlib import Path

# Root folder of the CSV files; every path below is resolved relative to it.
data_path = Path("data")

# Location of each CSV file under data_path.
genome_scores_data_path = data_path.joinpath("genome_scores.csv")
genome_tags_data_path = data_path.joinpath("genome_tags.csv")
link_data_path = data_path.joinpath("link.csv")
movie_data_path = data_path.joinpath("movie.csv")
rating_data_path = data_path.joinpath("rating.csv")
tag_data_path = data_path.joinpath("tag.csv")

In [3]:
# Load the ratings and parse the timestamp column (a single conversion is
# enough; repeating it on an already-converted column changes nothing).
rating=pd.read_csv(rating_data_path)
rating["timestamp"]=pd.to_datetime(rating["timestamp"])
# Order every user's ratings from the most recent to the oldest.
rating:pd.DataFrame=rating.sort_values(by=["userId","timestamp"],ascending=[True,False])

In [4]:
# Read the genome scores CSV; the frame displayed in the next cell has
# movieId, tagId and relevance columns.
genome_df=pd.read_csv(genome_scores_data_path)

In [5]:
genome_df

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02500
1,1,2,0.02500
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675
...,...,...,...
11709763,131170,1124,0.58775
11709764,131170,1125,0.01075
11709765,131170,1126,0.01575
11709766,131170,1127,0.11450


In [6]:
genome_df.movieId.unique()

array([     1,      2,      3, ..., 131013, 131168, 131170],
      shape=(10381,))

In [7]:
import numpy as np

# Group the scores by movie so each movie's relevance values become one row.
genome_group = genome_df.groupby("movieId")

# Rows follow the iteration order of genome_group.groups; each row holds the
# relevance values of one movie in the order they appear in genome_df.
relevance_rows = []
for movie_key in genome_group.groups:
    movie_scores = genome_group.get_group(movie_key)["relevance"]
    relevance_rows.append(np.array(movie_scores.to_list()))
genome_matrix = np.array(relevance_rows)

In [8]:
genome_matrix

array([[0.025  , 0.025  , 0.05775, ..., 0.03625, 0.07775, 0.023  ],
       [0.03975, 0.04375, 0.03775, ..., 0.01475, 0.09025, 0.01875],
       [0.0435 , 0.05475, 0.028  , ..., 0.0195 , 0.097  , 0.0185 ],
       ...,
       [0.042  , 0.03175, 0.017  , ..., 0.0095 , 0.0755 , 0.01625],
       [0.05975, 0.10375, 0.1785 , ..., 0.51925, 0.067  , 0.01725],
       [0.05325, 0.08625, 0.017  , ..., 0.01575, 0.1145 , 0.02175]],
      shape=(10381, 1128))

In [9]:
## I want to create an embedding model for the movies. We have a lot of information for each movie, so the number of dimensions has to be reduced substantially; I think the best way to do that is an autoencoder.

class Encoder(nn.Module):
    """Feed-forward encoder mapping ``n_labels`` inputs to an ``output_dim`` code.

    Layout: Linear(n_labels -> hidden) / Dropout / ReLU, followed by
    ``n_layers`` blocks of Linear(hidden -> hidden) / Dropout / ReLU and a
    final Linear(hidden -> output_dim) without activation.

    Args:
        n_labels: size of the input vector.
        hidden_dimensions: width of every hidden layer.
        n_layers: number of hidden-to-hidden blocks after the input block.
        output_dim: size of the returned code.
        dropout: dropout probability applied in every block.
    """

    def __init__(self, n_labels, hidden_dimensions=512, n_layers=3, output_dim=10, dropout:float=0.2):
        super().__init__()
        layers=[]
        layers.append(nn.Linear(n_labels,hidden_dimensions))
        # Honour the `dropout` argument in the first block as well; a literal
        # 0.2 here would silently ignore the caller's setting.
        layers.append(nn.Dropout(dropout))
        layers.append(nn.ReLU())
        for _ in range(n_layers):
            layers.append(nn.Linear(hidden_dimensions,hidden_dimensions))
            layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_dimensions,output_dim))
        self.model=nn.Sequential(*layers)

    def forward(self,x):
        """Return the code produced by the layer stack for ``x``."""
        return self.model(x)

class Decoder(nn.Module):
    """Feed-forward decoder expanding an ``output_dim`` code back to ``n_labels`` values.

    The stack is ``1 + n_layers`` blocks of Linear / Dropout / ReLU (the first
    one starting from ``output_dim``), closed by a Linear layer that projects
    to ``n_labels`` without activation.
    """

    def __init__(self, n_labels, hidden_dimensions=512, n_layers=3, output_dim=10,dropout:float=0.2):
        super().__init__()
        # Input width of the first block, then the width after each block.
        widths = [output_dim, hidden_dimensions] + [hidden_dimensions] * n_layers
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.Dropout(dropout), nn.ReLU()])
        stack.append(nn.Linear(hidden_dimensions, n_labels))
        self.model = nn.Sequential(*stack)

    def forward(self,x):
        """Return the reconstruction produced by the layer stack for ``x``."""
        reconstruction = self.model(x)
        return reconstruction


class AutoEncoder(nn.Module):
    """Encoder followed by Decoder, both built from the same hyper-parameters."""

    def __init__(self, n_labels, hidden_dimensions=512, n_layers=3, output_dim=10,dropout:float=0.2):
        super().__init__()
        # Both halves receive exactly the same positional arguments.
        shared_args = (n_labels, hidden_dimensions, n_layers, output_dim, dropout)
        self.encoder = Encoder(*shared_args)
        self.decoder = Decoder(*shared_args)

    def encode(self,x):
        """Run only the encoder half."""
        return self.encoder(x)

    def decode(self,x):
        """Run only the decoder half."""
        return self.decoder(x)

    def forward(self,x):
        """Encode ``x`` and decode the result again."""
        return self.decode(self.encode(x))

In [10]:
import random
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and with it the validation loss, is the same
# after every kernel restart.
SPLIT_SEED=42

train_sample,test_sample=train_test_split(genome_matrix,train_size=0.8,random_state=SPLIT_SEED)

# Only the held-out rows are turned into a tensor here; train_sample stays a
# NumPy array at this point.
test_sample=torch.from_numpy(test_sample)

In [11]:
train_sample

array([[0.0185 , 0.02075, 0.026  , ..., 0.7805 , 0.058  , 0.01575],
       [0.043  , 0.05425, 0.127  , ..., 0.041  , 0.0585 , 0.0175 ],
       [0.0345 , 0.03625, 0.03925, ..., 0.01175, 0.09775, 0.02775],
       ...,
       [0.04525, 0.037  , 0.01825, ..., 0.01175, 0.10775, 0.018  ],
       [0.054  , 0.06025, 0.042  , ..., 0.01375, 0.069  , 0.016  ],
       [0.033  , 0.039  , 0.0975 , ..., 0.01   , 0.056  , 0.019  ]],
      shape=(8304, 1128))

In [12]:
len(test_sample)

2077

In [13]:
from torch.utils.data import DataLoader,Dataset



# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device="cuda"
elif torch.backends.mps.is_available():
    device="mps"
else:
    device="cpu"

# The autoencoder's input width is the length of one genome_matrix row.
model=AutoEncoder(n_labels=len(genome_matrix[0]))
model=model.to(device)

# Shuffled mini-batches of 1024 training rows.
dataloader=DataLoader(
    dataset=torch.from_numpy(train_sample),
    batch_size=1024,
    shuffle=True,
)
optimizer=optim.AdamW(params=model.parameters(),lr=0.0006)
criterion=nn.MSELoss()

In [14]:
# Loss histories: `losses` gets one entry per training batch, `val_losses`
# one entry per validation run.
losses, val_losses = [], []

In [15]:
def validate_model(model,criterion,X,y=None,threshold=0.0005,verbose=False):
    """Evaluate ``model`` on ``X`` and compare the loss with ``threshold``.

    The model is switched to eval mode (and left in it) and is run without
    gradient tracking.

    Args:
        model: module to evaluate.
        criterion: loss callable, invoked as ``criterion(target, output)``.
        X: input tensor; cast to float and moved to the model's device.
        y: optional target tensor. When ``None`` the input itself is the
            target, i.e. the loss is a reconstruction loss.
        threshold: loss value at or below which the model counts as validated.
        verbose: print the loss when true.

    Returns:
        Tuple ``(loss <= threshold, loss)`` with the loss as a Python float.
    """
    model.eval()
    # Run on the device the model lives on instead of relying on a notebook
    # global; a module without parameters falls back to the input's device.
    first_param=next(model.parameters(),None)
    target_device=first_param.device if first_param is not None else X.device
    with torch.no_grad():
        output=model(X.float().to(target_device))
        # `y or X` would call bool() on a tensor, which raises for any tensor
        # holding more than one element, so test for None explicitly.
        target=X if y is None else y
        target=target.float().to(target_device)
        loss_value=criterion(target,output).item()
    if verbose: print(f"Validation Loss: {loss_value}")
    return loss_value<=threshold,loss_value

In [16]:
import tqdm


N_EPOCHS=21        # total passes over the training set
VALIDATE_EVERY=10  # validation runs on epochs 0, 10, 20, ...

# A named epoch counter is used because `_` conventionally marks an unused
# value and is also IPython's last-output variable.
for epoch in tqdm.tqdm(range(N_EPOCHS)):
    # validate_model() leaves the model in eval mode, so re-enable training
    # behaviour (dropout) at the start of every epoch.
    model.train()
    for batch in dataloader:
        batch=batch.float().to(device)
        with torch.autocast(device_type=device):
            output=model(batch)
            loss=criterion(batch,output)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    if epoch%VALIDATE_EVERY==0:
        is_model_validated,val_loss=validate_model(model,criterion=criterion,verbose=False,X=test_sample)
        val_losses.append(val_loss)
        # Stop early once the validation loss reaches validate_model's threshold.
        if is_model_validated:
            break

100%|██████████| 21/21 [00:03<00:00,  6.39it/s]


In [17]:
validate_model(model,criterion=criterion,verbose=True,X=test_sample)

Validation Loss: 0.012048648670315742


(False, 0.012048648670315742)

In [18]:
# One point per validation run; the title lets the figure stand on its own.
plot_fig2=px.line(val_losses,title="Autoencoder validation loss per validation run")
plot_fig2.show()

In [19]:
# One point per training batch; the title lets the figure stand on its own.
plot_fig=px.line(losses,title="Autoencoder training loss per batch")
plot_fig.show()

In [20]:
genome_matrix[0]

array([0.025  , 0.025  , 0.05775, ..., 0.03625, 0.07775, 0.023  ],
      shape=(1128,))

In [21]:
movie_matrix=torch.from_numpy(genome_matrix).float().to(device)
# Inference only: make sure dropout is disabled and do not build an autograd
# graph for the whole matrix.
model.eval()
with torch.no_grad():
    movie_matrix_encoded=model.encode(movie_matrix)

In [22]:
# One row per movie, one column per encoded dimension.
movie_matrix_encoded_df=pd.DataFrame(movie_matrix_encoded.detach().cpu().numpy())
# genome_matrix rows were built by iterating genome_group.groups, so the same
# iteration gives the movie id of every row. Materialise the keys first:
# np.array() applied to a dict_keys view yields a 0-d object array rather than
# a 1-D array of ids.
movie_matrix_encoded_df.index=np.array(list(genome_group.groups))

In [23]:
torch.from_numpy(np.zeros(len(genome_matrix))).float()

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [24]:
model.encode(torch.from_numpy(np.zeros(len(genome_matrix[0]))).float().to(device))

tensor([-0.0143, -0.0506,  0.0003,  0.0120, -0.0501, -0.0074, -0.0170, -0.0586,
         0.0194,  0.0162], device='mps:0', grad_fn=<LinearBackward0>)

In [25]:
movie_matrix_encoded_df.hist(backend="plotly")

In [26]:
movie_matrix_encoded_df.loc[1,:].values

array([-0.41536108, -0.23976503, -0.02913168, -0.1255222 , -0.35692334,
       -0.1726136 ,  0.01191295, -0.3329451 , -0.03869736,  0.11631032],
      dtype=float32)

In [27]:
movie_matrix_encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,-0.415361,-0.239765,-0.029132,-0.125522,-0.356923,-0.172614,0.011913,-0.332945,-0.038697,0.116310
2,-0.342941,-0.214567,-0.042160,-0.120707,-0.326535,-0.147327,0.024373,-0.306981,-0.060213,0.131139
3,-0.055752,-0.070928,-0.006951,-0.008342,-0.089737,-0.025962,-0.008758,-0.094501,0.000480,0.038988
4,0.002037,-0.042802,0.003016,0.020013,-0.034208,-0.001054,-0.020325,-0.043630,0.026590,0.007546
5,-0.064242,-0.075144,-0.008613,-0.012858,-0.098521,-0.029499,-0.007473,-0.102096,-0.003088,0.043254
...,...,...,...,...,...,...,...,...,...,...
130578,-0.062949,-0.074159,-0.007753,-0.011596,-0.095920,-0.029081,-0.008036,-0.100424,-0.001181,0.041776
130840,0.089688,-0.001081,0.025669,0.067221,0.053633,0.037532,-0.049738,0.036769,0.076114,-0.067463
131013,-0.079028,-0.081143,-0.010375,-0.018837,-0.110186,-0.035091,-0.005511,-0.112772,-0.007368,0.047933
131168,0.191277,0.037877,0.062906,0.129918,0.163150,0.074456,-0.081182,0.124010,0.149843,-0.175648


In [28]:
# Keep one row per (userId, timestamp) pair, modifying `rating` in place.
# NOTE(review): rows where one user rated different movies at the same
# timestamp are dropped as well — confirm this is intended.
rating.drop_duplicates(subset=["userId","timestamp"],inplace=True)

In [29]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
62,1,1750,3.5,2005-04-02 23:56:41
112,1,3997,3.5,2005-04-02 23:56:32
117,1,4133,3.0,2005-04-02 23:56:09
164,1,7449,3.5,2005-04-02 23:56:03
60,1,1525,3.0,2005-04-02 23:55:50
...,...,...,...,...
20000001,138493,1894,3.0,2009-10-17 18:40:27
19999984,138493,1591,3.0,2009-10-17 18:40:21
20000230,138493,48780,5.0,2009-10-17 18:39:59
20000056,138493,2872,4.0,2009-10-17 18:39:44


In [30]:
movie_matrix_encoded_df.shape

(10381, 10)

### Next, we need to build a dataset suitable for training a Neural Collaborative Filtering model.
To do that, we should first understand what the model needs to learn.

1. We need the list of movies that each person has seen. It is very important to order them by time; otherwise we would introduce a look-ahead bias.
2. We need to include some movies that the person has not seen; otherwise the model would only learn from movies that somebody liked.

In [31]:
class MovieDataset(Dataset):
    """Positive and negative (user, movie) samples for the recommender.

    Every row of ``rating`` yields one positive sample (label 1) followed by
    ``n_negatives`` randomly drawn movies the same user has no rating for
    (label 0).

    Attributes (names kept unchanged so previously cached files still load):
        movie: float tensor with one movie feature vector per sample.
        movie_infos: tensor with the movie id of each sample.
        users: tensor with the user id of each sample.
        ratings: float tensor with the rating attached to each sample.
        labels: tensor of 1 (rated) / 0 (sampled negative).
    """

    def __init__(self, movie_ids,rating,movie_info,n_negatives=1,force_new_dataset=False,path="dataset_2"):
        """Load the cached dataset at ``data_path/path`` or build a new one.

        Args:
            movie_ids: pool of movie ids the negatives are drawn from.
            rating: DataFrame with ``movieId``, ``userId`` and ``rating`` columns.
            movie_info: DataFrame of movie feature vectors indexed by movie id.
            n_negatives: number of negatives generated per positive sample.
            force_new_dataset: rebuild even when a cache file exists.
            path: cache file name, relative to the notebook's ``data_path``.
        """
        self.path=path
        if os.path.exists(data_path/path) and not force_new_dataset:
            # NOTE: joblib.load unpickles arbitrary objects; only load cache
            # files that were produced by this notebook.
            tmp_dataset=joblib.load(data_path/path)
            self.movie,self.movie_infos,self.users,self.ratings,self.labels=tmp_dataset.movie,tmp_dataset.movie_infos,tmp_dataset.users,tmp_dataset.ratings,tmp_dataset.labels
        else:
            self.movie,self.movie_infos,self.users,self.ratings,self.labels=self.build_dataset(movie_ids,rating,movie_info,n_negatives)
            # Persist only freshly built data; re-dumping a cache that was
            # just loaded would rewrite the same content.
            self.save_dataset()

    def save_dataset(self):
        """Dump this dataset object to ``data_path/self.path`` with joblib."""
        with open(data_path/self.path,"wb") as f:
            joblib.dump(self,f,compress=9)


    def __getitem__(self,idx):
        """Return ``(features, movie id, user id, rating, label)`` for sample ``idx``."""
        return self.movie[idx],self.movie_infos[idx],self.users[idx],self.ratings[idx],self.labels[idx]

    def __len__(self):
        """Number of samples (positives plus negatives)."""
        return len(self.labels)

    def build_dataset(self, movie_ids,rating,movie_info,n_negatives=1):
        """Generate the samples and convert every column to a tensor.

        Returns:
            ``(features, movie ids, user ids, ratings, labels)``; features and
            ratings are cast to float.
        """
        movies,movie_ids,users,ratings,labels=self.get_dataset(movie_ids,rating,movie_info,n_negatives)
        movies=self.convert_datas(movies,convert_to_float=True)
        movie_ids=self.convert_datas(movie_ids)
        users=self.convert_datas(users)
        ratings=self.convert_datas(ratings,convert_to_float=True)
        labels=self.convert_datas(labels)
        return movies,movie_ids,users,ratings,labels


    def get_constant(self):
        """Encode an all-zero genome vector; used for movies without features.

        Relies on the notebook-level ``genome_matrix``, ``model`` and ``device``.
        """
        input=torch.from_numpy(np.zeros(len(genome_matrix[0]))).float().to(device)
        output=model.encode(input).detach().cpu().numpy()
        return output

    @staticmethod
    def _movie_features(movie_info,movie_id,fallback):
        """Return the feature row of ``movie_id``, or ``fallback`` when it has none."""
        try:
            return movie_info.loc[movie_id,:].values
        except KeyError:
            return fallback

    def get_dataset(self,all_movie_ids,rating,movie_info,n_negatives):
        """Build the raw sample columns as NumPy arrays.

        Returns:
            ``(features, movie ids, user ids, ratings, labels)``.
        """
        movies,movie_ids,ratings,users,labels=[],[],[],[],[]

        constant=self.get_constant()
        # (movie, user) pairs that already have a rating; negatives must not
        # be drawn from this set.
        movie_users=set(zip(rating["movieId"],rating["userId"]))

        # Iterate the three columns together so every rating stays attached
        # to its own (movie, user) pair. Zipping the ratings against the set
        # above would pair them in arbitrary set order.
        positives=zip(rating["movieId"],rating["userId"],rating["rating"])
        for movie_id,user_id,user_rating in tqdm.tqdm(positives,total=len(rating)):
            movie_ids.append(movie_id)
            movies.append(self._movie_features(movie_info,movie_id,constant))
            users.append(user_id)
            ratings.append(user_rating)
            labels.append(1)

            for _ in range(n_negatives):

                negative_movie=np.random.choice(all_movie_ids)

                # Redraw until the user has no rating for the sampled movie.
                while (negative_movie,user_id) in movie_users:
                    negative_movie=np.random.choice(all_movie_ids)

                movie_ids.append(negative_movie)
                # Features must belong to the sampled negative movie, not to
                # the positive movie of the enclosing iteration.
                movies.append(self._movie_features(movie_info,negative_movie,constant))
                # NOTE(review): negatives carry the rating of the positive
                # sample they were generated from — confirm this is intended.
                ratings.append(user_rating)
                users.append(user_id)
                labels.append(0)
        return np.array(movies),np.array(movie_ids),np.array(users),np.array(ratings),np.array(labels)

    @staticmethod
    def convert_datas(array,convert_to_float=False):
        """Wrap a NumPy array in a tensor, optionally casting it to float."""
        array=torch.from_numpy(array)
        if convert_to_float:
            array=array.float()
        return array

In [32]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
62,1,1750,3.5,2005-04-02 23:56:41
112,1,3997,3.5,2005-04-02 23:56:32
117,1,4133,3.0,2005-04-02 23:56:09
164,1,7449,3.5,2005-04-02 23:56:03
60,1,1525,3.0,2005-04-02 23:55:50
...,...,...,...,...
20000001,138493,1894,3.0,2009-10-17 18:40:27
19999984,138493,1591,3.0,2009-10-17 18:40:21
20000230,138493,48780,5.0,2009-10-17 18:39:59
20000056,138493,2872,4.0,2009-10-17 18:39:44


In [33]:
# Reload the cached dataset for inspection. The existence check and the load
# must target the same file; "dataset_2" is MovieDataset's default cache name.
# NOTE: joblib.load unpickles arbitrary objects; only load files created by
# this notebook.
cached_dataset_path=data_path/"dataset_2"
if os.path.exists(cached_dataset_path):
    tmp_dataset=joblib.load(cached_dataset_path)

In [34]:
tmp_dataset.ratings

tensor([4., 4., 4.,  ..., 4., 4., 4.])

In [35]:
# Every distinct movie id that appears in the ratings; passed to MovieDataset
# below as the pool negative samples are drawn from.
movie_ids=rating.movieId.unique()

In [36]:
# Sort in place: users ascending, each user's ratings from newest to oldest.
rating.sort_values(by=["userId","timestamp"],ascending=[True,False],inplace=True)
rating

Unnamed: 0,userId,movieId,rating,timestamp
62,1,1750,3.5,2005-04-02 23:56:41
112,1,3997,3.5,2005-04-02 23:56:32
117,1,4133,3.0,2005-04-02 23:56:09
164,1,7449,3.5,2005-04-02 23:56:03
60,1,1525,3.0,2005-04-02 23:55:50
...,...,...,...,...
20000001,138493,1894,3.0,2009-10-17 18:40:27
19999984,138493,1591,3.0,2009-10-17 18:40:21
20000230,138493,48780,5.0,2009-10-17 18:39:59
20000056,138493,2872,4.0,2009-10-17 18:39:44


In [37]:
# Rank each user's ratings by recency: 1.0 is the most recent timestamp, and
# ties are broken by order of appearance (method='first').
# NOTE(review): the saved output of this cell is a KeyboardInterrupt. If the
# frame is already sorted newest-first per user, groupby('userId').cumcount()+1
# would presumably give the same ranks more cheaply — verify before switching.
rating['rank_latest'] = rating.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

KeyboardInterrupt: 

In [39]:
# Hold out each user's three most recent ratings for testing; everything
# ranked after them is used for training.
test_ratings=rating[rating["rank_latest"].le(3)]
train_ratings=rating[rating["rank_latest"].gt(3)]

In [40]:
train_ratings

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
164,1,7449,3.5,2005-04-02 23:56:03,4.0
60,1,1525,3.0,2005-04-02 23:55:50,5.0
109,1,3889,4.0,2005-04-02 23:55:38,6.0
154,1,6888,3.0,2005-04-02 23:55:27,7.0
78,1,2253,3.5,2005-04-02 23:55:22,8.0
...,...,...,...,...,...
20000001,138493,1894,3.0,2009-10-17 18:40:27,367.0
19999984,138493,1591,3.0,2009-10-17 18:40:21,368.0
20000230,138493,48780,5.0,2009-10-17 18:39:59,369.0
20000056,138493,2872,4.0,2009-10-17 18:39:44,370.0


In [None]:
# Build (never load from cache) a dataset from the first 1000 training rows.
train_dataset=MovieDataset(
    movie_ids=movie_ids,
    rating=train_ratings.iloc[:1000],
    movie_info=movie_matrix_encoded_df,
    force_new_dataset=True,
)

100%|██████████| 1000/1000 [00:00<00:00, 7333.31it/s]


In [42]:
class MovieEmbedder(nn.Module):
    """Combine a learned per-movie id embedding with given movie features.

    The id embedding is concatenated with the feature vector and passed
    through an MLP that outputs ``dim_embedding`` values (ReLU-activated).

    Args:
        n_movies: number of rows in the id-embedding table.
        input_features_dim: length of the feature vector passed to ``forward``.
        dim_embedding: size of the vector returned by ``forward``.
        id_embedding_dim: size of the learned id embedding. Defaults to 10,
            the width that was previously fixed in the code.
    """

    def __init__(self,n_movies,input_features_dim,dim_embedding=10,id_embedding_dim=10):
        super().__init__()
        self.embedding=nn.Embedding(n_movies,embedding_dim=id_embedding_dim)
        self.linear_layers=nn.Sequential(
            # Input = id embedding concatenated with the feature vector.
            nn.Linear(id_embedding_dim+input_features_dim,128),
            nn.ReLU(),
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Linear(128,dim_embedding),
            nn.ReLU()
        )

    def forward(self,x,features):
        """Embed the ids ``x``, append ``features`` along dim 1 and run the MLP."""
        x=self.embedding(x)
        x=torch.cat([x,features],dim=1)
        x=self.linear_layers(x)
        return x

    def embed(self,x):
        """Return only the learned id embedding for ``x``."""
        x=self.embedding(x)
        return x

class MRS(nn.Module):
    """Two-headed recommender: watched probability and predicted rating.

    A movie vector (from ``MovieEmbedder``) and a learned user embedding are
    concatenated, passed through a shared MLP and then through two heads.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie id-embedding table.
        input_features_dim: length of the movie feature vector.
        dim_movie_embedding: size of the movie vector fed to the shared MLP.
        dim_users_embedding: size of the user embedding.
    """

    def __init__(self,n_users,n_movies,input_features_dim,dim_movie_embedding=10,dim_users_embedding=10):
        super().__init__()
        self.movie_embedder=MovieEmbedder(n_movies,input_features_dim,dim_movie_embedding)
        self.user_embedding=nn.Embedding(n_users,dim_users_embedding)
        self.common_linear_layers=nn.Sequential(
            nn.Linear(dim_movie_embedding+dim_users_embedding,1024),
            nn.ReLU(),
            nn.Linear(1024,512),
            nn.ReLU(),
            nn.Linear(512,256),
            nn.ReLU(),
            nn.Linear(256,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
        )
        # The sigmoid must receive the raw logit: a ReLU in front of it would
        # clamp the logit at 0 and confine the probability to [0.5, 1).
        self.watched_ll=nn.Sequential(
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,1),
            nn.Sigmoid()
        )
        # NOTE(review): the trailing ReLU keeps predicted ratings >= 0 but
        # passes no gradient while the pre-activation is negative — confirm
        # this is wanted.
        self.rating_ll=nn.Sequential(
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,1),
            nn.ReLU(),
        )

    def forward(self,movie,user,movie_feature):
        """Return ``(watched, rating)`` for a batch of movie ids, user ids and features."""
        movie=self.movie_embedder(movie,movie_feature)
        users=self.user_embedding(user)
        x=torch.concat([movie,users],dim=1)
        x=self.common_linear_layers(x)
        watched=self.watched_ll(x)
        rating=self.rating_ll(x)
        return watched,rating

: 

In [None]:
train_data_loader=DataLoader(train_dataset,batch_size=2,shuffle=True)
# The embedding tables are indexed by raw ids, so they need max id + 1 rows.
n_users=int(rating.userId.max())+1
n_movies=int(rating.movieId.max())+1
mrs=MRS(n_users=n_users,n_movies=n_movies,input_features_dim=10,dim_movie_embedding=10,dim_users_embedding=10)
optimizer_=optim.AdamW(mrs.parameters(),lr=0.0006)
loss1=nn.BCELoss()  # watched / not-watched head
loss2=nn.MSELoss()  # rating head
# MovieDataset.__getitem__ yields (features, movie id, user id, rating, label).
for movie_info,movie,user,rat,label in tqdm.tqdm(train_data_loader):
    watched,rating__=mrs(movie.int(),user.int(),movie_info)
    # BCELoss needs a float target; the dataset stores the labels as integers.
    tmp_loss=loss1(watched.view(-1),label.float())
    # Flatten the (batch, 1) predictions so they line up with the (batch,)
    # ratings instead of broadcasting to (batch, batch).
    _loss=loss2(rating__.view(-1),rat)
    loss_=_loss+tmp_loss
    optimizer_.zero_grad()
    loss_.backward()
    optimizer_.step()

  0%|          | 0/1000 [00:00<?, ?it/s]