In [2]:
import torch
from torch.utils.data import Dataset,DataLoader
import numpy as np 
import pandas as pd
import tensorflow as tf
import re
from collections import  Counter
from sklearn.model_selection import train_test_split 
from torchmetrics import R2Score
from torch.nn import MSELoss
from torch.optim import Adam
from typing import Dict
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
rating=pd.read_csv(r'D:\movrec\data_preprocessing\cpdata4v2\ratings.csv')
#movies=pd.read_csv(r'D:\movrec\data_preprocessing\cpdata3v2\movies.csv')
user_num=len(pd.unique(rating['userId']))+1
movie_num=len(pd.unique(rating['movieId']))+1
print(user_num,movie_num)
train,test=train_test_split(rating,test_size=0.09,stratify=rating['userId'])
train=train.to_numpy()
test=test.to_numpy()
user_train=torch.tensor(train[:,0],dtype=torch.long)
movie_train=torch.tensor(train[:,1],dtype=torch.int32)
rating_train=torch.tensor(train[:,2],dtype=torch.float32)
training_data=torch.utils.data.DataLoader(torch.utils.data.TensorDataset(user_train,movie_train,rating_train),batch_size=256)
#delete unneed data
user_train=None
movie_train=None
rating_train=None
user_val=torch.tensor(test[:,0],dtype=torch.long)
movie_val=torch.tensor(test[:,1],dtype=torch.int32)
rating_val=torch.tensor(test[:,2],dtype=torch.float32)
dev_data=torch.utils.data.DataLoader(torch.utils.data.TensorDataset(user_val,movie_val,rating_val),batch_size=1024)
#delete unneed data
user_val=None
movie_val=None
rating_val=None
#  i do not need movies data all data is stored
title=torch.load(r'D:\movrec\model\toknized_tensor\title.pt')
cast=torch.load(r'D:\movrec\model\toknized_tensor\cast.pt')
director=torch.load(r'D:\movrec\model\toknized_tensor\director.pt')
genre=torch.load(r'D:\movrec\model\toknized_tensor\genre.pt')
overrview=torch.load(r'D:\movrec\model\toknized_tensor\overrview.pt')
numeric_movie_data=torch.load(r'D:\movrec\model\toknized_tensor\numeric_movie_data.pt')
production_countries=torch.load(r'D:\movrec\model\toknized_tensor\production_countries.pt')
production_compaines=torch.load(r'D:\movrec\model\toknized_tensor\production_compaines.pt')

151968 9448


In [None]:
class FunctionalModel(torch.nn.Module):
    def __init__(self, user_size):
        super(FunctionalModel, self).__init__()
        self.user_emb = torch.nn.Embedding(user_size, 100)
        self.title_emb=torch.nn.Embedding(torch.max(title)+1,20)
        self.overreview_emb=torch.nn.Embedding(torch.max(overrview)+1,20)
        self.director_emb=torch.nn.Embedding(torch.max(director)+1,8)
        self.cast_emb=torch.nn.Embedding(torch.max(cast)+1,10)
        self.genre_emb=torch.nn.Embedding(torch.max(genre)+1,15)
        self.prod_comp_emb=torch.nn.Embedding(torch.max(production_compaines)+1,10)
        self.prod_count_emb=torch.nn.Embedding(torch.max(production_countries)+1,10)
        self.dropout=torch.nn.Dropout(0.2)
    def forward(self, user_ids, movie_ids):
        #because index of tokenized data start from 0 not one 
        movie_ids=movie_ids-1 
        user =self.user_emb(user_ids)
        tit=self.title_emb(title[movie_ids])
        ovrv=self.overreview_emb(overrview[movie_ids])
        dire=self.director_emb(director[movie_ids])
        ct=self.cast_emb(cast[movie_ids])
        gn=self.genre_emb(genre[movie_ids])
        pd_cmp=self.prod_comp_emb(production_compaines[movie_ids])
        pd_count=self.prod_count_emb(production_countries[movie_ids])
        num_data=numeric_movie_data[movie_ids]
        #because the diff of sequence  we must reduce the dim without lossing of data
        ovrv_vec=ovrv.mean(dim=1)
        ct_vec=ct.mean(dim=1)
        gn_vec=gn.mean(dim=1)
        pd_cmp_vec=pd_cmp.mean(dim=1)
        pd_count_vec=pd_count.mean(dim=1) 
        movie=torch.cat((tit,ovrv_vec,dire,ct_vec,gn_vec,pd_cmp_vec,pd_count_vec,num_data),dim=-1)
        movie=self.dropout(movie)
        user=self.dropout(user)
        x = torch.sum(user * movie, dim=-1, keepdim=True)
        return x.squeeze()
class ResNetBlock(torch.nn.Module):
    def __init__(self,in_f,out_f,activation=torch.nn.ELU()):
        super(ResNetBlock,self).__init__()
        self.f1=torch.nn.Linear(in_f,out_f)
        self.f2=torch.nn.Linear(out_f,out_f)
        self.res=torch.nn.Linear(in_f,out_f)
        self.batchNorm1=torch.nn.BatchNorm1d(out_f, momentum=0.3)
        self.batchNorm2=torch.nn.BatchNorm1d(out_f,momentum=0.3)
        self.activation=activation
        
    def forward(self,x):
        shortcut=self.res(x)
        x=self.f1(x)
        x=self.batchNorm1(x)
        x=self.activation(x)
        x=self.f2(x)
        x=self.batchNorm2(x)
        x=x+shortcut
        x=self.activation(x)
        return x
class Model(torch.nn.Module):
    def __init__(self,user_size,in_out_list:list,activation):
        super(Model,self).__init__()
        self.user_emb = torch.nn.Embedding(user_size, 100)
        self.title_emb=torch.nn.Embedding(torch.max(title)+1,20)
        self.overreview_emb=torch.nn.Embedding(torch.max(overrview)+1,20)
        self.director_emb=torch.nn.Embedding(torch.max(director)+1,8)
        self.cast_emb=torch.nn.Embedding(torch.max(cast)+1,10)
        self.genre_emb=torch.nn.Embedding(torch.max(genre)+1,15)
        self.prod_comp_emb=torch.nn.Embedding(torch.max(production_compaines)+1,10)
        self.prod_count_emb=torch.nn.Embedding(torch.max(production_countries)+1,10)
        self.layers=torch.nn.ModuleList() 
        in_f=100
        for out_f in in_out_list:
            self.layers.append(ResNetBlock(in_f,out_f,activation))
            in_f=out_f
        self.output=torch.nn.Linear(in_out_list[-1],1)

        self.model=torch.nn.Sequential(*self.layers)
    def forward(self,user_ids,movie_ids):
        movie_ids=movie_ids-1 
        user = self.user_emb(user_ids)
        tit=self.title_emb(title[movie_ids])
        ovrv=self.overreview_emb(overrview[movie_ids])
        dire=self.director_emb(director[movie_ids])
        ct=self.cast_emb(cast[movie_ids])
        gn=self.genre_emb(genre[movie_ids])
        pd_cmp=self.prod_comp_emb(production_compaines[movie_ids])
        pd_count=self.prod_count_emb(production_countries[movie_ids])
        ovrv_vec=ovrv.mean(dim=1)
        ct_vec=ct.mean(dim=1)
        gn_vec=gn.mean(dim=1)
        pd_cmp_vec=pd_cmp.mean(dim=1)
        pd_count_vec=pd_count.mean(dim=1) 
        num_data=numeric_movie_data[movie_ids]
        movie=torch.cat((tit,ovrv_vec,dire,ct_vec,gn_vec,pd_cmp_vec,pd_count_vec,num_data),dim=-1)
        result=user*movie
        x=self.model(result)
        x=self.output(x)
        return x.squeeze()
model=Model(user_num,[128,256,512],torch.nn.ReLU())

In [46]:
path_of_base=r'D:\movrec\model\hybrid model\hybird_base_model\base_model_MAE_0.6602.pth'
base_model=FunctionalModel(user_num)
base_model.load_state_dict(torch.load(path_of_base))


<All keys matched successfully>

In [47]:
stat_dict = base_model.state_dict()
with torch.no_grad():
    for name, parameter in model.named_parameters():
        if name in stat_dict:
            parameter.copy_(stat_dict[name])
            parameter.requires_grad = False
            print(parameter.requires_grad)


False
False
False
False
False
False
False
False


In [48]:

optimizer = Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.L1Loss()
model = model.to(device)
torch.backends.cudnn.benchmark = True
num_epochs = 5
r2_metric = R2Score().to(device)
for epoch_idx in range(num_epochs):
    model.train()
    running_loss, batch_count = 0.0, 0
    r2_metric.reset()
    
    for user_ids, movie_ids, ratings in training_data:
        user_ids = user_ids.to(device, non_blocking=True)
        movie_ids = movie_ids.to(device, non_blocking=True)
        ratings = ratings.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(user_ids, movie_ids)
        loss = criterion(output, ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        r2_metric.update(output.float(), ratings.float())
        batch_count += 1
        print(
            f"\rBatch {batch_count}/{len(training_data)} - Loss: {running_loss / batch_count:.4f}\r",end='',flush=True)
    print()
    train_loss = running_loss / batch_count
    train_r2 = r2_metric.compute()
    print(
        f"Epoch {epoch_idx+1}/{num_epochs} - "
        f"Train Loss: {train_loss:.4f}, Train R2: {train_r2:.4f}"
    )
    model.eval()
    r2_metric.reset()
    val_loss_total = 0.0
    with torch.inference_mode():
        for user_ids, movie_ids, ratings in dev_data:
            user_ids = user_ids.to(device, non_blocking=True)
            movie_ids = movie_ids.to(device, non_blocking=True)
            ratings = ratings.to(device, non_blocking=True)

            output = model(user_ids, movie_ids)
            val_loss_total += criterion(output, ratings).item()
            r2_metric.update(output.float(), ratings.float())

    avg_val_loss = val_loss_total / len(dev_data)
    val_r2 = r2_metric.compute()
    print(
        f"        Validation: Loss: {avg_val_loss:.4f}, R2: {val_r2:.4f}"
    )


Batch 89043/89043 - Loss: 0.5300
Epoch 1/5 - Train Loss: 0.5300, Train R2: 0.4845
        Validation: Loss: 0.5235, R2: 0.4879
Batch 89043/89043 - Loss: 0.5239
Epoch 2/5 - Train Loss: 0.5239, Train R2: 0.4830
        Validation: Loss: 0.5222, R2: 0.4870
Batch 89043/89043 - Loss: 0.5223
Epoch 3/5 - Train Loss: 0.5223, Train R2: 0.4816
        Validation: Loss: 0.5208, R2: 0.4839
Batch 89043/89043 - Loss: 0.5214
Epoch 4/5 - Train Loss: 0.5214, Train R2: 0.4807
        Validation: Loss: 0.5206, R2: 0.4834
Batch 20939/89043 - Loss: 0.5211

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(),r'D:\movrec\model\hybrid model\adv_model\model_4_5211.pth')

In [None]:

optimizer = Adam(model.parameters(), lr=3e-4)
criterion = torch.nn.L1Loss()
model = model.to(device)
torch.backends.cudnn.benchmark = True
num_epochs = 5
r2_metric = R2Score().to(device)
for epoch_idx in range(num_epochs):
    model.train()
    running_loss, batch_count = 0.0, 0
    r2_metric.reset()
    
    for user_ids, movie_ids, ratings in training_data:
        user_ids = user_ids.to(device, non_blocking=True)
        movie_ids = movie_ids.to(device, non_blocking=True)
        ratings = ratings.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(user_ids, movie_ids)
        loss = criterion(output, ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        r2_metric.update(output.float(), ratings.float())
        batch_count += 1
        print(
            f"\rBatch {batch_count}/{len(training_data)} - Loss: {running_loss / batch_count:.4f}\r",end='',flush=True)
    print()
    train_loss = running_loss / batch_count
    train_r2 = r2_metric.compute()
    print(
        f"Epoch {epoch_idx+1}/{num_epochs} - "
        f"Train Loss: {train_loss:.4f}, Train R2: {train_r2:.4f}"
    )
    model.eval()
    r2_metric.reset()
    val_loss_total = 0.0
    with torch.inference_mode():
        for user_ids, movie_ids, ratings in dev_data:
            user_ids = user_ids.to(device, non_blocking=True)
            movie_ids = movie_ids.to(device, non_blocking=True)
            ratings = ratings.to(device, non_blocking=True)

            output = model(user_ids, movie_ids)
            val_loss_total += criterion(output, ratings).item()
            r2_metric.update(output.float(), ratings.float())

    avg_val_loss = val_loss_total / len(dev_data)
    val_r2 = r2_metric.compute()
    print(
        f"        Validation: Loss: {avg_val_loss:.4f}, R2: {val_r2:.4f}"
    )


Batch 3/89043 - Loss: 0.5284

Batch 89043/89043 - Loss: 0.5215
Epoch 1/5 - Train Loss: 0.5215, Train R2: 0.4841
        Validation: Loss: 0.5212, R2: 0.4854
Batch 89043/89043 - Loss: 0.5211
Epoch 2/5 - Train Loss: 0.5211, Train R2: 0.4839
        Validation: Loss: 0.5210, R2: 0.4850
Batch 89043/89043 - Loss: 0.5210
Epoch 3/5 - Train Loss: 0.5210, Train R2: 0.4836
        Validation: Loss: 0.5209, R2: 0.4848
Batch 89043/89043 - Loss: 0.5209
Epoch 4/5 - Train Loss: 0.5209, Train R2: 0.4834
        Validation: Loss: 0.5208, R2: 0.4845
Batch 89043/89043 - Loss: 0.5208
Epoch 5/5 - Train Loss: 0.5208, Train R2: 0.4832
        Validation: Loss: 0.5208, R2: 0.4843
