In [1]:
import pandas as pd
import torch
import pytorch_lightning as pl
from tqdm import tqdm
import torchmetrics
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
users = pd.read_csv(
    "data/users.csv",
    sep=",",
)

ratings = pd.read_csv(
    "data/ratings.csv",
    sep=",",
)

movies = pd.read_csv(
    "data/movies.csv", sep=","
)

## Pytorch dataset

In [3]:
import pandas as pd
import torch
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence

class MovieDataset(data.Dataset):
    """Movie dataset."""

    def __init__(
        self, ratings_file,test=False
    ):
        """
        Args:
            csv_file (string): Path to the csv file with user,past,future.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
            # iterator=True,
        )
        self.test = test

    def __len__(self):
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        data = self.ratings_frame.iloc[idx]
        user_id = data.user_id
        
        movie_history = eval(data.sequence_movie_ids)
        movie_history_ratings = eval(data.sequence_ratings)
        target_movie_id = movie_history[-1:][0]
        target_movie_rating = movie_history_ratings[-1:][0]
        
        movie_history = torch.LongTensor(movie_history[:-1])
        movie_history_ratings = torch.LongTensor(movie_history_ratings[:-1])

        
        
        sex = data.sex
        age_group = data.age_group
        occupation = data.occupation
        
        return user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation

In [4]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

for genre in genres:
    movies[genre] = movies["genres"].apply(
        lambda values: int(genre in values.split("|"))
    )
    
sequence_length = 8

In [6]:
movies["Children's"]

0       1
1       1
2       0
3       0
4       0
       ..
3878    0
3879    0
3880    0
3881    0
3882    0
Name: Children's, Length: 3883, dtype: int64

In [5]:
# from torch.utils.data import DataLoader, random_split

# train_dataset = MovieDataset("data/train_data.csv")
# val_dataset = MovieDataset("data/test_data.csv")
# test_dataset = MovieDataset("data/test_data.csv")

# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=128,
#     shuffle=False,
#     num_workers=os.cpu_count(),
#     )

# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=128,
#     shuffle=False,
#     num_workers=os.cpu_count(),
#     )

# test_dataloader = DataLoader(
#     test_dataset,
#     batch_size=128,
#     shuffle=False,
#     num_workers=os.cpu_count(),
#     )   


In [7]:
class BST(pl.LightningModule):
    def __init__(
        self, args=None,
    ):
        super().__init__()
        super(BST, self).__init__()
        
        self.save_hyperparameters()
        self.args = args
        #-------------------
        # Embedding layers
        ##Users 
        self.embeddings_user_id = nn.Embedding(
            int(users.user_id.max())+1, int(math.sqrt(users.user_id.max()))+1
        )
        ###Users features embeddings
        self.embeddings_user_sex = nn.Embedding(
            len(users.sex.unique()), int(math.sqrt(len(users.sex.unique())))
        )
        self.embeddings_age_group = nn.Embedding(
            len(users.age_group.unique()), int(math.sqrt(len(users.age_group.unique())))
        )
        self.embeddings_user_occupation = nn.Embedding(
            len(users.occupation.unique()), int(math.sqrt(len(users.occupation.unique())))
        )
        self.embeddings_user_zip_code = nn.Embedding(
            len(users.zip_code.unique()), int(math.sqrt(len(users.sex.unique())))
        )
        
        ##Movies
        self.embeddings_movie_id = nn.Embedding(
            int(movies.movie_id.max())+1, int(math.sqrt(movies.movie_id.max()))+1
        )
        self.embeddings_position  = nn.Embedding(
           sequence_length, int(math.sqrt(len(movies.movie_id.unique())))+1
        )
        ###Movies features embeddings
        genre_vectors = movies[genres].to_numpy()
        self.embeddings_movie_genre = nn.Embedding(
            genre_vectors.shape[0], genre_vectors.shape[1]
        )
        
        self.embeddings_movie_genre.weight.requires_grad = False #Not training genres
        
        
        self.embeddings_movie_year = nn.Embedding(
            len(movies.year.unique()), int(math.sqrt(len(movies.year.unique())))
        )
        
        
        # Network
        self.transfomerlayer = nn.TransformerEncoderLayer(63, 3, dropout=0.2)
        self.linear = nn.Sequential(
            nn.Linear(
                589,
                1024,
            ),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        )
        self.criterion = torch.nn.MSELoss()
        self.mae = torchmetrics.MeanAbsoluteError()
        self.mse = torchmetrics.MeanSquaredError()
        
    def encode_input(self,inputs):
        user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation = inputs
        
        
        #MOVIES
        movie_history = self.embeddings_movie_id(movie_history)
        target_movie = self.embeddings_movie_id(target_movie_id)
        
        positions = torch.arange(0,sequence_length-1,1,dtype=int,device=self.device)
        positions = self.embeddings_position(positions)
        
        encoded_sequence_movies_with_poistion_and_rating = (movie_history + positions) #Yet to multiply by rating
        
        target_movie = torch.unsqueeze(target_movie, 1)
        transfomer_features = torch.cat((encoded_sequence_movies_with_poistion_and_rating, target_movie),dim=1)
        
        #USERS
        user_id = self.embeddings_user_id(user_id)
        
        sex = self.embeddings_user_sex(sex)
        age_group = self.embeddings_age_group(age_group)
        occupation = self.embeddings_user_occupation(occupation)
        user_features = torch.cat((user_id, sex, age_group,occupation), 1)
        
        return transfomer_features, user_features, target_movie_rating.float()
    
    def forward(self, batch):
        transfomer_features, user_features, target_movie_rating = self.encode_input(batch)
        transformer_output = self.transfomerlayer(transfomer_features)
        transformer_output = torch.flatten(transformer_output,start_dim=1)
        
        #Concat with other features
        features = torch.cat((transformer_output,user_features),dim=1)
        
        output = self.linear(features)
        return output, target_movie_rating
        
    def training_step(self, batch, batch_idx):
        out, target_movie_rating = self(batch)
        out = out.flatten()
        loss = self.criterion(out, target_movie_rating)
        
        mae = self.mae(out, target_movie_rating)
        mse = self.mse(out, target_movie_rating)
        rmse =torch.sqrt(mse)
        self.log(
            "train/mae", mae, on_step=True, on_epoch=False, prog_bar=False
        )
        
        self.log(
            "train/rmse", rmse, on_step=True, on_epoch=False, prog_bar=False
        )
        
        self.log("train/step_loss", loss, on_step=True, on_epoch=False, prog_bar=False)
        return loss
    
    def validation_step(self, batch, batch_idx):
        out, target_movie_rating = self(batch)
        out = out.flatten()
        loss = self.criterion(out, target_movie_rating)
        
        mae = self.mae(out, target_movie_rating)
        mse = self.mse(out, target_movie_rating)
        rmse =torch.sqrt(mse)
        self.log_dict({"val_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()})
        
        return {"val_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_mae = torch.stack([x["mae"] for x in outputs]).mean()
        avg_rmse = torch.stack([x["rmse"] for x in outputs]).mean()
        
        self.log("val/loss", avg_loss, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/mae", avg_mae, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/rmse", avg_rmse, on_step=False, on_epoch=True, prog_bar=False)

    def test_step(self, batch, batch_idx):
        # TODO: out, target_movie_rating 어떤 shape 가지는지 이거를 testepochend로 넘기면 문제가 해결될까? 
        out, target_movie_rating = self(batch)
        out = out.flatten()
        loss = self.criterion(out, target_movie_rating)
        
        mae = self.mae(out, target_movie_rating)
        mse = self.mse(out, target_movie_rating)
        rmse =torch.sqrt(mse)
        self.log_dict({"test_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()})
        return {"test_loss": loss, "mae": mae.detach(), "rmse":rmse.detach()}
        # dictt = {}
        # dictt['out'] = out
        # dictt['target_movie_rating'] = target_movie_rating
        # return dictt

# TODO: 여기로 넘어가는 outputs가 무엇인지?
    def test_epoch_end(self, outputs):
        users = torch.cat([x["users"] for x in outputs])
        y_hat = torch.cat([x["top14"] for x in outputs])
        users = users.tolist()
        y_hat = y_hat.tolist()
        
        data = {"users": users, "top14": y_hat}
        df = pd.DataFrame.from_dict(data)
        print(len(df))
        df.to_csv("lightning_logs/predict.csv", index=False)
        # print('hi')
        # return outputs

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.0005)

    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument("--learning_rate", type=float, default=0.01)
        return parser

    ####################
    # DATA RELATED HOOKS
    ####################
    def setup(self, stage=None):
        print("Loading datasets")
        # TODO: size 줄인 train_csv
        self.train_dataset = MovieDataset("/opt/ml/BST/data/train_data copy.csv")
        self.val_dataset = MovieDataset("data/test_data.csv")
        self.test_dataset = MovieDataset("data/test_data.csv")
        print("Done")

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )
        


In [None]:
pl.seed_everything(args.seed)


In [8]:
model = BST()
# TODO: gpus 1이면 오류남
trainer = pl.Trainer(gpus=0,max_epochs=1)
trainer.fit(model)
# torch.save("/opt/ml/BST/samplemodel",model)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Loading datasets



   | Name                       | Type                    | Params
------------------------------------------------------------------------
0  | embeddings_user_id         | Embedding               | 471 K 
1  | embeddings_user_sex        | Embedding               | 2     
2  | embeddings_age_group       | Embedding               | 14    
3  | embeddings_user_occupation | Embedding               | 84    
4  | embeddings_user_zip_code   | Embedding               | 3.4 K 
5  | embeddings_movie_id        | Embedding               | 249 K 
6  | embeddings_position        | Embedding               | 504   
7  | embeddings_movie_genre     | Embedding               | 69.9 K
8  | embeddings_movie_year      | Embedding               | 729   
9  | transfomerlayer            | TransformerEncoderLayer | 276 K 
10 | linear                     | Sequential              | 1.3 M 
11 | criterion                  | MSELoss                 | 0     
12 | mae                        | MeanAbsoluteError    

Done
                                                                      

  rank_zero_warn(


Epoch 0: 100%|██████████| 1136/1136 [00:46<00:00, 24.59it/s, loss=6.08, v_num=18]


In [9]:
# trainer = pl.Trainer(gpus=0,max_epochs=1)

# trainer.test(model)
# loaded_model = BST.load_from_checkpoint('/opt/ml/BST/lightning_logs/version_4/checkpoints/epoch=0-step=6405.ckpt')
# trainer.test(loaded_model)
oooo=trainer.test()



  rank_zero_warn(


Loading datasets


Restoring states from the checkpoint path at /opt/ml/BST/lightning_logs/version_18/checkpoints/epoch=0-step=9.ckpt
Loaded model weights from checkpoint at /opt/ml/BST/lightning_logs/version_18/checkpoints/epoch=0-step=9.ckpt


Done
Testing: 100%|█████████▉| 1124/1126 [00:43<00:00, 50.73it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'mae': 1.4727592468261719,
 'rmse': 1.6709264516830444,
 'test_loss': 2.8259942531585693}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 1126/1126 [00:43<00:00, 26.01it/s]


In [47]:
oooo

[{'test_loss': 2.0338521003723145,
  'mae': 1.2207138538360596,
  'rmse': 1.4179977178573608}]

In [28]:
model.eval()
t_output = model(tuser_id, tmovie_history, ttarget_movie_id,  tmovie_history_ratings, ttarget_movie_rating, tsex, tage_group, toccupation)

TypeError: forward() takes 2 positional arguments but 9 were given