In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import pickle

In [2]:
from collections import namedtuple

import pandas as pd
import numpy as np

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import pytorch_lightning as pl

import tqdm
import json
import sklearn.metrics as sm

import tensorboardX as tb
import datetime, os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Seed NumPy's global random generator; this call does not touch torch's RNG.
np.random.seed(31337)

In [3]:
# Build the path from components instead of a backslash-separated literal,
# so the same cell resolves the file on Windows, Linux and macOS.
train_joke_path = os.path.join('..', 'data', 'recsys-in-practice', 'train_joke_df.csv')
train_joke_df = pd.read_csv(train_joke_path)

In [4]:
# Cast both id columns to integers and shift them down by one.
# NOTE: this mutates train_joke_df in place, so re-running the cell shifts the ids again.
for id_column in ("UID", "JID"):
    train_joke_df[id_column] = train_joke_df[id_column].astype(int) - 1

In [5]:
# Display the ratings frame after the id cast/shift for a quick visual check.
train_joke_df

Unnamed: 0,UID,JID,Rating
0,18028,5,-1.26
1,3297,63,-4.17
2,3365,57,0.92
3,12734,91,3.69
4,11364,37,-6.60
...,...,...,...
1448359,22603,25,2.82
1448360,22254,35,-1.94
1448361,21055,39,-9.56
1448362,12327,96,0.87


In [6]:
# Hold out a very small validation slice (0.07% of the rows); the fixed
# random_state keeps the split identical between runs.
train_df, valid_df = train_test_split(
    train_joke_df,
    test_size=0.0007,
    random_state=42,
)

# Number of rows that ended up in the validation split
valid_df.shape[0]

1014

In [7]:
def sort_interactions(df):
    """Return a new frame with integer ids, sorted by (UID, JID), with a fresh index.

    Building a new frame (instead of assigning columns on the object returned
    by ``train_test_split``) avoids pandas' SettingWithCopyWarning and leaves
    the input frame untouched.

    Args:
        df: frame containing at least the ``UID`` and ``JID`` columns.

    Returns:
        A sorted copy of ``df`` whose index runs 0..len(df)-1.
    """
    return (
        df.astype({"UID": int, "JID": int})
        # sort and rewrite the index
        .sort_values(by=["UID", "JID"])
        .reset_index(drop=True)
    )


train_df = sort_interactions(train_df)
valid_df = sort_interactions(valid_df)

In [8]:
def RMSE_loss(prediction, target):
    """Root-mean-squared error between ``prediction`` and ``target``.

    The squared errors are averaged over all elements before the square root
    is taken, so the result is a scalar tensor.
    """
    mean_squared_error_value = nn.functional.mse_loss(prediction, target)
    return mean_squared_error_value.sqrt()

In [9]:
class ContextualRanker(pl.LightningModule):
    """Matrix-factorisation rating model: user/item embedding dot product plus biases.

    The raw score is passed through a sigmoid and rescaled into ``ratings_range``.

    Args:
        embedding_dim: width of each user and item embedding vector.
        n_users: number of users; the user tables are allocated with ``n_users + 1`` rows.
        n_items: number of items; the item tables are allocated with ``n_items + 1`` rows.
        ratings_range: two-element sequence ``(low, high)`` bounding the predictions.
    """

    def __init__(self, embedding_dim, n_users, n_items, ratings_range):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.y_range = ratings_range

        # No ``padding_idx`` here: this notebook shifts UID/JID to start at 0, so
        # row 0 belongs to a real user/joke. With ``padding_idx=0`` that row would be
        # fixed at zero and receive no gradient, i.e. user 0 and joke 0 would never learn.
        # The ``+ 1`` row is kept so table shapes stay the same as before.
        self.user_embedding = nn.Embedding(n_users + 1, embedding_dim)
        self.item_embedding = nn.Embedding(n_items + 1, embedding_dim)

        self.user_bias = nn.Embedding(n_users + 1, 1)
        self.item_bias = nn.Embedding(n_items + 1, 1)

    def forward(self, x):
        """Predict ratings for a batch.

        Args:
            x: integer tensor indexed as ``x[:, 0]`` (user ids) and ``x[:, 1]`` (item ids).

        Returns:
            One prediction per row, inside ``[y_range[0], y_range[1]]``.
        """
        users, items = x[:, 0], x[:, 1]
        interaction = (self.user_embedding(users) * self.item_embedding(items)).sum(1)
        # squeeze(-1) drops only the trailing bias dimension, so a batch of one row
        # keeps its batch axis.
        score = interaction + self.user_bias(users).squeeze(-1) + self.item_bias(items).squeeze(-1)
        low, high = self.y_range[0], self.y_range[1]
        return torch.sigmoid(score) * (high - low) + low

    def step(self, batch, batch_idx, metric, prog_bar=False):
        """Shared train/val/test logic: compute the batch RMSE and log it under ``metric``."""
        x, y = batch
        predictions = self.forward(x)
        # Targets are cast to float32 before the loss is computed.
        loss = RMSE_loss(predictions, y.float())
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def test_step(self, batch, batch_idx, prog_bar=False):
        """Log the batch RMSE as ``test_loss`` (``prog_bar`` is accepted but not used)."""
        return self.step(batch, batch_idx, "test_loss")

    def training_step(self, batch, batch_idx):
        """Log and return the batch RMSE as ``train_loss``."""
        return self.step(batch, batch_idx, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Log the batch RMSE as ``val_loss`` and show it in the progress bar."""
        return self.step(batch, batch_idx, "val_loss", True)

    def configure_optimizers(self):
        """Adam with weight decay, plus a plateau scheduler driven by ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=1e-5)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
        scheduler = {
            'scheduler': lr_scheduler,
            'reduce_on_plateau': True,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [10]:
class ContextualRankerData(pl.LightningDataModule):
    """Wraps train/val/test DataFrames as tensor datasets and data loaders.

    Args:
        train_data: frame with the ``features`` columns and a ``Rating`` column.
        val_data: frame with the ``features`` columns and a ``Rating`` column.
        test_data: frame with the ``features`` columns; a random placeholder
            target column ``rdm`` is attached to it.
        features: list of column names used as model input.
    """

    def __init__(self, train_data, val_data, test_data, features):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.features = features

    def _make_dataset(self, frame, target_column):
        """Build a TensorDataset of (feature matrix, 1-D target vector) from ``frame``."""
        return td.TensorDataset(
            torch.from_numpy(frame[self.features].values),
            torch.from_numpy(frame[target_column].values),
        )

    def prepare_data(self):
        """Attach a random placeholder target column ``rdm`` to the test frame."""
        self.test_data = self.test_data.assign(rdm=np.random.random(len(self.test_data)))

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` ("fit", "test", or None for both)."""
        if stage == "fit" or stage is None:
            self.train_dataset = self._make_dataset(self.train_data, "Rating")
            self.val_dataset = self._make_dataset(self.val_data, "Rating")

        if stage == "test" or stage is None:
            # prepare_data normally adds the placeholder column; add it here as well
            # so that calling setup("test") directly does not fail with a KeyError.
            if "rdm" not in self.test_data.columns:
                self.test_data = self.test_data.assign(rdm=np.random.random(len(self.test_data)))
            # The target is selected as a 1-D column (not ``[["rdm"]]``) so its shape
            # matches the train/val targets instead of being (N, 1), which would
            # broadcast against 1-D predictions inside the loss.
            self.test_dataset = self._make_dataset(self.test_data, "rdm")

    def train_dataloader(self):
        """Shuffled loader over the training dataset (batch size 2048)."""
        return td.DataLoader(self.train_dataset, batch_size=2048, shuffle=True, num_workers=0)

    def val_dataloader(self):
        """Loader over the validation dataset (batch size 2048)."""
        return td.DataLoader(self.val_dataset, batch_size=2048, num_workers=0)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset (batch size 512)."""
        return td.DataLoader(self.test_dataset, batch_size=512, shuffle=False, num_workers=0)

In [11]:

# Embedding width, passed as embedding_dim when the model is built
embed = 32

# Table sizes = number of distinct ids in the full ratings frame.
# NOTE(review): this matches the id space only if ids are contiguous starting at 0 — confirm.
n_users = train_joke_df['UID'].nunique()
n_items = train_joke_df['JID'].nunique()
n_users, n_items

(24983, 100)

In [12]:

# Seed Python, NumPy and torch RNGs before the model is created, so the embedding
# initialisation and the shuffled training batches are repeatable between runs.
pl.seed_everything(31337, workers=True)

net = ContextualRanker(embedding_dim=embed, n_users=n_users, n_items=n_items, ratings_range=[-10, 10])
# valid_df is passed both as the validation and as the test frame.
data_module = ContextualRankerData(train_df, valid_df, valid_df, features=["UID", "JID"])

# Checkpointing is driven by the logged "val_loss" metric.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")

trainer = pl.Trainer(
    max_epochs=1000,
    devices=1, accelerator="gpu",
    callbacks=[
        # Stop once val_loss has not improved for 5 validation checks.
        pl.callbacks.early_stopping.EarlyStopping(monitor="val_loss", patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        checkpoint_callback
    ])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Run training; the data module supplies the train and validation loaders.
trainer.fit(net, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 799 K 
1 | item_embedding | Embedding | 3.2 K 
2 | user_bias      | Embedding | 25.0 K
3 | item_bias      | Embedding | 101   
---------------------------------------------
827 K     Trainable params
0         Non-trainable params
827 K     Total params
3.311     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00025: reducing learning rate of group 0 to 1.0000e-04.


Validation: 0it [00:00, ?it/s]

In [14]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [15]:
# Open TensorBoard on the local "lightning_logs" directory, bound to localhost.
%tensorboard --logdir lightning_logs --host localhost

In [16]:
# Path of the best checkpoint recorded by the ModelCheckpoint callback (monitors val_loss).
checkpoint_callback.best_model_path

'E:\\competitions\\kaggle\\magnit_recsys-in-practice\\part2\\lightning_logs\\version_3\\checkpoints\\epoch=20-step=14847.ckpt'