# 기본 설정

구글 드라이브 마운트

패키지 설치

In [None]:
# Mount Google Drive at /gdrive (Colab only). force_remount=True is passed so
# that re-running this cell requests a fresh mount.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
!pip install transformers
!pip install datasets
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import os
import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from datasets import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from typing import List, Optional, Tuple, Union
import re

In [None]:
cd /gdrive/MyDrive/Lectures/2023/RecSys/content-based

/gdrive/MyDrive/Lectures/2022/IntelligentMarketing/content-based


In [None]:
#@title 마이너 패키지 로딩
%matplotlib inline
from datetime import datetime
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')

# 데이터 로드

raw/ml-100k
- u1.base: 학습 (80K)
- u1.test: 검증 (20K)

In [None]:
#@title get_user_item_map
def get_user_item_map(X):
  """Build lookup tables from raw user/item ids to contiguous integer indices.

  Parameters
  ----------
  X : pandas.DataFrame, shape=(n_ratings, >=2)
      The first two columns must be, in order, the user id and the item id.
      Any further columns (rating, timestamp, ...) are ignored.

  Returns
  -------
  user_map : pandas Series, shape=(n_users,)
      Indexed by the sorted distinct user ids; values are integers in
      [0, n_users).
  item_map : pandas Series, shape=(n_items,)
      Indexed by the sorted distinct item ids; values are integers in
      [0, n_items).
  """
  user_col, item_col = X.columns[:2]

  def _index_map(column, name):
    # np.unique returns the sorted distinct ids. The positions are sized from
    # that same array so index and data always have equal length
    # (Series.nunique() skips NaN while np.unique keeps it).
    ids = np.unique(X[column])
    return pd.Series(index=ids, data=np.arange(len(ids)), name=name)

  # 'columns_map' is the name the item Series has always carried; kept unchanged.
  return _index_map(user_col, 'user_map'), _index_map(item_col, 'columns_map')

In [None]:
# Plot text per movie. index_col=0 turns the first CSV column into the index;
# the 'plot' column is later looked up by movieId, so that first column is
# presumably the MovieLens movie id — TODO confirm against the CSV.
item_plot_df = pd.read_csv('movie_plots_80_missings.csv', index_col=0)

In [None]:
def load_data(file_path):
  """Read a tab-separated ratings file and return it ordered by rating time.

  Parameters
  ----------
  file_path : str or path-like
      Headerless file with four tab-separated columns: user id, movie id,
      rating and a Unix timestamp in seconds.

  Returns
  -------
  pandas.DataFrame
      Columns 'userId', 'movieId', 'rating', 'timestamp'; 'timestamp' is
      converted to datetimes and the rows are sorted by it in ascending order.
  """
  ratings_df = pd.read_csv(file_path, sep='\t', header=None,
                           names=['userId', 'movieId', 'rating', 'timestamp'])
  # Vectorised epoch-seconds -> datetime conversion. Unlike a per-row
  # datetime.fromtimestamp, the result does not depend on the timezone of the
  # machine running the notebook, so the sort order is the same everywhere.
  ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
  return ratings_df.sort_values('timestamp')

In [None]:
# Load the ml-100k u1 split: u1.base is used for training and u1.test for
# validation. Paths are relative to the current working directory.
train_df = load_data('../raw/ml-100k/u1.base')
val_df = load_data('../raw/ml-100k/u1.test')

In [None]:
# Build the id -> index maps from train and validation rows together, so every
# id that appears in either split has an index.
user_map, item_map = get_user_item_map(pd.concat((train_df, val_df), axis=0))

In [None]:
# Add a 'plots' column to both splits by looking each movieId up in
# item_plot_df['plot'].
plot_by_movie = item_plot_df['plot']
for ratings in (train_df, val_df):
  ratings['plots'] = ratings['movieId'].map(plot_by_movie)

# 모델링

아이템 모델링
- 플롯(plot) 활용: Distilbert (https://huggingface.co/docs/transformers/model_doc/distilbert#distilbert)

유저 모델링
- 단순 임베딩

결합
- 연결: concatenation

예측
- Linear regression

아이템 플롯 데이터
- movie_plots_80_missings.csv

참고
- https://ratsgo.github.io/nlpbook/docs/lm
- https://huggingface.co/docs/transformers/index


## 토크나이저

In [None]:
# Tokenizer for the "distilbert-base-uncased" checkpoint — the same checkpoint
# name the model class loads its text encoder from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Display whether the loaded tokenizer reports itself as a "fast" tokenizer.
tokenizer.is_fast

True

In [None]:
def preprocess_function(examples):
    """Tokenize the "plots" field of `examples` with the notebook-level tokenizer.

    The tokenizer is called with truncation=True and max_length=256, and its
    result is returned unchanged.
    """
    plot_texts = examples["plots"]
    encoding = tokenizer(plot_texts, max_length=256, truncation=True)
    return encoding

In [None]:
def make_dataset(df):
  """Turn a ratings frame into a tokenized, torch-formatted `Dataset`.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns 'userId', 'movieId', 'plots' and 'rating'.

  Returns
  -------
  Dataset
      Built from the columns 'user' (index from `user_map`), 'item' (index
      from `item_map`), 'plots' and 'y' (rating as float32), then passed
      through `preprocess_function` in batches. Its torch format exposes
      'user', 'item', 'input_ids', 'attention_mask' and 'y'.
  """
  features = pd.concat(
      [
          df.userId.map(user_map).astype(np.int64).rename('user'),
          df.movieId.map(item_map).astype(np.int64).rename('item'),
          # A movie with no plot shows up as NaN here and would reach the
          # tokenizer as None; substitute an empty string so such rows are
          # tokenized instead of failing the whole batch.
          df.plots.fillna('').astype('string'),
          df.rating.astype(np.float32).rename('y'),
      ],
      axis=1,
  )
  dataset = Dataset.from_pandas(features)
  dataset = dataset.map(preprocess_function, batched=True)
  # The raw 'plots' text is left out of the torch-formatted columns.
  dataset.set_format(type='torch', columns=['user', 'item', 'input_ids', 'attention_mask', 'y'])
  return dataset

In [None]:
# Tokenize both splits once up front; the dataloaders below read from these.
train_dataset = make_dataset(train_df)
val_dataset = make_dataset(val_df)

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [None]:
# Batches are assembled by DataCollatorWithPadding built on the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, collate_fn=data_collator)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_kwargs)

## 모델 서브클래싱

In [None]:
from typing import List, Optional, Tuple, Union

In [None]:
class ContentBasedModel(pl.LightningModule):
  """Rating regressor that joins a text encoding of the plot with a user embedding.

  The plot tokens go through a pretrained "distilbert-base-uncased" encoder,
  the user index through an embedding table; both vectors are concatenated and
  fed to a small MLP that outputs one scalar per example.

  Parameters
  ----------
  user_num : int
      Number of rows in the user embedding table.
  factor_num : int
      Width of each user embedding vector.
  dropout : float
      Dropout probability applied to the concatenated features.
  """

  def __init__(self, user_num, factor_num, dropout):
    super().__init__()
    self.user_num = user_num
    self.factor_num = factor_num
    self.dropout = dropout
    self.save_hyperparameters()

    self.embed_user = nn.Embedding(self.user_num, self.factor_num)
    self.bert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    # Alternative encoder left by the author:
    # self.bert_model = AutoModel.from_pretrained("beomi/kcbert-base")

    # Input width = encoder hidden size + user embedding width.
    self.predict_layer = torch.nn.Sequential(
        torch.nn.Dropout(p=self.dropout),
        torch.nn.Linear(self.bert_model.config.hidden_size + self.factor_num, 32),
        torch.nn.ReLU(),
        torch.nn.Linear(32, 1),
    )

  def configure_optimizers(self):
    """Return AdamW (lr=5e-5) together with an ExponentialLR(gamma=0.9) schedule."""
    optimizer = torch.optim.AdamW(self.parameters(), lr=5e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    # Lightning only picks the scheduler up under the 'lr_scheduler' key; under
    # any other key it is not stepped and the learning rate stays constant.
    return {
      'optimizer': optimizer,
      'lr_scheduler': scheduler,
    }

  def forward(self,
        user: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        item: Optional[torch.LongTensor] = None,
        y: Optional[torch.FloatTensor] = None
        ):
    """Predict one rating per example.

    `item` and `y` are accepted but not used, so a whole batch dict can be
    passed with `**batch`.

    Returns
    -------
    torch.Tensor
        Predictions flattened to one dimension.
    """
    embed_user_output = self.embed_user(user)
    bert_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
    # First element of the encoder output; presumably the last hidden states
    # shaped (batch, seq_len, hidden) — confirm against the transformers docs.
    hidden_state = bert_output[0]
    # Use the vector at sequence position 0 as the summary of the plot text.
    pooled_output = hidden_state[:, 0]
    concat = torch.cat((pooled_output, embed_user_output), -1)
    prediction = self.predict_layer(concat)
    return prediction.view(-1)

  def training_step(self, batch, batch_idx):
    """Compute the MSE between predictions and batch['y'], log it per step, return it."""
    prediction = self(**batch)
    mse_loss = F.mse_loss(prediction, batch['y'])
    self.log("mse_loss", mse_loss, prog_bar=False, logger=True, on_step=True, on_epoch=False)
    return mse_loss

  def validation_step(self, batch, batch_idx):
    """Log validation MSE (per step) and MAE (per epoch); return both as a tuple.

    'val_mae_loss' is the metric name the checkpoint callback monitors.
    """
    prediction = self(**batch)
    mse_loss = F.mse_loss(prediction, batch['y'])
    mae_loss = F.l1_loss(prediction, batch['y'])
    self.log("val_mse_loss", mse_loss, prog_bar=False, logger=True, on_step=True, on_epoch=False)
    self.log("val_mae_loss", mae_loss, prog_bar=True, logger=True, on_step=False, on_epoch=True)
    return mse_loss, mae_loss

In [None]:
# One embedding row per entry in user_map; 32-dimensional user vectors and
# dropout probability 0.2.
model = ContentBasedModel(user_num=len(user_map), factor_num=32, dropout=0.2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 트레이너 설정

In [None]:
# Checkpoints and Lightning's default root both live under ./learning_results.
ckpt_path = os.path.abspath('learning_results')

# Keep the two checkpoints with the lowest 'val_mae_loss'.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_mae_loss',
    mode='min',
    save_top_k=2,
    dirpath=ckpt_path,
    filename='{epoch}-{val_mae_loss:.4f}',
)

# Single-GPU run for at most 10 epochs.
trainer = pl.Trainer(
    default_root_dir=ckpt_path,
    accelerator="gpu",
    devices=1,
    max_epochs=10,
    callbacks=[checkpoint_callback],
    # fast_dev_run=7,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# !rm -rf learning_results/lightning_logs

### 텐서보드 실행

In [None]:
# Start TensorBoard on the Lightning log directory before training begins.
# NOTE(review): --host 0.0.0.0 binds to all interfaces — confirm that is
# intended outside Colab.
%reload_ext tensorboard
%tensorboard --logdir=learning_results/lightning_logs --host 0.0.0.0 --port=6006

### 학습

In [None]:
# Train the model with the training loader and validate with the validation loader.
trainer.fit(model=model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type            | Params
--------------------------------------------------
0 | embed_user    | Embedding       | 30.2 K
1 | bert_model    | DistilBertModel | 66.4 M
2 | predict_layer | Sequential      | 25.7 K
--------------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.675   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Show the GPU model and current memory usage of the runtime.
!nvidia-smi

Mon May 30 01:56:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    28W /  70W |   1444MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces