In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel

In [2]:
# Directory holding the training files. Defined once so the notebook can be
# pointed at another location without editing every read below.
DATA_DIR = '/opt/ml/input/data/train'

# Interaction log: one row per (user, item, time).
main_df = pd.read_csv(f'{DATA_DIR}/train_ratings.csv') # user-item-time

# Item side-information tables (tab-separated), each keyed by item.
title_df = pd.read_csv(f'{DATA_DIR}/titles.tsv', sep='\t') # item-title
year_df = pd.read_csv(f'{DATA_DIR}/years.tsv', sep='\t') # item-year
director_df = pd.read_csv(f'{DATA_DIR}/directors.tsv', sep='\t') # item-director
genre_name_df = pd.read_csv(f'{DATA_DIR}/genres.tsv', sep='\t') # item-genre(name)
writer_df = pd.read_csv(f'{DATA_DIR}/writers.tsv', sep='\t') # item-writer

In [3]:
# Build genres_df: the item-genre table with each genre name replaced by an integer code.
genres_df = pd.read_csv("/opt/ml/input/data/train/genres.tsv", sep="\t")
# factorize() returns the per-row integer codes (`array`) and the distinct
# genre names (`index`); with the default sort=False the codes follow the
# order in which each name first appears in the column.
array, index = genres_df["genre"].factorize()
genres_df["genre"] = array # item-genre(number)

# Genre Embedding

한 영화에 여러개의 장르가 있다.  
장르는 각각의 성질을 가지고 있다고 가정하고 (ex: "액션"과 "범죄"의 거리는 "액션"과 "드라마"의 거리보다 가깝다)  
각 단어들을 임베딩하여 숫자로 표현한다.  
이때 각 단어는 크기가 768인 벡터(bert-base-uncased의 hidden size)로 임베딩 되는데 이 768개 성분의 평균을 <U>**장르의 임베딩값**</U>으로 설정했다.

여러 장르를 동시에 갖는 영화는 해당 장르들의 임베딩 값의 평균으로 설정하였다

In [4]:
# Load the pretrained BERT tokenizer and encoder used to embed the genre names.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

genre_vectors = {} # genre code -> scalar embedding value for that genre

# The keys are positions in genre_name_df['genre'].unique(). Later cells look
# these up with the integer codes stored in genres_df['genre'], so both
# numberings must follow the same first-appearance order.
# NOTE(review): a missing genre value would appear in unique() but be coded
# as -1 by factorize, shifting the keys -- confirm the column has no NaN.
with torch.no_grad():  # inference only: skip building the autograd graph
    for genre_code, genre_name in enumerate(genre_name_df['genre'].unique()):
        # Tokenize the genre name and add a batch dimension.
        input_ids = torch.tensor(tokenizer.encode(f"{genre_name}")).unsqueeze(0)
        # First element of the model output: hidden states of the last layer.
        last_hidden_state = model(input_ids)[0]
        # Embedding at the first token position of the single sequence in the batch
        # (presumably the [CLS] token added by tokenizer.encode -- confirm).
        first_token_embedding = last_hidden_state[0][0]
        # Collapse the embedding vector to one number by averaging its components.
        genre_vectors[genre_code] = first_token_embedding.numpy().mean()


ref_genre_dict = {} # item id -> list of the genre codes attached to that item
for item_id, genre_code in zip(genres_df['item'], genres_df['genre']):
    # setdefault creates the list the first time an item is seen.
    ref_genre_dict.setdefault(item_id, []).append(genre_code)


# item id -> combined genre embedding: the arithmetic mean of the scalar
# embedding values of every genre code listed for that item.
genre_df_dict = {
    item_id: sum(genre_vectors[code] for code in codes) / len(codes)
    for item_id, codes in ref_genre_dict.items()
}

# Apply to the ratings frame: attach each item's combined genre embedding as a
# new 'genre_embedding' column on a copy, leaving main_df untouched.
# The previous version mapped into a temporary 'item_2' column and then called
# ref_df.rename(...) without keeping the result, so the column was never
# actually renamed; writing the target column directly avoids that.
ref_df = main_df.copy()
# Items that are absent from genre_df_dict get NaN from Series.map.
ref_df['genre_embedding'] = ref_df['item'].map(genre_df_dict)
ref_df

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,user,item,time,item_2
0,11,4643,1230782529,-0.009958
1,11,170,1230782534,-0.010141
2,11,531,1230782539,-0.010972
3,11,616,1230782542,-0.010493
4,11,2140,1230782563,-0.010183
...,...,...,...,...
5154466,138493,44022,1260209449,-0.010516
5154467,138493,4958,1260209482,-0.010298
5154468,138493,68319,1260209720,-0.009911
5154469,138493,40819,1260209726,-0.010697
