# import

In [32]:
import os
import re
import nltk
from nltk import tokenize
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
from transformers import BertModel, BertTokenizer

import os
import numpy as np
import tqdm
import torch
import torch.nn as nn

import time
import argparse

In [33]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()
        self.eps = 1e-6

    def forward(self, x, y):
        criterion = nn.MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss


class FeaturesEmbedding(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


class FactorizationMachine_v(nn.Module):

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.v = nn.Parameter(torch.rand(input_dim, latent_dim), requires_grad = True)
        self.linear = nn.Linear(input_dim, 1, bias=True)

    def forward(self, x):
        linear = self.linear(x)
        square_of_sum = torch.mm(x, self.v) ** 2
        sum_of_square = torch.mm(x ** 2, self.v ** 2)
        pair_interactions = torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        output = linear + (0.5 * pair_interactions)
        return output

# data preprocessing

In [34]:
parser = argparse.ArgumentParser(description='parser')
arg = parser.add_argument
arg('--DATA_PATH', type=str, default='/opt/ml/input/code/data/', help='Data path를 설정할 수 있습니다.')
args = parser.parse_args('')

users = pd.read_csv(args.DATA_PATH + 'users.csv')
books = pd.read_csv(args.DATA_PATH + 'books.csv')
train = pd.read_csv(args.DATA_PATH + 'train_ratings.csv')
test = pd.read_csv(args.DATA_PATH + 'test_ratings.csv')
sub = pd.read_csv(args.DATA_PATH + 'sample_submission.csv')

In [35]:
train.head()

Unnamed: 0,user_id,isbn,rating
0,8,2005018,4
1,67544,2005018,7
2,123629,2005018,8
3,200273,2005018,8
4,210926,2005018,9


In [36]:
sub.head()

Unnamed: 0,user_id,isbn,rating
0,11676,2005018,0
1,116866,2005018,0
2,152827,60973129,0
3,157969,374157065,0
4,67958,399135782,0


In [37]:
users.head(10)

Unnamed: 0,user_id,location,age
0,8,"timmins, ontario, canada",
1,11400,"ottawa, ontario, canada",49.0
2,11676,"n/a, n/a, n/a",
3,67544,"toronto, ontario, canada",30.0
4,85526,"victoria, british columbia, canada",36.0
5,96054,"ottawa, ontario, canada",29.0
6,116866,"ottawa, ,",
7,123629,"kingston, ontario, canada",
8,177458,"ottawa, ontario, canada",29.0
9,200273,"comber, ontario, canada",


In [38]:
ids = pd.concat([train['user_id'], sub['user_id']]).unique()
isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

idx2user = {idx:id for idx, id in enumerate(ids)}
idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}

user2idx = {id:idx for idx, id in idx2user.items()}
isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

train['user_id'] = train['user_id'].map(user2idx)
sub['user_id'] = sub['user_id'].map(user2idx)

train['isbn'] = train['isbn'].map(isbn2idx)
sub['isbn'] = sub['isbn'].map(isbn2idx)

In [49]:
train['rating'].max()
train['rating'].min()

1

In [39]:
train.head(10) # 왜 바꿨을까

Unnamed: 0,user_id,isbn,rating
0,0,0,4
1,1,0,7
2,2,0,8
3,3,0,8
4,4,0,9
5,5,0,7
6,6,0,5
7,7,1,8
8,8,2,6
9,9,2,10


In [40]:
books_ = books.copy()
books_['isbn'] = books_['isbn'].map(isbn2idx) # books df 를 복사하고, isbns을 id로 바꿈

In [46]:
import copy
df = copy.deepcopy(train)
Train = False

if Train == True:
    df_ = df.copy()
else:
    df_ = df.copy()
    df_['user_id'] = df_['user_id'].map(user2idx)
    df_['isbn'] = df_['isbn'].map(isbn2idx)

In [47]:
df_

Unnamed: 0,user_id,isbn,rating
0,,,4
1,,,7
2,,,8
3,,,8
4,,,9
...,...,...,...
306790,62897.0,,7
306791,,,6
306792,,,7
306793,,,7


In [14]:
if train == True:
    df_ = df.copy()
else:
    df_ = df.copy()
    df_['user_id'] = df_['user_id'].map(user2idx)
    df_['isbn'] = df_['isbn'].map(isbn2idx)

df_ = pd.merge(df_, books_[['isbn', 'summary']], on='isbn', how='left')
df_['summary'].fillna('None', inplace=True)
df_['summary'] = df_['summary'].apply(lambda x:text_preprocessing(x))
df_['summary'].replace({'':'None', ' ':'None'}, inplace=True)
df_['summary_length'] = df_['summary'].apply(lambda x:len(x))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

if user_summary_merge_vector and item_summary_vector:
    print('Create User Summary Merge Vector')
    user_summary_merge_vector_list = []
    for user in tqdm(df_['user_id'].unique()):
        vector = text_to_vector(summary_merge(df_, user, 5), tokenizer, model, device)
        user_summary_merge_vector_list.append(vector)
    user_review_text_df = pd.DataFrame(df_['user_id'].unique(), columns=['user_id'])
    user_review_text_df['user_summary_merge_vector'] = user_summary_merge_vector_list
    vector = np.concatenate([
                            user_review_text_df['user_id'].values.reshape(1, -1),
                            user_review_text_df['user_summary_merge_vector'].values.reshape(1, -1)
                            ])
    if not os.path.exists('/opt/ml/input/code/data/text_vector'):
        os.makedirs('/opt/ml/input/code/data/text_vector')
    if train == True:
        np.save('/opt/ml/input/code/data/text_vector/train_user_summary_merge_vector.npy', vector)
    else:
        np.save('/opt/ml/input/code/data/text_vector/test_user_summary_merge_vector.npy', vector)

    print('Create Item Summary Vector')
    item_summary_vector_list = []
    books_text_df = df_[['isbn', 'summary']].copy()
    books_text_df= books_text_df.drop_duplicates().reset_index(drop=True)
    books_text_df['summary'].fillna('None', inplace=True)
    for summary in tqdm(books_text_df['summary']):
        vector = text_to_vector(summary, tokenizer, model, device)
        item_summary_vector_list.append(vector)
    books_text_df['item_summary_vector'] = item_summary_vector_list
    vector = np.concatenate([
                            books_text_df['isbn'].values.reshape(1, -1),
                            books_text_df['item_summary_vector'].values.reshape(1, -1)
                            ])
    if not os.path.exists('./data/text_vector'):
        os.makedirs('./data/text_vector')
    if train == True:
        np.save('./data/text_vector/train_item_summary_vector.npy', vector)
    else:
        np.save('./data/text_vector/test_item_summary_vector.npy', vector)
else:
    print('Check Vectorizer')
    print('Vector Load')
    if train == True:
        user = np.load('/opt/ml/input/code/data/text_vector/train_user_summary_merge_vector.npy', allow_pickle=True)
    else:
        user = np.load('/opt/ml/input/code/data/text_vector/test_user_summary_merge_vector.npy', allow_pickle=True)
    user_review_text_df = pd.DataFrame([user[0], user[1]]).T
    user_review_text_df.columns = ['user_id', 'user_summary_merge_vector']
    user_review_text_df['user_id'] = user_review_text_df['user_id'].astype('int')

    if train == True:
        item = np.load('data/text_vector/train_item_summary_vector.npy', allow_pickle=True)
    else:
        item = np.load('data/text_vector/test_item_summary_vector.npy', allow_pickle=True)
    books_text_df = pd.DataFrame([item[0], item[1]]).T
    books_text_df.columns = ['isbn', 'item_summary_vector']
    books_text_df['isbn'] = books_text_df['isbn'].astype('int')


df_ = pd.merge(df_, user_review_text_df, on='user_id', how='left')
df_ = pd.merge(df_, books_text_df[['isbn', 'item_summary_vector']], on='isbn', how='left')

NameError: name 'train' is not defined

In [None]:
text_train = process_text_data(
    train,
    books,
    user2idx,
    isbn2idx,
    args.DEVICE,
    train=True,
    user_summary_merge_vector=args.DEEPCONN_VECTOR_CREATE,
    item_summary_vector=args.DEEPCONN_VECTOR_CREATE
)

text_test = process_text_data(
    test,
    books,
    user2idx,
    isbn2idx,
    args.DEVICE,
    train=False, 
    user_summary_merge_vector=args.DEEPCONN_VECTOR_CREATE,
    item_summary_vector=args.DEEPCONN_VECTOR_CREATE
)

In [15]:
import argparse
######################## BASIC ENVIRONMENT SETUP
parser = argparse.ArgumentParser(description='parser')
arg = parser.add_argument

############### BASIC OPTION
arg('--DATA_PATH', type=str, default='data/', help='Data path를 설정할 수 있습니다.')
arg('--MODEL', type=str, choices=['FM', 'FFM', 'NCF', 'WDN', 'DCN', 'CNN_FM', 'DeepCoNN'],
                            help='학습 및 예측할 모델을 선택할 수 있습니다.')
arg('--DATA_SHUFFLE', type=bool, default=True, help='데이터 셔플 여부를 조정할 수 있습니다.')
arg('--TEST_SIZE', type=float, default=0.2, help='Train/Valid split 비율을 조정할 수 있습니다.')
arg('--SEED', type=int, default=42, help='seed 값을 조정할 수 있습니다.')

############### TRAINING OPTION
arg('--BATCH_SIZE', type=int, default=1024, help='Batch size를 조정할 수 있습니다.')
arg('--EPOCHS', type=int, default=10, help='Epoch 수를 조정할 수 있습니다.')
arg('--LR', type=float, default=1e-3, help='Learning Rate를 조정할 수 있습니다.')
arg('--WEIGHT_DECAY', type=float, default=1e-6, help='Adam optimizer에서 정규화에 사용하는 값을 조정할 수 있습니다.')

############### GPU
arg('--DEVICE', type=str, default='cuda', choices=['cuda', 'cpu'], help='학습에 사용할 Device를 조정할 수 있습니다.')

############### FM
arg('--FM_EMBED_DIM', type=int, default=16, help='FM에서 embedding시킬 차원을 조정할 수 있습니다.')

############### FFM
arg('--FFM_EMBED_DIM', type=int, default=16, help='FFM에서 embedding시킬 차원을 조정할 수 있습니다.')

############### NCF
arg('--NCF_EMBED_DIM', type=int, default=16, help='NCF에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--NCF_MLP_DIMS', type=list, default=(16, 16), help='NCF에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--NCF_DROPOUT', type=float, default=0.2, help='NCF에서 Dropout rate를 조정할 수 있습니다.')

############### WDN
arg('--WDN_EMBED_DIM', type=int, default=16, help='WDN에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--WDN_MLP_DIMS', type=list, default=(16, 16), help='WDN에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--WDN_DROPOUT', type=float, default=0.2, help='WDN에서 Dropout rate를 조정할 수 있습니다.')

############### DCN
arg('--DCN_EMBED_DIM', type=int, default=16, help='DCN에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--DCN_MLP_DIMS', type=list, default=(16, 16), help='DCN에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--DCN_DROPOUT', type=float, default=0.2, help='DCN에서 Dropout rate를 조정할 수 있습니다.')
arg('--DCN_NUM_LAYERS', type=int, default=3, help='DCN에서 Cross Network의 레이어 수를 조정할 수 있습니다.')

############### CNN_FM
arg('--CNN_FM_EMBED_DIM', type=int, default=128, help='CNN_FM에서 user와 item에 대한 embedding시킬 차원을 조정할 수 있습니다.')
arg('--CNN_FM_LATENT_DIM', type=int, default=8, help='CNN_FM에서 user/item/image에 대한 latent 차원을 조정할 수 있습니다.')

############### DeepCoNN
arg('--DEEPCONN_VECTOR_CREATE', type=bool, default=False, help='DEEP_CONN에서 text vector 생성 여부를 조정할 수 있으며 최초 학습에만 True로 설정하여야합니다.')
arg('--DEEPCONN_EMBED_DIM', type=int, default=32, help='DEEP_CONN에서 user와 item에 대한 embedding시킬 차원을 조정할 수 있습니다.')
arg('--DEEPCONN_LATENT_DIM', type=int, default=10, help='DEEP_CONN에서 user/item/image에 대한 latent 차원을 조정할 수 있습니다.')
arg('--DEEPCONN_CONV_1D_OUT_DIM', type=int, default=50, help='DEEP_CONN에서 1D conv의 출력 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_KERNEL_SIZE', type=int, default=3, help='DEEP_CONN에서 1D conv의 kernel 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_WORD_DIM', type=int, default=768, help='DEEP_CONN에서 1D conv의 입력 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_OUT_DIM', type=int, default=32, help='DEEP_CONN에서 1D conv의 출력 크기를 조정할 수 있습니다.')

args = parser.parse_args('')


print(args)

Namespace(DATA_PATH='data/', MODEL=None, DATA_SHUFFLE=True, TEST_SIZE=0.2, SEED=42, BATCH_SIZE=1024, EPOCHS=10, LR=0.001, WEIGHT_DECAY=1e-06, DEVICE='cuda', FM_EMBED_DIM=16, FFM_EMBED_DIM=16, NCF_EMBED_DIM=16, NCF_MLP_DIMS=(16, 16), NCF_DROPOUT=0.2, WDN_EMBED_DIM=16, WDN_MLP_DIMS=(16, 16), WDN_DROPOUT=0.2, DCN_EMBED_DIM=16, DCN_MLP_DIMS=(16, 16), DCN_DROPOUT=0.2, DCN_NUM_LAYERS=3, CNN_FM_EMBED_DIM=128, CNN_FM_LATENT_DIM=8, DEEPCONN_VECTOR_CREATE=False, DEEPCONN_EMBED_DIM=32, DEEPCONN_LATENT_DIM=10, DEEPCONN_CONV_1D_OUT_DIM=50, DEEPCONN_KERNEL_SIZE=3, DEEPCONN_WORD_DIM=768, DEEPCONN_OUT_DIM=32)


In [16]:
import os
import re
import nltk
from nltk import tokenize
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
from transformers import BertModel, BertTokenizer


def text_preprocessing(summary):
    summary = re.sub("[.,\'\"''""!?]", "", summary)
    summary = re.sub("[^0-9a-zA-Z\\s]", " ", summary)
    summary = re.sub("\s+", " ", summary)
    summary = summary.lower()
    return summary


def summary_merge(df, user_id, max_summary):
    return " ".join(df[df['user_id'] == user_id].sort_values(by='summary_length', ascending=False)['summary'].values[:max_summary])
# rating을 고려하는 방식으로

def text_to_vector(text, tokenizer, model, device):
    for sent in tokenize.sent_tokenize(text):
        text_ = "[CLS] " + sent + " [SEP]"
        tokenized = tokenizer.tokenize(text_)
        indexed = tokenizer.convert_tokens_to_ids(tokenized)
        segments_idx = [1] * len(tokenized)
        token_tensor = torch.tensor([indexed])
        sgments_tensor = torch.tensor([segments_idx])
        with torch.no_grad():
            outputs = model(token_tensor.to(device), sgments_tensor.to(device))
            encode_layers = outputs[0]
            sentence_embedding = torch.mean(encode_layers[0], dim=0)
    return sentence_embedding.cpu().detach().numpy()


def process_text_data(df, books, user2idx, isbn2idx, device, train=False, user_summary_merge_vector=False, item_summary_vector=False):
    books_ = books.copy()
    books_['isbn'] = books_['isbn'].map(isbn2idx)

    if train == True:
        df_ = df.copy()
    else:
        df_ = df.copy()
        df_['user_id'] = df_['user_id'].map(user2idx)
        df_['isbn'] = df_['isbn'].map(isbn2idx)

    df_ = pd.merge(df_, books_[['isbn', 'summary']], on='isbn', how='left')
    df_['summary'].fillna('None', inplace=True)
    df_['summary'] = df_['summary'].apply(lambda x:text_preprocessing(x))
    df_['summary'].replace({'':'None', ' ':'None'}, inplace=True)
    df_['summary_length'] = df_['summary'].apply(lambda x:len(x))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)

    if user_summary_merge_vector and item_summary_vector:
        print('Create User Summary Merge Vector')
        user_summary_merge_vector_list = []
        for user in tqdm(df_['user_id'].unique()):
            vector = text_to_vector(summary_merge(df_, user, 5), tokenizer, model, device)
            user_summary_merge_vector_list.append(vector)
        user_review_text_df = pd.DataFrame(df_['user_id'].unique(), columns=['user_id'])
        user_review_text_df['user_summary_merge_vector'] = user_summary_merge_vector_list
        vector = np.concatenate([
                                user_review_text_df['user_id'].values.reshape(1, -1),
                                user_review_text_df['user_summary_merge_vector'].values.reshape(1, -1)
                                ])
        if not os.path.exists('./data/text_vector'):
            os.makedirs('./data/text_vector')
        if train == True:
            np.save('./data/text_vector/train_user_summary_merge_vector.npy', vector)
        else:
            np.save('./data/text_vector/test_user_summary_merge_vector.npy', vector)

        print('Create Item Summary Vector')
        item_summary_vector_list = []
        books_text_df = df_[['isbn', 'summary']].copy()
        books_text_df= books_text_df.drop_duplicates().reset_index(drop=True)
        books_text_df['summary'].fillna('None', inplace=True)
        for summary in tqdm(books_text_df['summary']):
            vector = text_to_vector(summary, tokenizer, model, device)
            item_summary_vector_list.append(vector)
        books_text_df['item_summary_vector'] = item_summary_vector_list
        vector = np.concatenate([
                                books_text_df['isbn'].values.reshape(1, -1),
                                books_text_df['item_summary_vector'].values.reshape(1, -1)
                                ])
        if not os.path.exists('./data/text_vector'):
            os.makedirs('./data/text_vector')
        if train == True:
            np.save('./data/text_vector/train_item_summary_vector.npy', vector)
        else:
            np.save('./data/text_vector/test_item_summary_vector.npy', vector)
    else:
        print('Check Vectorizer')
        print('Vector Load')
        if train == True:
            user = np.load('data/text_vector/train_user_summary_merge_vector.npy', allow_pickle=True)
        else:
            user = np.load('data/text_vector/test_user_summary_merge_vector.npy', allow_pickle=True)
        user_review_text_df = pd.DataFrame([user[0], user[1]]).T
        user_review_text_df.columns = ['user_id', 'user_summary_merge_vector']
        user_review_text_df['user_id'] = user_review_text_df['user_id'].astype('int')

        if train == True:
            item = np.load('data/text_vector/train_item_summary_vector.npy', allow_pickle=True)
        else:
            item = np.load('data/text_vector/test_item_summary_vector.npy', allow_pickle=True)
        books_text_df = pd.DataFrame([item[0], item[1]]).T
        books_text_df.columns = ['isbn', 'item_summary_vector']
        books_text_df['isbn'] = books_text_df['isbn'].astype('int')


    df_ = pd.merge(df_, user_review_text_df, on='user_id', how='left')
    df_ = pd.merge(df_, books_text_df[['isbn', 'item_summary_vector']], on='isbn', how='left')

    return df_


class Text_Dataset(Dataset):
    def __init__(self, user_isbn_vector, user_summary_merge_vector, item_summary_vector, label):
        self.user_isbn_vector = user_isbn_vector
        self.user_summary_merge_vector = user_summary_merge_vector
        self.item_summary_vector = item_summary_vector
        self.label = label

    def __len__(self):
        return self.user_isbn_vector.shape[0]

    def __getitem__(self, i):
        return {
                'user_isbn_vector' : torch.tensor(self.user_isbn_vector[i], dtype=torch.long),
                'user_summary_merge_vector' : torch.tensor(self.user_summary_merge_vector[i].reshape(-1, 1), dtype=torch.float32),
                'item_summary_vector' : torch.tensor(self.item_summary_vector[i].reshape(-1, 1), dtype=torch.float32),
                'label' : torch.tensor(self.label[i], dtype=torch.float32),
                }


def text_data_load(args):

    users = pd.read_csv(args.DATA_PATH + 'users.csv')
    books = pd.read_csv(args.DATA_PATH + 'books.csv')
    train = pd.read_csv(args.DATA_PATH + 'train_ratings.csv')
    test = pd.read_csv(args.DATA_PATH + 'test_ratings.csv')
    sub = pd.read_csv(args.DATA_PATH + 'sample_submission.csv')

    ids = pd.concat([train['user_id'], sub['user_id']]).unique()
    isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

    idx2user = {idx:id for idx, id in enumerate(ids)}
    idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}

    user2idx = {id:idx for idx, id in idx2user.items()}
    isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

    train['user_id'] = train['user_id'].map(user2idx)
    sub['user_id'] = sub['user_id'].map(user2idx)

    train['isbn'] = train['isbn'].map(isbn2idx)
    sub['isbn'] = sub['isbn'].map(isbn2idx)

    text_train = process_text_data(train, books, user2idx, isbn2idx, args.DEVICE, train=True, user_summary_merge_vector=args.DEEPCONN_VECTOR_CREATE, item_summary_vector=args.DEEPCONN_VECTOR_CREATE)
    text_test = process_text_data(test, books, user2idx, isbn2idx, args.DEVICE, train=False, user_summary_merge_vector=args.DEEPCONN_VECTOR_CREATE, item_summary_vector=args.DEEPCONN_VECTOR_CREATE)

    data = {
            'train':train,
            'test':test,
            'users':users,
            'books':books,
            'sub':sub,
            'idx2user':idx2user,
            'idx2isbn':idx2isbn,
            'user2idx':user2idx,
            'isbn2idx':isbn2idx,
            'text_train':text_train,
            'text_test':text_test,
            }

    return data


def text_data_split(args, data):
    X_train, X_valid, y_train, y_valid = train_test_split(
                                                        data['text_train'][['user_id', 'isbn', 'user_summary_merge_vector', 'item_summary_vector']],
                                                        data['text_train']['rating'],
                                                        test_size=args.TEST_SIZE,
                                                        random_state=args.SEED,
                                                        shuffle=True
                                                        )
    data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid
    return data


def text_data_loader(args, data):
    train_dataset = Text_Dataset(
                                data['X_train'][['user_id', 'isbn']].values,
                                data['X_train']['user_summary_merge_vector'].values,
                                data['X_train']['item_summary_vector'].values,
                                data['y_train'].values
                                )
    valid_dataset = Text_Dataset(
                                data['X_valid'][['user_id', 'isbn']].values,
                                data['X_valid']['user_summary_merge_vector'].values,
                                data['X_valid']['item_summary_vector'].values,
                                data['y_valid'].values
                                )
    test_dataset = Text_Dataset(
                                data['text_test'][['user_id', 'isbn']].values,
                                data['text_test']['user_summary_merge_vector'].values,
                                data['text_test']['item_summary_vector'].values,
                                data['text_test']['rating'].values
                                )


    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.BATCH_SIZE, num_workers=0, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.BATCH_SIZE, num_workers=0, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=args.BATCH_SIZE, num_workers=0, shuffle=False)
    data['train_dataloader'], data['valid_dataloader'], data['test_dataloader'] = train_dataloader, valid_dataloader, test_dataloader

    return data


In [17]:
import argparse
parser = argparse.ArgumentParser(description='parser')
arg = parser.add_argument

############### BASIC OPTION
arg('--DATA_PATH', type=str, default='/opt/ml/input/code/data/', help='Data path를 설정할 수 있습니다.')
arg('--MODEL', type=str, choices=['FM', 'FFM', 'NCF', 'WDN', 'DCN', 'CNN_FM', 'DeepCoNN'],
                            help='학습 및 예측할 모델을 선택할 수 있습니다.')
arg('--DATA_SHUFFLE', type=bool, default=True, help='데이터 셔플 여부를 조정할 수 있습니다.')
arg('--TEST_SIZE', type=float, default=0.2, help='Train/Valid split 비율을 조정할 수 있습니다.')
arg('--SEED', type=int, default=42, help='seed 값을 조정할 수 있습니다.')

############### TRAINING OPTION
arg('--BATCH_SIZE', type=int, default=1024, help='Batch size를 조정할 수 있습니다.')
arg('--EPOCHS', type=int, default=10, help='Epoch 수를 조정할 수 있습니다.')
arg('--LR', type=float, default=1e-3, help='Learning Rate를 조정할 수 있습니다.')
arg('--WEIGHT_DECAY', type=float, default=1e-6, help='Adam optimizer에서 정규화에 사용하는 값을 조정할 수 있습니다.')

############### GPU
arg('--DEVICE', type=str, default='cuda', choices=['cuda', 'cpu'], help='학습에 사용할 Device를 조정할 수 있습니다.')

############### FM
arg('--FM_EMBED_DIM', type=int, default=16, help='FM에서 embedding시킬 차원을 조정할 수 있습니다.')

############### FFM
arg('--FFM_EMBED_DIM', type=int, default=16, help='FFM에서 embedding시킬 차원을 조정할 수 있습니다.')

############### NCF
arg('--NCF_EMBED_DIM', type=int, default=16, help='NCF에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--NCF_MLP_DIMS', type=list, default=(16, 16), help='NCF에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--NCF_DROPOUT', type=float, default=0.2, help='NCF에서 Dropout rate를 조정할 수 있습니다.')

############### WDN
arg('--WDN_EMBED_DIM', type=int, default=16, help='WDN에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--WDN_MLP_DIMS', type=list, default=(16, 16), help='WDN에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--WDN_DROPOUT', type=float, default=0.2, help='WDN에서 Dropout rate를 조정할 수 있습니다.')

############### DCN
arg('--DCN_EMBED_DIM', type=int, default=16, help='DCN에서 embedding시킬 차원을 조정할 수 있습니다.')
arg('--DCN_MLP_DIMS', type=list, default=(16, 16), help='DCN에서 MLP Network의 차원을 조정할 수 있습니다.')
arg('--DCN_DROPOUT', type=float, default=0.2, help='DCN에서 Dropout rate를 조정할 수 있습니다.')
arg('--DCN_NUM_LAYERS', type=int, default=3, help='DCN에서 Cross Network의 레이어 수를 조정할 수 있습니다.')

############### CNN_FM
arg('--CNN_FM_EMBED_DIM', type=int, default=128, help='CNN_FM에서 user와 item에 대한 embedding시킬 차원을 조정할 수 있습니다.')
arg('--CNN_FM_LATENT_DIM', type=int, default=8, help='CNN_FM에서 user/item/image에 대한 latent 차원을 조정할 수 있습니다.')

############### DeepCoNN
arg('--DEEPCONN_VECTOR_CREATE', type=bool, default=False, help='DEEP_CONN에서 text vector 생성 여부를 조정할 수 있으며 최초 학습에만 True로 설정하여야합니다.')
arg('--DEEPCONN_EMBED_DIM', type=int, default=32, help='DEEP_CONN에서 user와 item에 대한 embedding시킬 차원을 조정할 수 있습니다.')
arg('--DEEPCONN_LATENT_DIM', type=int, default=10, help='DEEP_CONN에서 user/item/image에 대한 latent 차원을 조정할 수 있습니다.')
arg('--DEEPCONN_CONV_1D_OUT_DIM', type=int, default=50, help='DEEP_CONN에서 1D conv의 출력 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_KERNEL_SIZE', type=int, default=3, help='DEEP_CONN에서 1D conv의 kernel 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_WORD_DIM', type=int, default=768, help='DEEP_CONN에서 1D conv의 입력 크기를 조정할 수 있습니다.')
arg('--DEEPCONN_OUT_DIM', type=int, default=32, help='DEEP_CONN에서 1D conv의 출력 크기를 조정할 수 있습니다.')

args = parser.parse_args('')


In [18]:
import nltk
nltk.download('punkt')
data = text_data_load(args)

[nltk_data] Downloading package punkt to /opt/ml/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Check Vectorizer
Vector Load


FileNotFoundError: [Errno 2] No such file or directory: 'data/text_vector/train_user_summary_merge_vector.npy'