In [61]:
from src.data import text_data_load, text_data_split, text_data_loader
import argparse
import json
import pandas as pd
from src import seed_everything
import numpy as np
import torch
import torch.nn as nn

parser = argparse.ArgumentParser()
args = argparse.Namespace()
with open('config.json','rt') as f:
    args.__dict__.update(json.load(f))

In [156]:
item = np.load('data/text_vector/test_item_summary_vector.npy', allow_pickle=True)

In [140]:
item[1].shape

(129777,)

In [157]:
books_text_df = pd.DataFrame([item[0], item[1]]).T
books_text_df.columns = ['isbn', 'item_summary_vector']

In [158]:
books_text_df['item_summary_vector'][0].shape

(16,)

In [7]:
books_text_df['item_summary_vector'][0].shape

(768,)

In [26]:
user = pd.read_csv('data/users.csv')
books = pd.read_csv('data/books.csv')
test = pd.read_csv('data/test_ratings.csv')
train = pd.read_csv('data/train_ratings.csv')

In [40]:
df = pd.merge(test,books[['isbn','summary']],on='isbn')
df

Unnamed: 0,user_id,isbn,rating,summary
0,11676,0002005018,0,"In a small town in Canada, Clara Callan reluct..."
1,116866,0002005018,0,"In a small town in Canada, Clara Callan reluct..."
2,152827,0060973129,0,"Here, for the first time in paperback, is an o..."
3,157969,0374157065,0,"Describes the great flu epidemic of 1918, an o..."
4,67958,0399135782,0,A Chinese immigrant who is convinced she is dy...
...,...,...,...,...
76694,278543,1576734218,0,On Becoming Childwise responds to this need by...
76695,278563,3492223710,0,Respektlos macht der Autor mit der griechische...
76696,278633,1896095186,0,The fascinating characters in this short story...
76697,278668,8408044079,0,


In [41]:
sum(df['summary'].isna())

29790

In [10]:
dft = pd.merge(train,books[['isbn','summary']],on='isbn')
dft

Unnamed: 0,user_id,isbn,rating,summary
0,8,0002005018,4,"In a small town in Canada, Clara Callan reluct..."
1,67544,0002005018,7,"In a small town in Canada, Clara Callan reluct..."
2,123629,0002005018,8,"In a small town in Canada, Clara Callan reluct..."
3,200273,0002005018,8,"In a small town in Canada, Clara Callan reluct..."
4,210926,0002005018,9,"In a small town in Canada, Clara Callan reluct..."
...,...,...,...,...
306790,278843,0743525493,7,
306791,278851,067161746X,6,A tongue-in-cheek survival guide for single pe...
306792,278851,0884159221,7,
306793,278851,0912333022,7,These hilarious stories by the creator of publ...


In [24]:
for i in books_text_df['item_summary_vector'].values:
    if sum(i) == 0:
        print(i)

In [27]:
isbns = pd.concat([train['isbn'], test['isbn']]).unique()

In [34]:
idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}
isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

In [48]:
isbn2idx['0671870432']

129777

In [58]:
books_text_df[books_text_df['isbn']==129777]

Unnamed: 0,isbn,item_summary_vector
4,129777,"[0.063244164, 0.08317142, -0.20397493, 0.01717..."


In [103]:
len(books_text_df['item_summary_vector'])

129777

In [74]:
class Factorize(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.linear1 = nn.Linear(768,self.dim)
        self.linear2 = nn.Linear(self.dim,768)

    def forward(self, x):
        emb = self.linear1(x)
        x = self.linear2(emb)
        return x, emb

In [75]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()
        self.eps = 1e-6

    def forward(self, x, y):
        criterion = nn.MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss

In [77]:
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [78]:
class Text_Dataset(Dataset):
    def __init__(self, textcon):
        self.textcon = textcon

    def __len__(self):
        return self.textcon.shape[0]

    def __getitem__(self, i):
        return torch.tensor(self.textcon[i], dtype=torch.float32)

In [104]:
model = Factorize(16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = RMSELoss()

data = Text_Dataset(books_text_df['item_summary_vector'].values)
dataload = DataLoader(data,batch_size=1024,num_workers=0,shuffle=False)

In [80]:
import tqdm

In [105]:
model.to('cuda')

Factorize(
  (linear1): Linear(in_features=768, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=768, bias=True)
)

In [107]:
tk0 = tqdm.tqdm(range(1,11), smoothing=0, mininterval=1.0)
loss_list = []
for epoch in tk0:
    model.train()
    total_loss = 0
    n = 0
    for i, data in enumerate(dataload):
        data = data.to('cuda')
        y,emb = model(data)
        loss = criterion(y,data)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n += 1
    tk0.set_postfix(train_loss = total_loss/n)
# train_loss 20epoch 0.111
# test_loss 40~50 epoch 0.115

100%|██████████| 10/10 [00:18<00:00,  1.85s/it, train_loss=0.111]


In [108]:
model.eval()
total_loss = 0
n = 0
emb_np = np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])
with torch.no_grad():
    for i, data in enumerate(dataload):
        data = data.to('cuda')
        y,emb = model(data)
        total_loss += loss.item()
        # print(emb_np.shape)
        # print(emb.shape)
        emb_np = np.concatenate((emb_np,emb.cpu()))

In [109]:
emb_np = emb_np[1:]

In [111]:
emb_np.shape

(129777, 16)

In [112]:
np.save('./data/text_vector/train_item_summary_vector.npy',emb_np)

In [114]:
save_emb_train = np.array([books_text_df['isbn'].values,emb_np]

array([0, 1, 2, ..., 129774, 129775, 129776], dtype=object)

In [120]:
list_emb = [i for i in emb_np]

In [121]:
train_emb_df = pd.DataFrame({'isbn':books_text_df['isbn'].values,'emb':list_emb})

In [136]:
train_emb_vector = np.concatenate([
    train_emb_df['isbn'].values.reshape(1,-1),
    train_emb_df['emb'].values.reshape(1,-1)
])

In [138]:
np.save('./data/text_vector/train_item_summary_vector.npy',train_emb_vector)

In [149]:
emb_np = np.load('./data/text_vector/test_item_summary_vector.npy',allow_pickle=True)
list_emb = [i for i in emb_np]
train_emb_df = pd.DataFrame({'isbn':books_text_df['isbn'].values,'emb':list_emb})
train_emb_vector = np.concatenate([
    train_emb_df['isbn'].values.reshape(1,-1),
    train_emb_df['emb'].values.reshape(1,-1)
])

In [154]:
train_emb_vector.shape

(2, 52000)

In [155]:
np.save('./data/text_vector/test_item_summary_vector.npy',train_emb_vector)

# Optuna FFDCN 모델

In [6]:
import time
import argparse
import json
import pandas as pd
import tqdm
import argparse
import warnings
import joblib
from src import seed_everything

from src.data import context_data_load, context_data_split, context_data_loader

from src import FFDCNModel

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings(action='ignore')

In [7]:
parser = argparse.ArgumentParser()
args = argparse.Namespace()
with open('config.json','rt') as f:
    args.__dict__.update(json.load(f))

In [8]:
seed_everything(42)

In [9]:
ffmdataset = context_data_load(args)

In [5]:
def objective(trial):
    seed_everything(args.SEED)
    args.BATCH_SIZE = trial.suggest_categorical('BATCH_SIZE',[256, 512, 1024])
    args.EPOCHS = 1 #trial.suggest_int('EPOCH',5,10)
    args.LR = trial.suggest_loguniform('LR',0.001,0.01)
    args.WEIGHT_DECAY = trial.suggest_loguniform('WEIGHT_DECAY',1e-07,5e-06)
    args.FFM_EMBED_DIM = trial.suggest_int('FFM_EMBED_DIM', 3, 32)
    args.DCN_EMBED_DIM = trial.suggest_int('DCN_EMBED_DIM', 1, 16)
    DCN_MLP_DIM_LAYERS = trial.suggest_int('DCN_MLP_DIM_LAYERS',1,3)
    args.DCN_MLP_DIMS = [trial.suggest_int('DCN_MLP_DIM_NUM',1,16)]*DCN_MLP_DIM_LAYERS
    args.DCN_DROPOUT = trial.suggest_categorical("DCN_DROPOUT",[0.2,0.25,0.3])
    args.DCN_NUM_LAYERS = trial.suggest_int('DCN_NUM_LAYERS',1 , 4)
    # args.USER_N_D = trial.suggest_int('USER_N_D',0,3)
    # args.USER_F_D = trial.suggest_int('USER_N_F',3,6)
    # args.ISBN_N_D = trial.suggest_categorical('ISBN_N_D',[12,14,16,18,20,22])
    # args.ISBN_N_F = trial.suggest_int('ISBN_N_F',28,32)
    # ffmdataset = context_data_load(args)
    dataffm = context_data_split(args,ffmdataset)
    dataffm = context_data_loader(args,dataffm)
    model = FFDCNModel(args,dataffm)
    model.train()
    log_score = model.predict_train()
    
    return log_score

In [None]:
sampler = optuna.samplers.TPESampler(seed=49)
study = optuna.create_study(
    study_name = 'FFDCN_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective, n_trials=200)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

In [7]:
study.get_trials()[-1].params

{'BATCH_SIZE': 257,
 'LR': 0.004801638061928483,
 'WEIGHT_DECAY': 2.8409501258311365e-07,
 'FFM_EMBED_DIM': 26,
 'DCN_EMBED_DIM': 15,
 'DCN_MLP_DIM_LAYERS': 1,
 'DCN_MLP_DIM_NUM': 5,
 'DCN_DROPOUT': 0.2,
 'DCN_NUM_LAYERS': 3}

In [7]:
joblib.dump(study,'./valid/studysave1003.pkl')

['./valid/studysave1003.pkl']

In [8]:
joblib.dump(study,'./valid/studysave_NEW.pkl')

['./valid/studysave_NEW.pkl']

In [8]:
jl = joblib.load('./valid/studysave1003.pkl')

In [9]:
jl.best_params

{'BATCH_SIZE': 858,
 'LR': 0.004313538772296255,
 'WEIGHT_DECAY': 2.64921217501965e-07,
 'FFM_EMBED_DIM': 25,
 'DCN_EMBED_DIM': 6,
 'DCN_MLP_DIM_LAYERS': 3,
 'DCN_MLP_DIM_NUM': 1,
 'DCN_DROPOUT': 0.25,
 'DCN_NUM_LAYERS': 3}

## K-Fold for FFDCN

In [1]:
import time
import argparse
import json
import pandas as pd
import numpy as np
import tqdm
import argparse
import warnings
import joblib
from src import seed_everything

from src.data import context_data_load, context_data_split, context_data_loader

from src import FFDCNModel

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings(action='ignore')



In [2]:
parser = argparse.ArgumentParser()
args = argparse.Namespace()
with open('config.json','rt') as f:
    args.__dict__.update(json.load(f))

In [3]:
seed_everything(42)

In [4]:
data = context_data_load(args)

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(data['train'].drop(['rating'], axis=1), data['train']['rating']):
    folds.append((train_idx, valid_idx))

In [6]:
ffdcn_predicts = []
for fold in range(5):
    seed_everything(42)
    print('='*15,fold+1,'='*15)
    train_idx, valid_idx = folds[fold]
    # Fold i Data Split
    X_train = data['train'].drop(['rating'], axis=1).iloc[train_idx]
    X_valid = data['train'].drop(['rating'], axis=1).iloc[valid_idx]
    y_train = data['train']['rating'][train_idx]
    y_valid = data['train']['rating'][valid_idx]
    # Create Fold i Dataloader
    fold_data = {
            'X_train':X_train,
            'X_valid':X_valid,
            'y_train':y_train,
            'y_valid':y_valid,
            'test':data['test'],
            'field_dims':data['field_dims'],
            'sub':data['sub'],
            'idx2user':data['idx2user'],
            'idx2isbn':data['idx2isbn']
    }
    fold_data = context_data_loader(args,fold_data)
    # Create Fold i FFDCN Model and train
    print(f'--------------- {args.MODEL} TRAINING ---------------')
    model = FFDCNModel(args,fold_data)
    model.train()
    log_score = model.predict_train()

    # Fold i Model's Predict Test data
    print(f'--------------- {args.MODEL} PREDICT ---------------')
    predicts = model.predict(fold_data['test_dataloader'])
    ffdcn_predicts.append(predicts)



--------------- FFDCN TRAINING ---------------


100%|██████████| 959/959 [00:27<00:00, 34.29it/s, loss=2.19]
100%|██████████| 240/240 [00:01<00:00, 216.16it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.1640109036934394


100%|██████████| 240/240 [00:01<00:00, 205.61it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

--------------- Saving Valid ---------------


100%|██████████| 240/240 [00:01<00:00, 219.25it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- FFDCN PREDICT ---------------


100%|██████████| 300/300 [00:01<00:00, 228.57it/s]


--------------- FFDCN TRAINING ---------------


100%|██████████| 959/959 [00:27<00:00, 34.79it/s, loss=2.17]
100%|██████████| 240/240 [00:01<00:00, 220.33it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.165730645553602


100%|██████████| 240/240 [00:01<00:00, 176.13it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

--------------- Saving Valid ---------------


100%|██████████| 240/240 [00:01<00:00, 218.84it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- FFDCN PREDICT ---------------


100%|██████████| 300/300 [00:01<00:00, 268.20it/s]


--------------- FFDCN TRAINING ---------------


100%|██████████| 959/959 [00:27<00:00, 34.61it/s, loss=2.17]
100%|██████████| 240/240 [00:01<00:00, 213.14it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.181752089298212


100%|██████████| 240/240 [00:01<00:00, 200.99it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

--------------- Saving Valid ---------------


100%|██████████| 240/240 [00:01<00:00, 218.17it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- FFDCN PREDICT ---------------


100%|██████████| 300/300 [00:01<00:00, 263.96it/s]


--------------- FFDCN TRAINING ---------------


100%|██████████| 959/959 [00:28<00:00, 34.20it/s, loss=2.17]
100%|██████████| 240/240 [00:01<00:00, 207.65it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.160661193155635


100%|██████████| 240/240 [00:01<00:00, 204.64it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

--------------- Saving Valid ---------------


100%|██████████| 240/240 [00:01<00:00, 187.69it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- FFDCN PREDICT ---------------


100%|██████████| 300/300 [00:01<00:00, 243.87it/s]


--------------- FFDCN TRAINING ---------------


100%|██████████| 959/959 [00:27<00:00, 34.69it/s, loss=2.17]
100%|██████████| 240/240 [00:01<00:00, 184.72it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.1739715771745107


100%|██████████| 240/240 [00:01<00:00, 205.13it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

--------------- Saving Valid ---------------


100%|██████████| 240/240 [00:01<00:00, 219.58it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- FFDCN PREDICT ---------------


100%|██████████| 300/300 [00:01<00:00, 263.98it/s]


In [7]:
# Fold i Save Predicted test data
print(f'--------------- SAVE {args.MODEL} PREDICT ---------------')
submission = pd.read_csv(args.DATA_PATH + 'sample_submission.csv')
for fold_predict in ffdcn_predicts:
    submission['rating'] += np.array(fold_predict) / 5

now = time.localtime()
now_date = time.strftime('%Y%m%d', now)
now_hour = time.strftime('%X', now)
save_time = now_date + '_' + now_hour.replace(':', '')
submission.to_csv('submit/5fold_{}_{}.csv'.format(save_time, args.MODEL), index=False)

--------------- SAVE FFDCN PREDICT ---------------
