In [1]:
import numpy as np
import pandas as pd
import regex
import glob

from torch.utils.data import TensorDataset, DataLoader
import torch

from sklearn.model_selection import train_test_split

In [2]:
data_path = '/data/ephemeral/home/jay/data/'
users = pd.read_csv(data_path + 'users.csv')
books = pd.read_csv(data_path + 'books.csv')
train = pd.read_csv(data_path + 'train_ratings.csv')
test = pd.read_csv(data_path + 'test_ratings.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

In [3]:
def str2list(x: str) -> list:
    '''문자열을 리스트로 변환하는 함수'''
    return x[1:-1].split(', ')


def split_location(x: str) -> list:
    '''
    Parameters
    ----------
    x : str
        location 데이터

    Returns
    -------
    res : list
        location 데이터를 나눈 뒤, 정제한 결과를 반환합니다.
        순서는 country, state, city, ... 입니다.
    '''
    res = x.split(',')
    res = [i.strip().lower() for i in res]
    res = [regex.sub(r'[^a-zA-Z/ ]', '', i) for i in res]  # remove special characters
    res = [i if i not in ['n/a', ''] else np.nan for i in res]  # change 'n/a' into NaN
    res.reverse()  # reverse the list to get country, state, city, ... order

    for i in range(len(res)-1, 0, -1):
        if (res[i] in res[:i]) and (not pd.isna(res[i])):  # remove duplicated values if not NaN
            res.pop(i)

    return res
    
def process_context_data(users, books):
    """
    Parameters
    ----------
    users : pd.DataFrame
        users.csv를 인덱싱한 데이터
    books : pd.DataFrame
        books.csv를 인덱싱한 데이터
    ratings1 : pd.DataFrame
        train 데이터의 rating
    ratings2 : pd.DataFrame
        test 데이터의 rating
    
    Returns
    -------
    label_to_idx : dict
        데이터를 인덱싱한 정보를 담은 딕셔너리
    idx_to_label : dict
        인덱스를 다시 원래 데이터로 변환하는 정보를 담은 딕셔너리
    train_df : pd.DataFrame
        train 데이터
    test_df : pd.DataFrame
        test 데이터
    """

    users_ = users.copy()
    books_ = books.copy()

    # 데이터 전처리 (전처리는 각자의 상황에 맞게 진행해주세요!)
    books_['category'] = books_['category'].apply(lambda x: str2list(x)[0] if not pd.isna(x) else np.nan)
    books_['language'] = books_['language'].fillna(books_['language'].mode()[0])
    books_['publication_range'] = books_['year_of_publication'].apply(lambda x: x // 10 * 10)  # 1990년대, 2000년대, 2010년대, ...

    users_['age'] = users_['age'].fillna(users_['age'].mode()[0])
    users_['age_range'] = users_['age'].apply(lambda x: x // 10 * 10)  # 10대, 20대, 30대, ...

    users_['location_list'] = users_['location'].apply(lambda x: split_location(x)) 
    users_['location_country'] = users_['location_list'].apply(lambda x: x[0])
    users_['location_state'] = users_['location_list'].apply(lambda x: x[1] if len(x) > 1 else np.nan)
    users_['location_city'] = users_['location_list'].apply(lambda x: x[2] if len(x) > 2 else np.nan)
    for idx, row in users_.iterrows():
        if (not pd.isna(row['location_state'])) and pd.isna(row['location_country']):
            fill_country = users_[users_['location_state'] == row['location_state']]['location_country'].mode()
            fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
            users_.loc[idx, 'location_country'] = fill_country
        elif (not pd.isna(row['location_city'])) and pd.isna(row['location_state']):
            if not pd.isna(row['location_country']):
                fill_state = users_[(users_['location_country'] == row['location_country']) 
                                    & (users_['location_city'] == row['location_city'])]['location_state'].mode()
                fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
                users_.loc[idx, 'location_state'] = fill_state
            else:
                fill_state = users_[users_['location_city'] == row['location_city']]['location_state'].mode()
                fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
                fill_country = users_[users_['location_city'] == row['location_city']]['location_country'].mode()
                fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
                users_.loc[idx, 'location_country'] = fill_country
                users_.loc[idx, 'location_state'] = fill_state

               
    
    users_ = users_.drop(['location'], axis=1)

    return users_, books_

In [4]:
user_features = ['user_id', 'age_range', 'location_country', 'location_state', 'location_city']
book_features = ['isbn', 'book_title', 'book_author', 'publisher', 'language', 'category', 'publication_range']
sparse_cols = ['user_id', 'isbn'] + list(set(user_features + book_features) - {'user_id', 'isbn'})

In [5]:
users_, books_ = process_context_data(users, books)

In [14]:
from tqdm import tqdm
from PIL import Image
from torchvision.transforms import v2

def image_vector(path, img_size):
    """
    Parameters
    ----------
    path : str
        이미지가 존재하는 경로를 입력합니다.

    Returns
    -------
    img_fe : np.ndarray
        이미지를 벡터화한 결과를 반환합니다.
        베이스라인에서는 grayscale일 경우 RGB로 변경한 뒤, img_size x img_size 로 사이즈를 맞추어 numpy로 반환합니다.
    """
    img = Image.open(path)
    transform = v2.Compose([
        v2.Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
        v2.Resize((img_size, img_size)),
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    return transform(img).numpy()

def process_img_data(books):
    """
    Parameters
    ----------
    books : pd.DataFrame
        책 정보에 대한 데이터 프레임을 입력합니다.
    
    Returns
    -------
    books_ : pd.DataFrame
        이미지 정보를 벡터화하여 추가한 데이터 프레임을 반환합니다.
    """
    books_ = books.copy()
    books_['img_path'] = books_['img_path'].apply(lambda x: f'../data/{x}')
    img_vecs = []
    for idx in tqdm(books_.index):
        img_vec = image_vector(books_.loc[idx, 'img_path'], 28)
        img_vecs.append(img_vec)

    books_['img_vector'] = img_vecs

    return books_

In [15]:
books_ = process_img_data(books_)

100%|██████████| 149570/149570 [01:58<00:00, 1257.39it/s]


In [16]:
user_features = ['user_id', 'age_range', 'location_country', 'location_state', 'location_city']
book_features = ['isbn', 'book_title', 'book_author', 'publisher', 'language', 'category', 'publication_range']
sparse_cols = ['user_id', 'isbn'] + list(set(user_features + book_features) - {'user_id', 'isbn'})

train_df = train.merge(books_, on='isbn', how='left')\
                .merge(users_, on='user_id', how='left')[sparse_cols + ['img_vector', 'rating']]
test_df = test.merge(books_, on='isbn', how='left')\
                .merge(users_, on='user_id', how='left')[sparse_cols + ['img_vector']]
all_df = pd.concat([train_df, test_df], axis=0)

# feature_cols의 데이터만 라벨 인코딩하고 인덱스 정보를 저장
label2idx, idx2label = {}, {}
for col in sparse_cols:
    all_df[col] = all_df[col].fillna('unknown')
    train_df[col] = train_df[col].fillna('unknown')
    test_df[col] = test_df[col].fillna('unknown')
    unique_labels = all_df[col].astype("category").cat.categories
    label2idx[col] = {label:idx for idx, label in enumerate(unique_labels)}
    idx2label[col] = {idx:label for idx, label in enumerate(unique_labels)}
    train_df[col] = pd.Categorical(train_df[col], categories=unique_labels).codes
    test_df[col] = pd.Categorical(test_df[col], categories=unique_labels).codes

field_dims = [len(label2idx[col]) for col in sparse_cols]

data = {
        'train':train_df,
        'test':test_df,
        'field_names':sparse_cols,
        'field_dims':field_dims,
        'label2idx':label2idx,
        'idx2label':idx2label,
        'sub':sub,
        }

In [18]:
data['train']

Unnamed: 0,user_id,isbn,category,language,location_city,book_title,publisher,publication_range,book_author,location_country,age_range,location_state,img_vector,rating
0,0,39,132,4,10695,20344,4598,12,49515,34,2,987,"[[[0.6563062, 0.93030226, 1.4954194, 1.8036649...",4
1,16631,39,132,4,10752,20344,4598,12,49515,34,3,987,"[[[0.6563062, 0.93030226, 1.4954194, 1.8036649...",7
2,30285,39,132,4,5368,20344,4598,12,49515,34,2,987,"[[[0.6563062, 0.93030226, 1.4954194, 1.8036649...",8
3,48760,39,132,4,2284,20344,4598,12,49515,34,2,987,"[[[0.6563062, 0.93030226, 1.4954194, 1.8036649...",8
4,51452,39,132,4,4229,20344,4598,12,49515,34,2,987,"[[[0.6563062, 0.93030226, 1.4954194, 1.8036649...",9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306790,68062,87948,4120,4,8337,111499,9440,12,12391,211,2,222,"[[[1.9235382, 1.8721639, 1.9064134, 1.9064134,...",7
306791,68066,75092,2729,4,2564,98380,8086,10,45139,211,3,1317,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,...",6
306792,68066,109944,4120,4,2564,131928,6111,10,10197,211,3,1317,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,...",7
306793,68066,113224,2298,4,2564,97954,5689,11,27804,211,3,1317,"[[[0.8446785, 0.60493195, 0.72480524, 0.913177...",7


In [19]:
tensor_32 = torch.randn(512, 32)   # torch.Size([512, 32])
tensor_64 = torch.randn(512, 64)   # torch.Size([512, 64])
tensor_128 = torch.randn(512, 128) # torch.Size([512, 128])

In [23]:
torch.cat([tensor_32, tensor_64, tensor_128], dim=1)

tensor([[-0.2155,  0.0107,  1.4315,  ..., -0.0577,  0.0609,  0.9538],
        [ 0.0318,  0.5450, -0.0040,  ..., -0.0110, -1.3896,  0.4619],
        [ 0.1508, -0.7334, -1.7633,  ...,  0.7443,  0.5680, -0.5062],
        ...,
        [-0.0922,  0.4166,  1.1189,  ...,  1.2173, -0.0697, -0.3387],
        [ 0.9076, -0.6142,  0.2010,  ..., -1.3910, -0.9923,  0.8728],
        [-0.5993,  1.4854,  1.0638,  ..., -0.4768, -0.8801, -0.9106]])

In [34]:
from transformers import AutoTokenizer, AutoModel
import re
import os

def text_preprocessing(summary):
    """
    Parameters
    ----------
    summary : pd.Series
        정규화와 같은 기본적인 전처리를 하기 위한 텍스트 데이터를 입력합니다.
    
    Returns
    -------
    summary : pd.Series
        전처리된 텍스트 데이터를 반환합니다.
        베이스라인에서는 특수문자 제거, 공백 제거를 진행합니다.
    """
    summary = re.sub("[^0-9a-zA-Z.,!?]", " ", summary)  # .,!?를 제외한 특수문자 제거
    summary = re.sub("\s+", " ", summary)  # 중복 공백 제거

    return summary


def text_to_vector(text, tokenizer, model):
    """
    Parameters
    ----------
    text : str
        `summary_merge()`를 통해 병합된 요약 데이터
    tokenizer : Tokenizer
        텍스트 데이터를 `model`에 입력하기 위한 토크나이저
    model : 사전학습된 언어 모델
        텍스트 데이터를 벡터로 임베딩하기 위한 모델
    ----------
    """
    text_ = "[CLS] " + text + " [SEP]"
    tokenized = tokenizer.encode(text_, add_special_tokens=True)
    token_tensor = torch.tensor([tokenized], device=model.device)
    with torch.no_grad():
        outputs = model(token_tensor)  # attention_mask를 사용하지 않아도 됨
        ### BERT 모델의 경우, 최종 출력물의 사이즈가 (토큰길이, 임베딩=768)이므로, 이를 평균내어 사용하거나 pooler_output을 사용하여 [CLS] 토큰의 임베딩만 사용
        # sentence_embedding = torch.mean(outputs.last_hidden_state[0], dim=0)  # 방법1) 모든 토큰의 임베딩을 평균내어 사용
        sentence_embedding = outputs.pooler_output.squeeze(0)  # 방법2) pooler_output을 사용하여 맨 첫 토큰인 [CLS] 토큰의 임베딩만 사용
    
    return sentence_embedding.cpu().detach().numpy() 

In [51]:
def process_text_data(ratings, users, books, tokenizer, model, vector_create=False):
    """
    Parameters
    ----------
    users : pd.DataFrame
        유저 정보에 대한 데이터 프레임을 입력합니다.
    books : pd.DataFrame
        책 정보에 대한 데이터 프레임을 입력합니다.
    vector_create : bool
        사전에 텍스트 데이터 벡터화가 된 파일이 있는지 여부를 입력합니다.

    Returns
    -------
    `users_` : pd.DataFrame
        각 유저가 읽은 책에 대한 요약 정보를 병합 및 벡터화하여 추가한 데이터 프레임을 반환합니다.

    `books_` : pd.DataFrame
        텍스트 데이터를 벡터화하여 추가한 데이터 프레임을 반환합니다.
    """
    num2txt = ['Zero', 'One', 'Two', 'Three', 'Four', 'Five']
    users_ = users.copy()
    books_ = books.copy()
    nan_value = 'None'
    books_['summary'] = books_['summary'].fillna(nan_value)\
                                         .apply(lambda x: text_preprocessing(x))\
                                         .replace({'': nan_value, ' ': nan_value})
    
    books_['summary_length'] = books_['summary'].apply(lambda x:len(x))
    books_['review_count'] = books_['isbn'].map(ratings['isbn'].value_counts())

    users_['books_read'] = users_['user_id'].map(ratings.groupby('user_id')['isbn'].apply(list))

    if vector_create:
        if not os.path.exists('./data/text_vector'):
            os.makedirs('./data/text_vector')

        print('Create Item Summary Vector')
        book_summary_vector_list = []
        for title, summary in tqdm(zip(books_['book_title'], books_['summary']), total=len(books_)):
            # 책에 대한 텍스트 프롬프트는 아래와 같이 구성됨
            # '''
            # Book Title: {title}
            # Summary: {summary}
            # '''
            prompt_ = f'Book Title: {title}\n Summary: {summary}\n'
            vector = text_to_vector(prompt_, tokenizer, model)
            book_summary_vector_list.append(vector)
        
        book_summary_vector_list = np.concatenate([
                                                books_['isbn'].values.reshape(-1, 1),
                                                np.asarray(book_summary_vector_list, dtype=np.float32)
                                                ], axis=1)
        
        np.save('./data/text_vector/book_summary_vector.npy', book_summary_vector_list)        


        print('Create User Summary Merge Vector')
        user_summary_merge_vector_list = []
        for books_read in tqdm(users_['books_read']):
            if not isinstance(books_read, list) and pd.isna(books_read):  # 유저가 읽은 책이 없는 경우, 텍스트 임베딩을 0으로 처리
                user_summary_merge_vector_list.append(np.zeros((768)))
                continue
            
            read_books = books_[books_['isbn'].isin(books_read)][['book_title', 'summary', 'review_count']]
            read_books = read_books.sort_values('review_count', ascending=False).head(5)  # review_count가 높은 순으로 5개의 책을 선택
            # 유저에 대한 텍스트 프롬프트는 아래와 같이 구성됨
            # DeepCoNN에서 유저의 리뷰를 요약하여 하나의 벡터로 만들어 사용함을 참고 (https://arxiv.org/abs/1701.04783)
            # '''
            # Five Books That You Read
            # 1. Book Title: {title}
            # Summary: {summary}
            # ...
            # 5. Book Title: {title}
            # Summary: {summary}
            # '''
            prompt_ = f'{num2txt[len(read_books)]} Books That You Read\n'
            for idx, (title, summary) in enumerate(zip(read_books['book_title'], read_books['summary'])):
                summary = summary if len(summary) < 100 else f'{summary[:100]} ...'
                prompt_ += f'{idx+1}. Book Title: {title}\n Summary: {summary}\n'
            vector = text_to_vector(prompt_, tokenizer, model)
            user_summary_merge_vector_list.append(vector)
        
        user_summary_merge_vector_list = np.concatenate([
                                                         users_['user_id'].values.reshape(-1, 1),
                                                         np.asarray(user_summary_merge_vector_list, dtype=np.float32)
                                                        ], axis=1)
        
        np.save('./data/text_vector/user_summary_merge_vector.npy', user_summary_merge_vector_list)        
        
    else:
        print('Check Vectorizer')
        print('Vector Load')
        book_summary_vector_list = np.load('/data/ephemeral/home/jay/code/data/text_vector/book_summary_vector.npy', allow_pickle=True)
        user_summary_merge_vector_list = np.load('/data/ephemeral/home/jay/code/data/text_vector/user_summary_merge_vector.npy', allow_pickle=True)

    book_summary_vector_df = pd.DataFrame({'isbn': book_summary_vector_list[:, 0]})
    book_summary_vector_df['book_summary_vector'] = list(book_summary_vector_list[:, 1:].astype(np.float32))
    user_summary_vector_df = pd.DataFrame({'user_id': user_summary_merge_vector_list[:, 0]})
    user_summary_vector_df['user_summary_merge_vector'] = list(user_summary_merge_vector_list[:, 1:].astype(np.float32))

    books_ = pd.merge(books_, book_summary_vector_df, on='isbn', how='left')
    users_ = pd.merge(users_, user_summary_vector_df, on='user_id', how='left')

    return users_, books_

In [43]:
data_path = '/data/ephemeral/home/jay/data/'
users = pd.read_csv(data_path + 'users.csv')
books = pd.read_csv(data_path + 'books.csv')
train = pd.read_csv(data_path + 'train_ratings.csv')
test = pd.read_csv(data_path + 'test_ratings.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

In [44]:
users_, books_ = process_context_data(users, books)

In [52]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()
users_, books_ = process_text_data(train, users_, books_, tokenizer, model)

Check Vectorizer
Vector Load


ModuleNotFoundError: No module named 'numpy._core'

In [6]:
train_df = train.merge(users_, on='user_id', how='left')\
                .merge(books_, on='isbn', how='left')[sparse_cols + ['rating']]
test_df = test.merge(users_, on='user_id', how='left')\
                .merge(books_, on='isbn', how='left')[sparse_cols]
all_df = pd.concat([train_df, test_df], axis=0)

In [7]:
# feature_cols의 데이터만 라벨 인코딩하고 인덱스 정보를 저장
label2idx, idx2label = {}, {}
for col in sparse_cols:
    all_df[col] = all_df[col].fillna('unknown')
    unique_labels = all_df[col].astype("category").cat.categories
    label2idx[col] = {label:idx for idx, label in enumerate(unique_labels)}
    idx2label[col] = {idx:label for idx, label in enumerate(unique_labels)}
    train_df[col] = train_df[col].astype("category").cat.codes
    test_df[col] = test_df[col].astype("category").cat.codes

field_dims = [len(label2idx[col]) for col in train_df.columns if col != 'rating']

basic_data = {
        'train':train_df,
        'test':test_df,
        'field_names':sparse_cols,
        'field_dims':field_dims,
        'label2idx':label2idx,
        'idx2label':idx2label,
        'sub':sub,
        }

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(
                                                        basic_data['train'].drop(['rating'], axis=1),
                                                        basic_data['train']['rating'],
                                                        test_size=0.2,
                                                        random_state=42,
                                                        shuffle=True
                                                        )
basic_data['X_train'], basic_data['X_valid'], basic_data['y_train'], basic_data['y_valid'] = X_train, X_valid, y_train, y_valid

In [14]:
train_dataset = TensorDataset(torch.LongTensor(basic_data['X_train'].values), torch.LongTensor(basic_data['y_train'].values))
valid_dataset = TensorDataset(torch.LongTensor(basic_data['X_valid'].values), torch.LongTensor(basic_data['y_valid'].values))
test_dataset = TensorDataset(torch.LongTensor(basic_data['test'].values))

train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
valid_dataloader = DataLoader(valid_dataset, batch_size=512, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=0)

basic_data['train_dataloader'], basic_data['valid_dataloader'], basic_data['test_dataloader'] = train_dataloader, valid_dataloader, test_dataloader

In [15]:
import pandas as pd
from tqdm import tqdm
from PIL import Image
from torchvision.transforms import v2
import torch
from torch.utils.data import DataLoader, Dataset
#from .basic_data import basic_data_split


class Image_Dataset(Dataset):
    def __init__(self, user_book_vector, img_vector, rating=None):
        """
        Parameters
        ----------
        user_book_vector : np.ndarray
            모델 학습에 사용할 유저 및 책 정보(범주형 데이터)를 입력합니다.
        img_vector : np.ndarray
            벡터화된 이미지 데이터를 입력합니다.
        rating : np.ndarray
            정답 데이터를 입력합니다.
        """
        self.user_book_vector = user_book_vector
        self.img_vector = img_vector
        self.rating = rating
    def __len__(self):
        return self.user_book_vector.shape[0]
    def __getitem__(self, i):
        return {
                'user_book_vector' : torch.tensor(self.user_book_vector[i], dtype=torch.long),
                'img_vector' : torch.tensor(self.img_vector[i], dtype=torch.float32),
                'rating' : torch.tensor(self.rating[i], dtype=torch.float32)
                } if self.rating is not None else \
                {
                'user_book_vector' : torch.tensor(self.user_book_vector[i], dtype=torch.long),
                'img_vector' : torch.tensor(self.img_vector[i], dtype=torch.float32)
                }

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
def image_vector(path, img_size):
    """
    Parameters
    ----------
    path : str
        이미지가 존재하는 경로를 입력합니다.

    Returns
    -------
    img_fe : np.ndarray
        이미지를 벡터화한 결과를 반환합니다.
        베이스라인에서는 grayscale일 경우 RGB로 변경한 뒤, img_size x img_size 로 사이즈를 맞추어 numpy로 반환합니다.
    """
    img = Image.open(path)
    transform = v2.Compose([
        v2.Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
        v2.Resize((img_size, img_size)),
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    return transform(img).numpy()


def process_img_data(books, args):
    """
    Parameters
    ----------
    books : pd.DataFrame
        책 정보에 대한 데이터 프레임을 입력합니다.
    
    Returns
    -------
    books_ : pd.DataFrame
        이미지 정보를 벡터화하여 추가한 데이터 프레임을 반환합니다.
    """
    books_ = books.copy()
    books_['img_path'] = books_['img_path'].apply(lambda x: f'data/{x}')
    img_vecs = []
    for idx in tqdm(books_.index):
        img_vec = image_vector(books_.loc[idx, 'img_path'], args.model_args[args.model].img_size)
        img_vecs.append(img_vec)

    books_['img_vector'] = img_vecs

    return books_

In [17]:
books_ = books.copy()
books_['img_path'] = books_['img_path'].apply(lambda x: f'data/{x}')
img_vecs = []

for path in tqdm(glob.glob('/data/ephemeral/home/jay/data/images/*.jpg')):
    img = Image.open(path)
    img_size = 28
    transform = v2.Compose([
            v2.Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
            v2.Resize((img_size, img_size)),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    img_vec = transform(img).numpy()
    img_vecs.append(img_vec)

100%|██████████| 149570/149570 [01:55<00:00, 1298.32it/s]


In [18]:
books_['img_vector'] = img_vecs

user_features = []
book_features = []
sparse_cols = ['user_id', 'isbn'] + list(set(user_features + book_features) - {'user_id', 'isbn'})

train_df = train.merge(books_, on='isbn', how='left')\
                .merge(users, on='user_id', how='left')[sparse_cols + ['img_vector', 'rating']]
test_df = test.merge(books_, on='isbn', how='left')\
                .merge(users, on='user_id', how='left')[sparse_cols + ['img_vector']]
all_df = pd.concat([train_df, test_df], axis=0)

# feature_cols의 데이터만 라벨 인코딩하고 인덱스 정보를 저장
label2idx, idx2label = {}, {}
for col in sparse_cols:
    all_df[col] = all_df[col].fillna('unknown')
    unique_labels = all_df[col].astype("category").cat.categories
    label2idx[col] = {label:idx for idx, label in enumerate(unique_labels)}
    idx2label[col] = {idx:label for idx, label in enumerate(unique_labels)}
    train_df[col] = train_df[col].astype("category").cat.codes
    test_df[col] = test_df[col].astype("category").cat.codes

field_dims = [len(label2idx[col]) for col in sparse_cols]

img_data = {
        'train':train_df,
        'test':test_df,
        'field_names':sparse_cols,
        'field_dims':field_dims,
        'label2idx':label2idx,
        'idx2label':idx2label,
        'sub':sub,
        }

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(
                                                        img_data['train'].drop(['rating'], axis=1),
                                                        img_data['train']['rating'],
                                                        test_size=0.2,
                                                        random_state=42,
                                                        shuffle=True
                                                        )
img_data['X_train'], img_data['X_valid'], img_data['y_train'], img_data['y_valid'] = X_train, X_valid, y_train, y_valid

In [20]:
train_dataset = Image_Dataset(
                            img_data['X_train'][img_data['field_names']].values,
                            img_data['X_train']['img_vector'].values,
                            img_data['y_train'].values
                            )
valid_dataset = Image_Dataset(
                            img_data['X_valid'][img_data['field_names']].values,
                            img_data['X_valid']['img_vector'].values,
                            img_data['y_valid'].values
                            )
test_dataset = Image_Dataset(
                            img_data['test'][img_data['field_names']].values,
                            img_data['test']['img_vector'].values
                            )

train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
valid_dataloader = DataLoader(valid_dataset, batch_size=512, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=0)
img_data['train_dataloader'], img_data['valid_dataloader'], img_data['test_dataloader'] = train_dataloader, valid_dataloader, test_dataloader

In [21]:
train_dataset

<__main__.Image_Dataset at 0x7f5de0ce7bb0>

In [77]:
df = basic_data['train'].merge(img_data['train'], on=['user_id', 'isbn', 'rating'], how = 'left')
df

Unnamed: 0,user_id,isbn,age_range,language,publisher,publication_range,category,book_title,location_city,location_state,location_country,book_author,rating,img_vector
0,0,31,2,4,4143,11,117,17674,9906,918,28,43656,4,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
1,14622,31,3,4,4143,11,117,17674,9956,918,28,43656,7,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
2,26586,31,2,4,4143,11,117,17674,4948,918,28,43656,8,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
3,42805,31,2,4,4143,11,117,17674,2120,918,28,43656,8,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
4,45171,31,2,4,4143,11,117,17674,3911,918,28,43656,9,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306790,59796,77468,2,4,8494,11,-1,96960,7717,205,198,10933,7,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
306791,59800,66171,3,4,7258,9,2477,85468,2380,1222,198,39786,6,"[[[-1.6555357, -1.6555357, -1.6555357, -1.6212..."
306792,59800,96183,3,4,5504,9,-1,114704,2380,1222,198,9005,7,"[[[-1.980906, -1.9637812, -1.8096584, -1.96378..."
306793,59800,98957,3,4,5119,10,2086,85105,2380,1222,198,24508,7,"[[[-2.117904, -2.117904, -2.1007793, -2.066529..."


In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(
                                                    df.drop(['rating'], axis=1),
                                                    df['rating'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    shuffle=True
                                                    )

In [24]:
X_train

Unnamed: 0,user_id,isbn,age_range,language,publisher,publication_range,category,book_title,location_city,location_state,location_country,book_author,img_vector
121312,19028,69821,2,4,9749,9,2086,38076,382,1316,198,25807,"[[[-0.95342064, -1.0219197, -0.88492167, -0.88..."
265089,24291,116763,6,4,9326,11,-1,79158,7879,922,198,27606,"[[[0.17681314, 0.8446785, 0.17681314, -1.07329..."
60236,32790,769,3,4,4913,10,2477,68874,3304,293,198,22622,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
111218,5971,38907,2,4,1103,11,2086,31116,534,744,198,26563,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
306001,58480,105361,4,4,3815,10,-1,20521,3080,-1,142,11881,"[[[1.5810431, 1.5467936, 1.4611698, 1.5467936,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,56208,22444,3,4,887,10,2086,91881,3366,62,198,25799,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
259178,21634,114540,3,4,2649,10,-1,96259,6773,561,198,51633,"[[[2.0605361, 2.1119103, 2.1975343, 2.1290352,..."
131932,40197,22324,5,4,887,10,-1,11674,10653,744,198,21987,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
146867,32276,117216,2,8,3579,8,2086,49925,6936,1113,64,37585,"[[[-1.278791, -1.3644148, -1.3301654, -1.17604..."


In [25]:
class Image_Dataset(Dataset):
    def __init__(self, user_book_vector, img_vector, rating=None):
        """
        Parameters
        ----------
        user_book_vector : np.ndarray
            모델 학습에 사용할 유저 및 책 정보(범주형 데이터)를 입력합니다.
        img_vector : np.ndarray
            벡터화된 이미지 데이터를 입력합니다.
        rating : np.ndarray
            정답 데이터를 입력합니다.
        """
        self.user_book_vector = user_book_vector
        self.img_vector = img_vector
        self.rating = rating
    def __len__(self):
        return self.user_book_vector.shape[0]
    def __getitem__(self, i):
        return {
                'user_book_vector' : torch.tensor(self.user_book_vector[i], dtype=torch.long),
                'img_vector' : torch.tensor(self.img_vector[i], dtype=torch.float32),
                'rating' : torch.tensor(self.rating[i], dtype=torch.float32)
                } if self.rating is not None else \
                {
                'user_book_vector' : torch.tensor(self.user_book_vector[i], dtype=torch.long),
                'img_vector' : torch.tensor(self.img_vector[i], dtype=torch.float32)
                }

In [26]:
train_dataset = Image_Dataset(
            X_train[['user_id', 'isbn']].values,
            X_train['img_vector'].values,
            y_train.values
            )

In [27]:
DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

<torch.utils.data.dataloader.DataLoader at 0x7f5db7f2d090>

In [28]:
img_data['X_valid']

Unnamed: 0,user_id,isbn,img_vector
31332,2681,29051,"[[[-1.1589177, -0.9362959, -0.7136741, -0.6794..."
114196,18124,47681,"[[[0.70768046, -0.37117895, -0.47392747, -0.40..."
302635,53700,69367,"[[[-1.9124069, -1.9466565, -1.6726604, -1.6555..."
279523,37348,41454,"[[[2.1632848, 2.1804094, 2.1632848, 2.1632848,..."
19094,22876,9230,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
...,...,...,...
89162,11143,44370,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
8960,40598,9079,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
2188,11351,19183,"[[[-1.4157891, -1.4157891, -1.3815396, -0.0115..."
93315,39902,17666,"[[[-0.16568191, -0.045808643, -0.21705617, -0...."


In [29]:
basic_data['X_valid']

Unnamed: 0,user_id,isbn,age_range,language,publisher,publication_range,category,book_title,location_city,location_state,location_country,book_author
31332,2681,29051,2,4,9749,10,186,2223,2116,912,198,15453
114196,18124,47681,1,4,8459,9,-1,18946,318,723,198,22117
302635,53700,69367,5,4,7709,10,-1,31120,2116,912,198,43971
279523,37348,41454,3,4,2434,10,-1,22323,3998,912,198,26976
19094,22876,9230,3,4,7025,11,-1,36672,1874,558,198,49837
...,...,...,...,...,...,...,...,...,...,...,...,...
89162,11143,44370,2,4,9886,10,-1,35926,3845,183,28,28882
8960,40598,9079,4,4,7025,10,2086,13682,9415,301,198,19979
2188,11351,19183,3,4,5453,10,2086,87576,8092,744,198,21723
93315,39902,17666,2,4,8693,10,-1,69303,1170,409,198,8725


In [30]:
img_data.keys()

dict_keys(['train', 'test', 'field_names', 'field_dims', 'label2idx', 'idx2label', 'sub', 'X_train', 'X_valid', 'y_train', 'y_valid', 'train_dataloader', 'valid_dataloader', 'test_dataloader'])

In [31]:
basic_train_dataset = TensorDataset(torch.LongTensor(basic_data['X_train'].values), torch.LongTensor(basic_data['y_train'].values))
img_train_dataset = Image_Dataset(
            X_train[['user_id', 'isbn']].values,
            X_train['img_vector'].values,
            y_train.values
            )

In [32]:
from torch.utils.data import ConcatDataset

combined_dataset = ConcatDataset([basic_train_dataset, img_train_dataset])


In [34]:
train_dataloader = DataLoader(combined_dataset, batch_size=512, shuffle=True, num_workers=0)

In [65]:
sum(books_['img_vector'][0].flatten() == books_['img_vector'][2].flatten())

2352

In [67]:
books_

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,img_vector
0,0002005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",data/images/0002005018.01.THUMBZZZ.jpg,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
1,0060973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",data/images/0060973129.01.THUMBZZZ.jpg,"[[[-1.6384109, -1.6555357, -1.7069099, -1.6897..."
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",data/images/0374157065.01.THUMBZZZ.jpg,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
3,0399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,data/images/0399135782.01.THUMBZZZ.jpg,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",data/images/0425176428.01.THUMBZZZ.jpg,"[[[2.2489083, 2.2489083, 2.2489083, 2.2489083,..."
...,...,...,...,...,...,...,...,...,...,...,...
149565,067161746X,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,Pocket Books,http://images.amazon.com/images/P/067161746X.0...,en,['Humor'],A tongue-in-cheek survival guide for single pe...,data/images/067161746X.01.THUMBZZZ.jpg,"[[[-1.6555357, -1.6555357, -1.6555357, -1.6212..."
149566,0767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,Broadway Books,http://images.amazon.com/images/P/0767907566.0...,en,['Nature'],A daring twist on the travel-adventure genre t...,data/images/0767907566.01.THUMBZZZ.jpg,"[[[1.2727976, 1.3926709, 1.0330508, 0.9816765,..."
149567,0884159221,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,Lone Star Books,http://images.amazon.com/images/P/0884159221.0...,,,,data/images/0884159221.01.THUMBZZZ.jpg,"[[[-1.980906, -1.9637812, -1.8096584, -1.96378..."
149568,0912333022,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997.0,Kqed Books,http://images.amazon.com/images/P/0912333022.0...,en,['Fiction'],These hilarious stories by the creator of publ...,data/images/0912333022.01.THUMBZZZ.jpg,"[[[-2.117904, -2.117904, -2.1007793, -2.066529..."


In [68]:
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.layer1 = nn.Sequential(#in_channels = 3이면 R,G,B 흑백이면 1
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.fc1 = nn.Linear(64*6*6,600)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(600,120)
        self.fc3 = nn.Linear(120,10)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out

In [101]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 모델을 GPU 또는 CPU로 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# img_vector 열을 torch.Tensor로 변환한 후, torch.stack 사용
X_train = torch.stack([torch.tensor(img) for img in df['img_vector']])  # (100, 3, 28, 28) 크기의 텐서로 변환
y_train = torch.tensor(df['rating'].values) - 1  # rating을 텐서로 변환

# 데이터셋을 TensorDataset으로 만들고, DataLoader로 로드
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

# 모델 초기화 후, GPU로 이동
model = CNN().to(device)  # CNN 모델을 GPU로 이동

# 손실 함수와 옵티마이저 초기화
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 학습 과정
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # 모델을 학습 모드로 설정
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        # 데이터를 GPU로 이동
        inputs, labels = inputs.to(device), labels.to(device)

        # 기울기 초기화
        optimizer.zero_grad()

        # 모델을 통해 예측값 계산
        outputs = model(inputs)

        # 손실 계산
        loss = criterion(outputs, labels)

        # 역전파
        loss.backward()

        # 옵티마이저로 가중치 업데이트
        optimizer.step()

        # 통계 출력
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

print("Finished Training")


Epoch [1/10], Loss: 2.0829, Accuracy: 23.88%
Epoch [2/10], Loss: 2.0757, Accuracy: 24.06%
Epoch [3/10], Loss: 2.0755, Accuracy: 24.06%
Epoch [4/10], Loss: 2.0751, Accuracy: 24.08%
Epoch [5/10], Loss: 2.0741, Accuracy: 24.09%
Epoch [6/10], Loss: 2.0733, Accuracy: 24.10%
Epoch [7/10], Loss: 2.0734, Accuracy: 24.09%
Epoch [8/10], Loss: 2.0719, Accuracy: 24.12%
Epoch [9/10], Loss: 2.0706, Accuracy: 24.13%
Epoch [10/10], Loss: 2.0693, Accuracy: 24.15%
Finished Training


In [102]:
users_ = users.copy()
books_ = books.copy()

# 데이터 전처리 (전처리는 각자의 상황에 맞게 진행해주세요!)
books_['category'] = books_['category'].apply(lambda x: str2list(x)[0] if not pd.isna(x) else np.nan)
books_['language'] = books_['language'].fillna(books_['language'].mode()[0])
books_['publication_range'] = books_['year_of_publication'].apply(lambda x: x // 10 * 10)  # 1990년대, 2000년대, 2010년대, ...

users_['age'] = users_['age'].fillna(users_['age'].mode()[0])
users_['age_range'] = users_['age'].apply(lambda x: x // 10 * 10)  # 10대, 20대, 30대, ...

users_['location_list'] = users_['location'].apply(lambda x: split_location(x)) 
users_['location_country'] = users_['location_list'].apply(lambda x: x[0])
users_['location_state'] = users_['location_list'].apply(lambda x: x[1] if len(x) > 1 else np.nan)
users_['location_city'] = users_['location_list'].apply(lambda x: x[2] if len(x) > 2 else np.nan)
for idx, row in users_.iterrows():
    if (not pd.isna(row['location_state'])) and pd.isna(row['location_country']):
        fill_country = users_[users_['location_state'] == row['location_state']]['location_country'].mode()
        fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
        users_.loc[idx, 'location_country'] = fill_country
    elif (not pd.isna(row['location_city'])) and pd.isna(row['location_state']):
        if not pd.isna(row['location_country']):
            fill_state = users_[(users_['location_country'] == row['location_country']) 
                                & (users_['location_city'] == row['location_city'])]['location_state'].mode()
            fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
            users_.loc[idx, 'location_state'] = fill_state
        else:
            fill_state = users_[users_['location_city'] == row['location_city']]['location_state'].mode()
            fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
            fill_country = users_[users_['location_city'] == row['location_city']]['location_country'].mode()
            fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
            users_.loc[idx, 'location_country'] = fill_country
            users_.loc[idx, 'location_state'] = fill_state

            

users_ = users_.drop(['location'], axis=1)

In [103]:
users_

Unnamed: 0,user_id,age,age_range,location_list,location_country,location_state,location_city
0,8,29.0,20.0,"[canada, ontario, timmins]",canada,ontario,timmins
1,11400,49.0,40.0,"[canada, ontario, ottawa]",canada,ontario,ottawa
2,11676,29.0,20.0,"[nan, nan, nan]",,,
3,67544,30.0,30.0,"[canada, ontario, toronto]",canada,ontario,toronto
4,85526,36.0,30.0,"[canada, british columbia, victoria]",canada,british columbia,victoria
...,...,...,...,...,...,...,...
68087,278376,54.0,50.0,"[usa, pennsylvania, danville]",usa,pennsylvania,danville
68088,278621,74.0,70.0,"[canada, delaware, victoria]",canada,delaware,victoria
68089,278636,29.0,20.0,"[usa, alabama, irvington]",usa,alabama,irvington
68090,278659,33.0,30.0,"[usa, washington, vancouver]",usa,washington,vancouver
