In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import torch
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim import Optimizer

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

from PIL import Image
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
import torch
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_row', 50)

path = '../data/'

train = pd.read_csv(path + 'train_ratings.csv')
test = pd.read_csv(path + 'test_ratings.csv')
books = pd.read_csv(path + 'books.csv')
users = pd.read_csv(path + 'users.csv')

books['img_path'] = books['img_path'].apply(lambda x: path+x)

def image_vector(path):
    img = Image.open(path)
    scale = transforms.Resize((32, 32))
    tensor = transforms.ToTensor()
    img_fe = Variable(tensor(scale(img)))
    return img_fe


BATCH_SIZE = 1024
lr = 0.005
EPOCH = 3
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [9]:
idxs = []
datas = []

for idx in tqdm(range(len(books))):
    idxs.append(idx)
    data = image_vector(books.iloc[idx, -1])
    if data.size()[0] == 3:
        data = np.array(data)
    else:
        data = np.array(data.expand(3, data.size()[1], data.size()[2]))
    datas.append(data)

100%|██████████| 149570/149570 [01:12<00:00, 2050.50it/s]


In [10]:
class MyBaseDataset(Dataset):
    def __init__(self, idxs, datas):
        self.idx = idxs
        self.img = datas


    def __len__(self): 
        return len(self.idx)
        
    def __getitem__(self, i): 
        return {'isbn' : torch.tensor(self.idx[i], dtype=torch.float32),
                'img' : torch.tensor(self.img[i], dtype=torch.float32),}

dataset = MyBaseDataset(idxs, datas)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=0, shuffle=True)
pred_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=0, shuffle=False)

In [11]:
class Autoencoder(nn.Module):
    def __init__(self, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder_1 = nn.Sequential(
            nn.Conv2d(3, 14, 5, padding = 2),
            nn.BatchNorm2d(14),
            nn.ReLU(),
            nn.Conv2d(14, 7, 5, padding = 2),
            nn.BatchNorm2d(7),
            nn.ReLU(),
        )

        self.encoder_2 = nn.Sequential(
            nn.Linear(7 * 32 * 32, hidden_dim),
        )

        self.decoder_1 = nn.Sequential(
            nn.Linear(hidden_dim, 7 * 32 * 32),
            nn.BatchNorm1d(7 * 32 * 32),
            nn.ReLU(),
        )

        self.decoder_2 = nn.Sequential(
            nn.Conv2d(7, 14, 5, padding = 2),
            nn.BatchNorm2d(14),
            nn.ReLU(),
            nn.Conv2d(14, 3, 5, padding = 2),
        )

        
    
  #인코더와 디코더 연산을 차례대로 수행하도록 설정 
    def forward(self, x):
        encoded = self.encoder_1(x)
        encoded = encoded.view(-1, 7 * 32 * 32)
        encoded = self.encoder_2(encoded)

        decoded = self.decoder_1(encoded)
        decoded = decoded.view(-1, 7, 32, 32) 
        decoded = self.decoder_2(decoded)
        return decoded, encoded

In [12]:
model = Autoencoder(5).to(device)
optimizer =  torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.MSELoss()

for epoch in range(EPOCH):

    running_loss = 0.0

    for i, data in enumerate(train_loader, 0):
        data = data['img'].to(device)
       
        optimizer.zero_grad() # 매개변수를 0으로 만듭니다.

        outputs, _ = model(data) # 입력값을 넣어 순전파를 진행시킨뒤 결과값 배출
        _loss = loss(outputs, data) # 결과와 실제 값을 손실함수에 대입
        _loss.backward() # 손실함수에서 역전파 수행
        optimizer.step() # 옵티마이저를 사용해 매개변수 최적화

        running_loss += _loss.item()
    
    print(running_loss/len(books))

4.8158587705441843e-05
3.071391022636936e-05
2.8328177395045157e-05


In [6]:
image_vector = pd.DataFrame()

model.eval()

with torch.no_grad():
    for i, data in enumerate(pred_loader):
        _, out_data = model(data['img'].to(device))
        image_vector = pd.concat([image_vector, pd.DataFrame(out_data.squeeze().detach().cpu().numpy())])

image_vector.columns = ['v1','v2','v3','v4','v5']
image_vector.reset_index(drop = True, inplace= True)
books = pd.concat([books, image_vector], axis = 1)
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,v1,v2,v3,v4,v5
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",../data/images/0002005018.01.THUMBZZZ.jpg,0.111604,2.208946,10.051038,33.557636,43.056538
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",../data/images/0060973129.01.THUMBZZZ.jpg,-60.860321,-66.590637,40.232693,-30.225893,-28.444817
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",../data/images/0374157065.01.THUMBZZZ.jpg,-54.511848,0.230044,56.543148,143.358246,168.976364
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,../data/images/0399135782.01.THUMBZZZ.jpg,-60.860321,-66.590637,40.232693,-30.225893,-28.444817
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",../data/images/0425176428.01.THUMBZZZ.jpg,-27.195101,-23.11529,8.017411,-34.547516,-24.669533


In [7]:
books.to_csv(path+'ksy_books_img.csv', index=False)