In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from PIL import Image
import os



## 전처리

In [2]:
# 데이터 로드
books_df = pd.read_csv('../../data/books.csv')
users_df = pd.read_csv('../../data/users.csv')
train_ratings_df = pd.read_csv('../../data/train_ratings.csv')
test_ratings_df = pd.read_csv('../../data/test_ratings.csv')

In [3]:
users_df['age'] = pd.to_numeric(users_df['age'], errors='coerce')
median_age = users_df['age'].median()
users_df['age'] = users_df['age'].fillna(median_age)


In [4]:
books_df['year_of_publication'] = pd.to_numeric(books_df['year_of_publication'], errors='coerce')
median_year = books_df['year_of_publication'].median()
books_df['year_of_publication'] = books_df['year_of_publication'].fillna(median_year).astype(int)

In [5]:
# 데이터 병합
train_data = train_ratings_df.merge(books_df, on='isbn').merge(users_df, on='user_id')
test_data = test_ratings_df.merge(books_df, on='isbn').merge(users_df, on='user_id')


In [6]:
# User의 평균 평점 계산
user_avg_rating = train_data.groupby('user_id')['rating'].mean().reset_index(name='user_mean_rating')

# Book의 평균 평점 계산
book_avg_rating = train_data.groupby('isbn')['rating'].mean().reset_index(name='book_mean_rating')

# 원래 데이터프레임에 user와 book의 평균 평점 합치기
train_data = train_data.merge(user_avg_rating, on='user_id')
train_data = train_data.merge(book_avg_rating, on='isbn')

# 테스트 데이터에 훈련 데이터의 평균 평점 적용
test_data = test_data.merge(user_avg_rating, on='user_id', how='left')
test_data = test_data.merge(book_avg_rating, on='isbn', how='left')

In [7]:
train_data

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,location,age,user_mean_rating,book_mean_rating
0,8,0002005018,4,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,"timmins, ontario, canada",34.0,4.428571,6.857143
1,67544,0002005018,7,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,"toronto, ontario, canada",30.0,7.285714,6.857143
2,123629,0002005018,8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,"kingston, ontario, canada",34.0,8.000000,6.857143
3,200273,0002005018,8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,"comber, ontario, canada",34.0,8.000000,6.857143
4,210926,0002005018,9,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,"guelph, ontario, canada",34.0,8.400000,6.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306790,278843,0743525493,7,The Motley Fool's What To Do with Your Money N...,David Gardner,2002,Simon & Schuster Audio,http://images.amazon.com/images/P/0743525493.0...,,,,images/0743525493.01.THUMBZZZ.jpg,"pismo beach, california, usa",28.0,8.000000,7.000000
306791,278851,067161746X,6,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987,Pocket Books,http://images.amazon.com/images/P/067161746X.0...,en,['Humor'],A tongue-in-cheek survival guide for single pe...,images/067161746X.01.THUMBZZZ.jpg,"dallas, texas, usa",33.0,5.833333,6.000000
306792,278851,0884159221,7,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985,Lone Star Books,http://images.amazon.com/images/P/0884159221.0...,,,,images/0884159221.01.THUMBZZZ.jpg,"dallas, texas, usa",33.0,5.833333,7.000000
306793,278851,0912333022,7,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997,Kqed Books,http://images.amazon.com/images/P/0912333022.0...,en,['Fiction'],These hilarious stories by the creator of publ...,images/0912333022.01.THUMBZZZ.jpg,"dallas, texas, usa",33.0,5.833333,7.000000


In [8]:
# 필요한 특징 선택
features = ['user_id', 'isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher', 
            'age', 'img_path', 'summary', 'user_mean_rating', 'book_mean_rating']
train_data = train_data[features + ['rating']]
test_data = test_data[features]


In [9]:
# 텍스트 전처리를 위한 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [10]:
# 이미지 전처리를 위한 변환 정의
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [11]:
# 정규화 
scaler = StandardScaler()
numerical_features = ['year_of_publication', 'age', 'user_mean_rating', 'book_mean_rating']
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])


In [33]:

# 데이터셋 클래스 정의
class BookDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # 텍스트 처리
        text = f"{row['book_title']} {row['book_author']} {row['summary']}"

        text = ' '.join([str(item) if pd.notna(item) else '' for item in text.split()])
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # 이미지 처리
        # 이미지 경로 설정 시 중복되는 'images/' 부분 제거
        image_filename = row['img_path'].replace('images/', '')  # 'images/' 부분 제거
        image_path = os.path.join('/data/ephemeral/home/data/images', image_filename)
        #image_path = os.path.join('/data/ephemeral/home/data/images', row['img_path'])

        image = Image.open(image_path).convert('RGB')
        image = image_transform(image)

        # 수치형 특징
        numerical = torch.tensor([row[feature] for feature in numerical_features], dtype=torch.float)

        # 레이블 (학습 데이터의 경우)
        if 'rating' in row:
            label = torch.tensor(row['rating'], dtype=torch.float)
        else:
            label = torch.tensor(0, dtype=torch.float)  # 테스트 데이터의 경우 더미 값

        return {
            'text_ids': encoding['input_ids'].flatten(),
            'text_mask': encoding['attention_mask'].flatten(),
            'image': image,
            'numerical': numerical,
            'label': label
        }

In [34]:
from sklearn.model_selection import train_test_split

train_data_1, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [35]:
train_dataset = BookDataset(train_data, tokenizer)
val_dataset = BookDataset(val_data, tokenizer)
test_dataset = BookDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [36]:

class MultimodalBookRatingModel(nn.Module):
    def __init__(self, num_numerical_features):
        super().__init__()
        
        # 이미지 인코더
        self.image_encoder = models.resnet18(pretrained=True)
        self.image_encoder.fc = nn.Linear(self.image_encoder.fc.in_features, 128)
        
        # 텍스트 인코더
        self.text_encoder = BertModel.from_pretrained("google/electra-small-discriminator") # 기존: 'bert-base-uncased'
        self.text_fc = nn.Linear(768, 128)
        
        # 정형 데이터 인코더
        self.numerical_encoder = nn.Sequential(
            nn.Linear(num_numerical_features, 64),
            nn.ReLU(),
            nn.Linear(64, 128)
        )
        
        # 멀티모달 퓨전
        self.fusion = nn.Sequential(
            nn.Linear(128 * 3, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        
    def forward(self, image, text_ids, text_mask, numerical_features):
        # 이미지 인코딩
        image_features = self.image_encoder(image)
        
        # 텍스트 인코딩
        text_output = self.text_encoder(input_ids=text_ids, attention_mask=text_mask)
        text_features = self.text_fc(text_output.last_hidden_state[:, 0, :])
        
        # 정형 데이터 인코딩
        numerical_features = self.numerical_encoder(numerical_features)
        
        # 특징 결합
        combined_features = torch.cat([image_features, text_features, numerical_features], dim=1)
        
        # 평점 예측
        rating = self.fusion(combined_features)
        
        return rating.squeeze()

In [37]:
# 모델 초기화
model = MultimodalBookRatingModel(num_numerical_features=len(numerical_features))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# 손실 함수와 옵티마이저 정의
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [38]:
# 학습 함수
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        
        text_ids = batch['text_ids'].to(device)
        text_mask = batch['text_mask'].to(device)
        image = batch['image'].to(device)
        numerical = batch['numerical'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(image, text_ids, text_mask, numerical)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(data_loader)

# 평가 함수
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            text_ids = batch['text_ids'].to(device)
            text_mask = batch['text_mask'].to(device)
            image = batch['image'].to(device)
            numerical = batch['numerical'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(image, text_ids, text_mask, numerical)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
    
    return total_loss / len(data_loader)

In [39]:
# 학습 루프
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

KeyboardInterrupt: 

In [None]:
# 테스트 데이터에 대한 예측
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        text_ids = batch['text_ids'].to(device)
        text_mask = batch['text_mask'].to(device)
        image = batch['image'].to(device)
        numerical = batch['numerical'].to(device)
        
        outputs = model(image, text_ids, text_mask, numerical)
        predictions.extend(outputs.cpu().numpy())

# 예측 결과를 테스트 데이터프레임에 추가
test_data['rating'] = predictions

# 결과 저장
submission = test_data[['user_id', 'isbn', 'predicted_rating']]
submission.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 결과는 'submission.csv' 파일에 저장되었습니다.")

예측이 완료되었습니다. 결과는 'submission.csv' 파일에 저장되었습니다.
