In [None]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from gensim.models import FastText
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [2]:
data_path = './data/train'
train_df = pd.read_csv(os.path.join(data_path, 'train_ratings.csv')) # 전체 학습 데이터

In [3]:
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

In [4]:
train_df['watch_date'] = pd.to_datetime(train_df['time'], unit='s')
train_df.drop(columns=['time'], inplace=True)


In [6]:
all_data = pd.merge(train_df, title_data, on='item', how='left').merge(year_data, on='item', how='left')

# 시계열 정보 추출
all_data['watch_year'] = all_data['watch_date'].dt.year
# all_data['watch_month'] = all_data['watch_date'].dt.month
# all_data['watch_day'] = all_data['watch_date'].dt.day

# 아이템의 year와의 차이
# all_data['year_diff'] = all_data['watch_year'] - all_data['year']

# watch_date 컬럼 삭제
all_data.drop(columns=['watch_date'], inplace=True)

# 34048번 영화 제목 수정
all_data.loc[all_data['item'] == 34048, 'title'] = all_data.loc[all_data['item'] == 34048, 'title'] + " Extended"


In [55]:
# title에서 연도 추출
all_data['extracted_year'] = all_data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else None)

# 결측치 채우기
all_data['year'] = all_data['year'].fillna(all_data['extracted_year'])
# int32로 변환
all_data['year'] = all_data['year'].astype('int32')

# 불필요한 열 삭제
all_data.drop(columns=['extracted_year'], inplace=True)

# title에서 연도 제거
all_data['title'] = all_data['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)', '', x))


In [12]:
# Title 데이터를 공백으로 분리한 토큰 리스트로 변환
all_data['title_tokens'] = all_data['title'].apply(lambda x: x.split())

# FastText 모델 학습
fasttext_model = FastText(
    sentences=all_data['title_tokens'],  # Tokenized titles
    vector_size=50,                     # Embedding dimension
    window=3,                           # Context window size
    min_count=1,                        # 최소 등장 횟수 (1로 설정하면 모든 단어 학습)
    sg=1,                               # Skip-gram 사용 (0: CBOW, 1: Skip-gram)
    epochs=10                           # 학습 에폭 수
)

# 각 Title의 임베딩 생성 (단어 벡터의 평균 사용)
def get_title_embedding(title_tokens, model, vector_size):
    vectors = [model.wv[word] for word in title_tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)  # 단어 벡터의 평균
    else:
        return np.zeros(vector_size)    # OOV일 경우 0 벡터 반환

# Title 임베딩 추가
embedding_dim = 50
all_data['title_embedding'] = all_data['title_tokens'].apply(
    lambda x: get_title_embedding(x, fasttext_model, embedding_dim)
)

all_data = all_data.drop(columns=['title_tokens', 'title'])

In [18]:
# genre_data에서 장르를 리스트로 변환
genre_data['genre_list'] = genre_data['genre'].apply(lambda x: x.split(','))

# item별로 장르 리스트를 결합
genres_per_item = genre_data.groupby('item')['genre_list'].agg(lambda x: sum(x, [])).reset_index()

# all_data에 병합
all_data = pd.merge(all_data, genres_per_item, on='item', how='left')

# 모든 고유 장르 리스트 추출
all_genres = set([genre for sublist in all_data['genre_list'] for genre in sublist])
all_genres = sorted(list(all_genres))  # 정렬된 고유 장르 리스트

# 장르별 인덱스 매핑
genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}


# Multi-hot Encoding 변환 함수
def encode_genre_list(genre_list, genre_to_idx, num_genres):
    multi_hot = np.zeros(num_genres, dtype=int)
    for genre in genre_list:
        if genre in genre_to_idx:
            multi_hot[genre_to_idx[genre]] = 1
    return multi_hot

# Multi-hot Encoding 적용
num_genres = len(all_genres)
all_data['genre_embedding'] = all_data['genre_list'].apply(lambda x: encode_genre_list(x, genre_to_idx, num_genres))

all_data = all_data.drop(columns=['genre_list'])

In [7]:
# Label Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# 유저와 아이템을 고유한 정수로 변환
all_data['user_encoded'] = user_encoder.fit_transform(all_data['user'])
all_data['item_encoded'] = item_encoder.fit_transform(all_data['item'])
columns = ['user_encoded', 'item_encoded'] + [col for col in all_data.columns if col not in ['user_encoded', 'item_encoded']]
all_data = all_data[columns]

# User ID 매핑
user_id_mapping = dict(zip(all_data['user_encoded'], all_data['user']))

# Item ID 매핑
item_id_mapping = dict(zip(all_data['item_encoded'], all_data['item']))

모델링

In [6]:
class DeepFM(nn.Module):
    def __init__(self, input_dims, embedding_dim, mlp_dims, drop_rate=0.1):
        super(DeepFM, self).__init__()
        
        self.total_input_dim = sum(input_dims)
        self.embedding_dim = embedding_dim
        # 범주형 변수의 개수
        self.num_categorical_features = len(input_dims)
        continuous_columns = ['genre_embedding', 'year']
        # 연속형 변수의 총 차원 수
        self.total_continuous_feature_dim = sum(1 if col in ['year'] else all_data[col].iloc[0].shape[0] for col in continuous_columns if col in all_data.columns)

        print(f"Input Dims: {input_dims}")
        print(f"num_categorical_features: {self.num_categorical_features}")
        print(f"total_continuous_feature_dim: {self.total_continuous_feature_dim}")
        print(f"embedding_dim: {self.embedding_dim}")

        # Embedding layer for categorical variables
        self.embedding = nn.Embedding(self.total_input_dim, embedding_dim)

        # FM components
        self.fc = nn.Embedding(self.total_input_dim, 1)
        self.bias = nn.Parameter(torch.zeros((1,)))

        # Continuous features' linear transformation
        self.continuous_linear = nn.Linear(self.total_continuous_feature_dim, self.total_continuous_feature_dim * embedding_dim)
        self.embedding_dim_total = self.num_categorical_features * embedding_dim + self.total_continuous_feature_dim * embedding_dim

        # MLP components
        self.mlp_input_dim = (self.num_categorical_features * embedding_dim) + (self.total_continuous_feature_dim * embedding_dim)
        print(f"Calculated MLP Input Dimension: {self.mlp_input_dim}")
        mlp_layers = []
        for i, dim in enumerate(mlp_dims):
            if i == 0:
                mlp_layers.append(nn.Linear(self.mlp_input_dim, dim))
            else:
                mlp_layers.append(nn.Linear(mlp_dims[i-1], dim))
            mlp_layers.append(nn.ReLU())
            mlp_layers.append(nn.Dropout(drop_rate))
        mlp_layers.append(nn.Linear(mlp_dims[-1], 1))
        self.mlp_layers = nn.Sequential(*mlp_layers)

    def fm(self, x_categorical, x_continuous):
        # Embedding lookup for categorical variables
        embed_x = self.embedding(x_categorical)  # (batch_size, num_categorical_features, embedding_dim) = (2048, 2, 64)

        # Transform continuous features
        x_continuous_transformed = self.continuous_linear(x_continuous.float())  # (batch_size, total_continuous_feature_dim * embedding_dim) = (2048, 20*64=1280)
        x_continuous_transformed = x_continuous_transformed.view(
            -1, self.total_continuous_feature_dim, self.embedding_dim
        )  # (batch_size, total_continuous_feature_dim, embedding_dim) = (2048, 20, 64)

        # Concatenate embeddings and continuous features
        fm_input = torch.cat([embed_x, x_continuous_transformed], dim=1)  # (batch_size, total_features, embedding_dim) = (2048, 22, 64)

        # Linear term
        linear_part = torch.sum(self.fc(x_categorical), dim=1) + self.bias  # (batch_size, 1)
        linear_part += torch.sum(x_continuous, dim=1, keepdim=True)

        # Pairwise interaction term
        square_of_sum = torch.sum(fm_input, dim=1) ** 2  # (batch_size, embedding_dim) = (2048, 64)
        sum_of_square = torch.sum(fm_input ** 2, dim=1)  # (batch_size, embedding_dim) = (2048, 64)
        interaction_part = 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdim=True)  # (batch_size, 1)

        fm_y = linear_part + interaction_part
        return fm_y

    def mlp(self, x_categorical, x_continuous):
        # Embedding lookup for categorical variables
        embed_x = self.embedding(x_categorical)  # (batch_size, num_categorical_features, embedding_dim)
        # print(f"MLP - Embed_x Shape: {embed_x.shape}")  # 확인: (1024, num_categorical_features, embedding_dim)
        embed_x = embed_x.view(embed_x.size(0), -1)  # Flatten embeddings
        # print(f"MLP - Flattened Embed_x Shap!!!e: {embed_x.shape}")  # 확인: (1024, num_categorical_features * embedding_dim)

        # Transform continuous features
        x_continuous_transformed = self.continuous_linear(x_continuous)  # (batch_size, total_continuous_feature_dim * embedding_dim)
        # print(f"MLP - x_continuous_transformed Shape: {x_continuous_transformed.shape}")  # 확인: (1024, total_continuous_feature_dim * embedding_dim)

        # Concatenate embeddings and continuous features
        combined_features = torch.cat([embed_x, x_continuous_transformed], dim=1)  # (batch_size, mlp_input_dim) = (1024, (4+68)*64=4608)
        # print(f"MLP - Combined Features Shape: {combined_features.shape}")  # 확인: (1024, mlp_input_dim)
        
        # MLP forward pass
        mlp_y = self.mlp_layers(combined_features)  # (batch_size, 1)
        # print(f"MLP - MLP Output Shape: {mlp_y.shape}")  # 확인: (1024, 1)
        return mlp_y

    def forward(self, x_categorical, x_continuous):
        # x_categorical: (batch_size, num_categorical_features)
        # x_continuous: (batch_size, total_continuous_feature_dim)

        # FM component
        fm_y = self.fm(x_categorical, x_continuous)  # (batch_size, 1)
        
        # MLP component
        mlp_y = self.mlp(x_categorical, x_continuous)  # (batch_size, 1)
        
        # Combine FM and MLP components
        y = fm_y + mlp_y
        
        return y.squeeze(1)  # (batch_size,) 


In [7]:
# Device 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# 각 범주형 변수의 고유값 개수
user_dim = all_data['user_encoded'].nunique()
item_dim = all_data['item_encoded'].nunique()


# 범주형 변수의 고유값 개수를 리스트로 전달
input_dims = [user_dim, item_dim] 

# 임베딩 차원 설정
embedding_dim = 32

# MLP 레이어 차원 설정
mlp_dims = [64, 32, 16]

# 드롭아웃 비율 설정
drop_rate = 0.1

# DeepFM 모델 초기화
model = DeepFM(input_dims=input_dims, 
               embedding_dim=embedding_dim, mlp_dims=mlp_dims, drop_rate=drop_rate).to(device)


Input Dims: [31360, 6807]
num_categorical_features: 2
total_continuous_feature_dim: 19
embedding_dim: 32
Calculated MLP Input Dimension: 672


In [15]:
class PairwiseRecommendationDataset(Dataset):
    def __init__(self, user_col, item_col, user_item_dict, all_items, item_probs, user_item_embeddings, num_negatives=1):
        self.user_col = user_col.cpu()
        self.item_col = item_col.cpu()
        self.user_item_dict = user_item_dict
        self.all_items = np.array(all_items)
        self.item_probs = item_probs
        self.num_negatives = num_negatives
        self.user_item_embeddings = user_item_embeddings  # 사용자-아이템 임베딩

    def __len__(self):
        return len(self.user_col)

    def __getitem__(self, idx):
        user = self.user_col[idx]
        pos_item = self.item_col[idx]

        # Negative 샘플링: 사용자-아이템 임베딩 유사도 기반
        neg_items = self.sample_similarity_negatives(user)

        return user, pos_item, torch.tensor(neg_items, dtype=torch.long)

    def sample_similarity_negatives(self, user):
        user_id = user.item()
        seen_items = self.user_item_dict[user_id]

        # 유저 임베딩
        user_embedding = self.user_item_embeddings[user_id]
        # 모든 아이템 임베딩과의 유사도 계산 (Cosine Similarity)
        item_embeddings = self.user_item_embeddings[len(self.user_item_dict):]
        similarities = np.dot(item_embeddings, user_embedding) / (
            np.linalg.norm(item_embeddings, axis=1) * np.linalg.norm(user_embedding)
        )

        # 유사도가 낮은 아이템 중 Negative 샘플 선택
        neg_items = []
        while len(neg_items) < self.num_negatives:
            neg_item = np.random.choice(self.all_items, p=self.item_probs)
            if neg_item not in seen_items:
                neg_items.append(neg_item)
        return neg_items

In [16]:
class LambdaRankLoss(nn.Module):
    def __init__(self):
        super(LambdaRankLoss, self).__init__()

    def forward(self, pos_preds, neg_preds):
        """
        pos_preds: Positive 샘플의 예측값 (batch_size,)
        neg_preds: Negative 샘플의 예측값 (batch_size, num_negatives)
        """
        # Difference between positive and negative predictions
        diff = pos_preds.unsqueeze(1) - neg_preds  # (batch_size, num_negatives)
        # LambdaRank Loss 계산
        loss = -torch.log(torch.sigmoid(diff) + 1e-8).sum(dim=1).mean()
        return loss

In [12]:
# 유저별 시청 아이템 딕셔너리 생성
user_item_dict = all_data.groupby('user_encoded')['item_encoded'].apply(set).to_dict()

# Categorical Features (long 타입)
user_col = torch.tensor(all_data['user_encoded'].values, dtype=torch.long).to(device)
item_col = torch.tensor(all_data['item_encoded'].values, dtype=torch.long).to(device)

# Continuous Features (float 타입)
genre_col = torch.tensor(all_data['genre_embedding'].tolist(), dtype=torch.float32).to(device)

# 'year'를 numpy 배열로 미리 준비
all_years = all_data['year'].values

# DataLoader 생성
dataset = PairwiseRecommendationDataset(
    user_col=user_col,
    item_col=item_col,
    user_item_dict=user_item_dict,
    all_items=all_items,
    item_probs=item_probs,
    user_item_embeddings=user_item_embeddings,
    num_negatives=10  # Negative 샘플 수
)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

  genre_col = torch.tensor(all_data['genre_embedding'].tolist(), dtype=torch.float32).to(device)


In [13]:
def train_pairwise(model, train_loader, optimizer, epochs=10):
    model.train()
    criterion = LambdaRankLoss()

    for epoch in range(epochs):
        total_loss = 0
        for user, pos_item, neg_items in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            user = user.to(device, dtype=torch.long)
            pos_item = pos_item.to(device, dtype=torch.long)
            neg_items = neg_items.to(device, dtype=torch.long)

            optimizer.zero_grad()

            # Positive 샘플
            pos_genre = genre_col[pos_item]
            pos_year = torch.tensor(all_years[pos_item.cpu().numpy()], dtype=torch.float32, device=device).unsqueeze(1)
            pos_continuous = torch.cat([pos_genre, pos_year], dim=1)
            x_categorical_pos = torch.stack([user, pos_item], dim=1)
            pos_preds = model(x_categorical_pos, pos_continuous)

            # Negative 샘플
            neg_items_flat = neg_items.view(-1)
            user_neg = user.unsqueeze(1).expand(-1, neg_items.shape[1]).reshape(-1)
            neg_genre = genre_col[neg_items_flat]
            neg_year = torch.tensor(all_years[neg_items_flat.cpu().numpy()], dtype=torch.float32, device=device).unsqueeze(1)
            neg_continuous = torch.cat([neg_genre, neg_year], dim=1)
            x_categorical_neg = torch.stack([user_neg, neg_items_flat], dim=1)
            neg_preds = model(x_categorical_neg, neg_continuous).view(neg_items.shape[0], neg_items.shape[1])

            # LambdaRank Loss 계산
            loss = criterion(pos_preds, neg_preds)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {avg_loss:.4f}")


In [None]:
# model.load_state_dict(torch.load('deepfm_model.pth'))

In [None]:
# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# 모델 학습
train_pairwise(model, train_loader, optimizer, epochs=10)


Epoch 1/10:   5%|▌         | 4316/80539 [07:22<2:10:15,  9.75it/s]


KeyboardInterrupt: 

In [61]:
def recommend_top_k(model, user_item_dict, all_items, k=10):
    """
    모델을 사용하여 모든 유저에 대해 상위 K개의 영화를 추천합니다.
    
    Args:
    - model: 훈련된 DeepFM 모델.
    - user_item_dict: 유저별 시청한 영화 딕셔너리 {user_id: set(movies_seen)}.
    - all_items: 전체 아이템 리스트.
    - k: 추천할 영화 개수.
    
    Returns:
    - recommendations: 추천 결과 리스트 [{"user": user_id, "item": movie_id}, ...]
    """
    model.eval()
    recommendations = []
    
    with torch.no_grad():
        for user_id in user_item_dict.keys():
            # 유저가 본 적 없는 영화
            unseen_items = list(set(all_items) - user_item_dict[user_id])
            
            if len(unseen_items) == 0:
                continue  # 모든 영화를 이미 본 경우
            
            # 유저 ID와 미시청 영화 ID 텐서 생성
            user_tensor = torch.tensor([user_id] * len(unseen_items), dtype=torch.long, device=device)
            item_tensor = torch.tensor(unseen_items, dtype=torch.long, device=device)
            
            # Title과 Genre 임베딩
            genre_tensor = genre_col[item_tensor].to(device)
            
            # Year과 Watch Year
            year_tensor = torch.tensor(
                all_data['year'].values[item_tensor.cpu().numpy()],
                dtype=torch.float32,
                device=device
            ).unsqueeze(1)
            
            watch_year_tensor = torch.tensor(
                all_data['watch_year'].values[item_tensor.cpu().numpy()],
                dtype=torch.float32,
                device=device
            ).unsqueeze(1)
            
            # Continuous Features 결합
            continuous_features = torch.cat([genre_tensor, year_tensor, watch_year_tensor], dim=1)
            
            # Categorical Features 결합
            categorical_features = torch.stack([user_tensor, item_tensor], dim=1)
            
            # 모델로 점수 예측
            scores = model(categorical_features, continuous_features)
            
            # 상위 K개 아이템 선택
            top_k_indices = torch.topk(scores, k=k).indices.cpu().numpy()
            top_k_items = item_tensor[top_k_indices].cpu().numpy()
            
            # 추천 결과 저장
            for item_id in top_k_items:
                recommendations.append({"user": user_id, "item": item_id})
    
    return recommendations


In [None]:
# 1. 인코딩된 ID를 원래의 ID로 매핑하기 위한 딕셔너리 생성
user_id = dict(zip(all_data['user_encoded'], all_data['user']))
item_id = dict(zip(all_data['item_encoded'], all_data['item']))

# 2. 추천 결과를 원래의 ID로 복원하여 데이터프레임 생성
recommendation_list = []

recommendations = recommend_top_k(model, user_item_dict, all_items, k=10)

for user_encoded, item_encoded_list in recommendations.items():
    # 원래의 유저 ID로 변환
    user = user_id[user_encoded]
    
    for item_encoded in item_encoded_list:
        # 원래의 아이템 ID로 변환
        item = item_id[item_encoded]
        
        # 추천 결과 추가
        recommendation_list.append({
            'user': user,
            'item': item
        })

# 추천 결과를 데이터프레임으로 변환
recommendations_df = pd.DataFrame(recommendation_list)

# 3. CSV 파일로 저장
recommendations_df.to_csv('recommendations.csv', index=False)
print("추천 결과가 'recommendations.csv' 파일로 저장되었습니다.")

NameError: name 'recommend_top_k' is not defined