In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dropout, Dense
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [2]:
# 데이터 로드

column_names = ["user_id", "movie_id", "rating", "timestamp"]
ratings_df = pd.read_csv('../../data/ratings.dat', sep="::", names=column_names, engine='python')

# Parameters
EMBEDDING_DIM = 50
SEQUENCE_LENGTH = 5  # The latest 5 ratings for each user as a sequence

num_movies = ratings_df['movie_id'].max()
num_users = ratings_df['user_id'].max()

def generate_train_sequences(ratings_df):
    user_sequences = ratings_df.groupby('user_id')['movie_id'].apply(list).tolist()
    
    user_ids, sequence_data = [], []
    for user_id, seq in enumerate(user_sequences, 1):
        if len(seq) >= SEQUENCE_LENGTH:
            user_ids.append(user_id)
            sequence_data.append(seq[-SEQUENCE_LENGTH:])  # Taking the last SEQUENCE_LENGTH ratings

    return np.array(user_ids), np.array(sequence_data)

# Splitting data into training and validation sets
train_ratings, val_ratings = train_test_split(ratings_df, test_size=0.2, random_state=42)
train_users, train_sequences = generate_train_sequences(train_ratings)
val_users, val_sequences = generate_train_sequences(val_ratings)

# Model Architecture
user_input = Input(shape=(SEQUENCE_LENGTH,), dtype='int32', name='user_sequence_input')
user_embedding = Embedding(num_users + 1, EMBEDDING_DIM, input_length=SEQUENCE_LENGTH, name='user_embedding', embeddings_regularizer=l2(0.001))(user_input)

lstm_out = LSTM(100)(user_embedding)
lstm_out = Dropout(0.5)(lstm_out)
output = Dense(num_movies + 1, activation='softmax')(lstm_out)

model = Model(inputs=user_input, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Preparing labels for training
train_labels = np.zeros((train_sequences.shape[0], num_movies + 1))
for idx, seq in enumerate(train_sequences):
    train_labels[idx, seq[-1]] = 1  # The last movie in the sequence is our "target" movie

# Model Training
model.fit(train_sequences, train_labels, epochs=5, batch_size=64)

predictions = model.predict(np.array([seq]))

def top_n_recommendation(model, sequence, num_movies=5):
    predictions = model.predict(np.array([sequence]))
    recommended_indices = np.argsort(-predictions[0])[:num_movies]
    return recommended_indices

# Performance Metrics
def calculate_precision_recall_ndcg(model, sequences, N=10):
    precisions, recalls, ndcgs = [], [], []
    for seq in sequences:
        true_movie = seq[-1]  # 가장 최근에 본 영화
        predictions = model.predict(np.array([seq]))
        recommended_movies = top_n_recommendation(model, seq, N)
        
        # Precision & Recall 계산
        precisions.append(1 if true_movie in recommended_movies else 0)
        recalls.append(1 if true_movie in recommended_movies[:len(seq)] else 0)
        
        # NDCG 계산
        true_relevance = np.zeros(N)
        true_relevance[0] = 1 if true_movie == recommended_movies[0] else 0
        ndcgs.append(ndcg_score([true_relevance], [predictions[0][recommended_movies]]))

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)


precision, recall, ndcg = calculate_precision_recall_ndcg(model, val_sequences)
print(f"Precision@10{SEQUENCE_LENGTH}: {precision:.4f}")
print(f"Recall@10{SEQUENCE_LENGTH}: {recall:.4f}")
print(f"NDCG@10{SEQUENCE_LENGTH}: {ndcg:.4f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Precision@105: 0.0361
Recall@105: 0.0186
NDCG@105: 0.0043
