In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from tqdm import tqdm

from torch_utils import MovieRatingDataset, Recommender, device

In [2]:
# Tokenizer shared by the vocabulary builder and by every dataset split below.
tokenizer = get_tokenizer(tokenizer='basic_english')

# Build the tag vocabulary once so that all splits index tags identically.
tag_vocab = MovieRatingDataset.build_tag_vocab(tokenizer)

In [3]:
# Every split is built from the same vocabulary and tokenizer; only the
# `train` selector ('train' / 'valid' / 'test') differs between them.
dataset_kwargs = dict(tag_vocab=tag_vocab, tokenizer=tokenizer)

train_data = MovieRatingDataset(train='train', **dataset_kwargs)
valid_data = MovieRatingDataset(train='valid', **dataset_kwargs)
test_data = MovieRatingDataset(train='test', **dataset_kwargs)

In [4]:
# Dimensionality of the pretrained tag vectors; reused when building the model.
tag_embeds = 50

# Pretrained GloVe '6B' vectors of that dimensionality.
vec = GloVe(name='6B', dim=tag_embeds)

# One embedding row per vocabulary entry, in the order given by get_itos().
vocab_tokens = train_data.vocab.get_itos()
embeddings = vec.get_vecs_by_tokens(vocab_tokens, lower_case_backup=True)

In [5]:
# Sanity check: the row of `embeddings` addressed by a token's vocabulary
# index must match the vector GloVe returns for that same token.
for token in train_data.vocab.get_itos():
    token_row = train_data.vocab[token]
    assert np.allclose(vec[token], embeddings[token_row])

In [6]:
# Sizes of the categorical inputs, all derived from the training split.
user_count = len(train_data.data_df['user_label'].unique())
movie_count = train_data.movies.shape[0]
genre_count = len(train_data.genres_df.columns)
tag_count = len(train_data.vocab)

# The tag embedding table is initialised from the pretrained `embeddings`.
# NOTE(review): freeze=True presumably keeps those weights fixed during
# training -- confirm against Recommender in torch_utils.
model = Recommender(
    num_users=user_count,
    num_movies=movie_count,
    num_genres=genre_count,
    num_tags=tag_count,
    user_movie_embed=10,
    tag_embed=tag_embeds,
    tag_weights=embeddings,
    device=device,
    freeze=True,
)

In [7]:
# List the parameters the optimizer will actually update (requires_grad=True).
trainable_names = [
    param_name
    for param_name, parameter in model.named_parameters()
    if parameter.requires_grad
]
for param_name in trainable_names:
    print(param_name)

user_embedding.weight
movie_embedding.weight
fc.0.weight
fc.0.bias
fc.3.weight
fc.3.bias


In [8]:
# Regression objective: mean squared error between predicted and true ratings.
loss = nn.MSELoss()

# Adam over the model's parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# All three loaders use the same batch size.
loader_batch_size = 1024

# Training batches are reshuffled every epoch.
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=loader_batch_size,
    shuffle=True,
    collate_fn=model.collate_fn,
)

# Validation batches keep the dataset order.
valid_data_loader = DataLoader(
    dataset=valid_data,
    batch_size=loader_batch_size,
    shuffle=False,
    collate_fn=model.collate_fn,
)

# The test split keeps the dataset order and uses its own collate function.
test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=loader_batch_size,
    shuffle=False,
    collate_fn=model.collate_fn_test,
)

In [9]:
# Training loop: one optimisation pass over the training loader per epoch,
# followed by an RMSE evaluation on the validation loader.
# `tqdm` is imported in the notebook's top import cell.
epochs = 50
for epoch in range(epochs):
    model.train()
    # Right-align the epoch number in a 4-character field, e.g. "[Epoch   7]".
    desc_string = f"[Epoch{epoch:>4}]"

    # Accumulate each batch loss weighted by its batch size so the value
    # reported below is the mean over the whole epoch rather than the loss of
    # whichever batch happened to come last.
    train_loss_sum = 0.0
    train_sample_count = 0
    for ratings, inputs in tqdm(train_data_loader, desc=desc_string):
        optimizer.zero_grad()
        batch_predictions = model(inputs)
        loss_value = loss(batch_predictions, ratings)
        loss_value.backward()
        optimizer.step()

        samples_in_batch = len(ratings)
        train_loss_sum += loss_value.item() * samples_in_batch
        train_sample_count += samples_in_batch
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    train_loss = train_loss_sum / max(train_sample_count, 1)
    tqdm.write(f"Train MSE Loss: {train_loss:.3f}")

    # Evaluate on the validation set without tracking gradients.
    model.eval()
    with torch.no_grad():
        valid_predictions = []
        valid_ratings = []
        for ratings, inputs in valid_data_loader:
            valid_predictions.append(model(inputs))
            valid_ratings.append(ratings)
        y_pred = torch.cat(valid_predictions).cpu().numpy()
        y_true = torch.cat(valid_ratings).cpu().numpy()
        # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error is deprecated in scikit-learn 1.4 and removed in
        # 1.6, whereas this form works on every version.
        valid_loss = np.sqrt(mean_squared_error(y_true, y_pred))
        tqdm.write(f"Validation RMSE Loss: {valid_loss:.3f}")

[Epoch   0]:  73%|███████▎  | 58/79 [00:13<00:04,  4.31it/s]


KeyboardInterrupt: 

In [None]:
# Run inference over the test split and collect one predicted rating per row.
model.eval()
predictions = {}  # maps row_id -> predicted rating
# test_data_loader is built with shuffle=False, so batches are visited in
# dataset order and the dict is filled in that same order.
with torch.no_grad():
    for batch in tqdm(test_data_loader, leave=True, desc="Predictions"):
        row_ids, users, movies, genres, tags = batch
        model_inputs = (users, movies, genres, tags)
        batch_scores = model(model_inputs)
        # Convert each id / score pair to plain Python scalars before storing.
        predictions.update(
            (single_id.item(), single_score.item())
            for single_id, single_score in zip(row_ids, batch_scores)
        )

Predictions: 100%|██████████| 20/20 [00:04<00:00,  4.37it/s]


In [None]:
# Turn the {row_id: rating} dict into a one-column frame: the dict keys become
# the index (named 'Id') and the values fill the 'rating' column.
submission_df = (
    pd.DataFrame
    .from_dict(predictions, orient='index', columns=['rating'])
    .rename_axis('Id')
)
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,2.989005
1,3.275538
2,2.344338
3,3.810655
4,3.550009


In [None]:
# Write the submission to disk; the 'Id' index is written as the first column.
submission_path = 'submission_new_approach.csv'
submission_df.to_csv(submission_path)