In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from tqdm import tqdm

from torch_utils import MovieRatingDataset, Recommender, device

In [2]:
# torchtext's built-in 'basic_english' tokenizer, shared by the vocabulary
# builder and by both dataset splits.
tokenizer = get_tokenizer(tokenizer='basic_english')

# Tag vocabulary built by the dataset class using that tokenizer.
tag_vocab = MovieRatingDataset.build_tag_vocab(tokenizer)

In [3]:
# Both splits are built from the same tag vocabulary and tokenizer; only the
# `train` selector differs between them.
shared_dataset_kwargs = dict(tag_vocab=tag_vocab, tokenizer=tokenizer)

train_data = MovieRatingDataset(train='train', **shared_dataset_kwargs)
test_data = MovieRatingDataset(train='test', **shared_dataset_kwargs)

In [4]:
# Dimensionality of the pretrained tag vectors; the same value is later handed
# to the model as `tag_embed`.
tag_embeds = 50

# Pretrained GloVe '6B' vectors at that dimensionality.
vec = GloVe(name='6B', dim=tag_embeds)

# Look up one vector per vocabulary token, in the order returned by get_itos().
# lower_case_backup=True retries a token in lower case when the exact form
# has no pretrained vector.
vocab_tokens = train_data.vocab.get_itos()
embeddings = vec.get_vecs_by_tokens(vocab_tokens, lower_case_backup=True)

In [5]:
# Sizes of the categorical inputs, all read from the training split.
n_users = len(train_data.data_df['user_label'].unique())
n_movies = train_data.movies.shape[0]
n_genres = len(train_data.genres_df.columns)
n_tags = len(train_data.vocab)

# NOTE(review): freeze=True presumably keeps the pretrained tag embeddings
# fixed during training -- confirm against Recommender in torch_utils.
model = Recommender(
    num_users=n_users,
    num_movies=n_movies,
    num_genres=n_genres,
    num_tags=n_tags,
    user_movie_embed=10,
    tag_embed=tag_embeds,
    tag_weights=embeddings,
    device=device,
    freeze=True,
)

input_dim: 102


In [6]:
# List the parameters the optimizer will actually update (requires_grad=True).
trainable_names = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(param_name)

user_embedding.weight
movie_embedding.weight
fc.0.weight
fc.0.bias
fc.3.weight
fc.3.bias
fc.6.weight
fc.6.bias


In [7]:
# Training objective: mean squared error on the predicted ratings.
loss = nn.MSELoss()

# Adam over all model parameters; weight_decay adds an L2 penalty.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=0.001,
)

# One batch as large as the whole training set, i.e. full-batch training.
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=len(train_data),
    collate_fn=model.collate_fn,
    shuffle=True,
)

# Drain the loader and keep the last batch it yields (None if it yields
# nothing). With batch_size == len(train_data) that batch is the entire
# collated training set.
train_data_extracted = None
for train_data_extracted in train_data_loader:
    pass

# The collated batch unpacks into (target ratings, model inputs).
all_train_ratings, all_train_data = train_data_extracted

In [8]:
# --- Configuration for this cell -------------------------------------------
# load_best_model: try to restore weights from `model_name` before anything else.
# force_train:     run the training loop even when a checkpoint was restored
#                  (i.e. fine-tune the loaded weights).
# epochs:          number of full-batch optimisation steps.
load_best_model = True
force_train = False
model_name = 'model_best.pt'
epochs = 8000

model_exists = os.path.exists(model_name)

if load_best_model and model_exists:
    print('model found, loading parameters...')
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    model.load_state_dict(torch.load(model_name, map_location=device))
    model.eval()

    # Report the restored model's loss on the full training batch.
    with torch.no_grad():
        predictions = model(all_train_data)
        loss_value = loss(predictions, all_train_ratings)
    print(f"Loaded Model: Train MSE Loss: {loss_value.item():.3f}")

if not load_best_model or not model_exists or force_train:
    if load_best_model and not model_exists:
        print('Load best model was set but model was not found, training from scratch...')
    elif not load_best_model:
        print('training from scratch...')
    else:
        # Only reachable when the checkpoint at `model_name` was loaded above
        # and force_train is set, so training continues from those weights.
        print('force_train was set, fine-tuning the loaded model...')

    # Full-batch training: every epoch is one optimizer step on the whole
    # training set.
    model.train()
    pbar = tqdm(range(epochs))
    for _ in pbar:
        optimizer.zero_grad()
        predictions = model(all_train_data)
        loss_value = loss(predictions, all_train_ratings)
        loss_value.backward()
        optimizer.step()
        # Iterating over `pbar` already advances the bar; only refresh the label.
        pbar.set_description(f"Train MSE Loss: {loss_value.item():.3f}")

    print(f"Final Train MSE Loss: {loss_value.item():.3f}")

model found, loading parameters...
Loaded Model: Train MSE Loss: 0.524


In [9]:
# The whole test split in one batch, in dataset order (no shuffling), collated
# with the model's test-time collate function.
test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    collate_fn=model.collate_fn_test,
    shuffle=False,
)

In [10]:
from tqdm import tqdm

# Inference on the test set: evaluation mode, no gradient tracking.
model.eval()

# Maps each test row id to the rating predicted for it.
predictions = {}

# The loader was built with shuffle=False, so batches arrive in dataset order;
# results are keyed by row id regardless.
with torch.no_grad():
    for row_ids, inputs in tqdm(test_data_loader, leave=True, desc="Predictions"):
        batch_scores = model(inputs)
        predictions.update(
            (rid.item(), score.item())
            for rid, score in zip(row_ids, batch_scores)
        )

Predictions: 100%|██████████| 1/1 [00:10<00:00, 10.83s/it]


In [11]:
# From the predictions dict, we build a dataframe
submission_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['rating'])
submission_df.index.name = 'Id'
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,3.616104
1,3.205281
2,3.048168
3,3.882848
4,3.756728


In [12]:
# Save the dataframe to a csv file
# submission_df.to_csv('submissions/submission.csv')

In [13]:
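# Save the trained weights so they can be reloaded later.
# NOTE: the loading cell looks for 'model_best.pt' (model_name); save under that
# name, or update model_name, if this checkpoint should be picked up on re-run.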
# torch.save(model.state_dict(), 'model_state.pt')