In [1]:
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error
from torch_utils import MovieRatingDataset, Recommender, device
import numpy as np
import torch
import pandas as pd

In [2]:
# Tokenizer shared by the vocabulary builder and by every dataset split below.
tokenizer = get_tokenizer(tokenizer='basic_english')

# The tag vocabulary is produced by the dataset class itself from this tokenizer.
tag_vocab = MovieRatingDataset.build_tag_vocab(tokenizer)

In [3]:
# Build one MovieRatingDataset per split. All three share the same tag
# vocabulary and tokenizer and differ only in the `train` selector string.
# The generator is consumed left to right, so the splits are constructed in
# the order train -> valid -> test.
train_data, valid_data, test_data = (
    MovieRatingDataset(tag_vocab=tag_vocab, tokenizer=tokenizer, train=split)
    for split in ('train', 'valid', 'test')
)

In [4]:
# Width of the pretrained tag vectors; the same value is handed to the model
# as `tag_embed` further down.
tag_embeds = 50

# Pretrained GloVe ("6B" set) at the chosen dimensionality.
vec = GloVe(name='6B', dim=tag_embeds)

# One embedding row per vocabulary token, in index-to-string order.
# `lower_case_backup=True` asks GloVe to retry with the lower-cased token
# when the exact token is not found.
vocab_tokens = train_data.vocab.get_itos()
embeddings = vec.get_vecs_by_tokens(vocab_tokens, lower_case_backup=True)

In [5]:
# Sanity check: for every token in the vocabulary, the row of `embeddings`
# at that token's vocabulary index must match the vector GloVe returns when
# queried directly with the token. `all` stops at the first mismatch, which
# then trips the assertion.
assert all(
    np.allclose(vec[token], embeddings[train_data.vocab[token]])
    for token in train_data.vocab.get_itos()
)

In [6]:
# Collect the constructor arguments first so the sizes derived from the
# training split are easy to inspect, then build the model in one call.
recommender_config = {
    # Number of distinct user labels present in the training frame.
    'num_users': len(train_data.data_df['user_label'].unique()),
    # One entry per row of the movies table.
    'num_movies': train_data.movies.shape[0],
    # One genre per column of the genres frame.
    'num_genres': len(train_data.genres_df.columns),
    'num_tags': len(train_data.vocab),
    'user_movie_embed': 10,
    # Tag embeddings are initialised from the GloVe matrix built above.
    'tag_embed': tag_embeds,
    'tag_weights': embeddings,
    'device': device,
    'freeze': True,
}
model = Recommender(**recommender_config)

input_dim: 48


In [7]:
# List the parameters the optimizer will actually update, i.e. those whose
# `requires_grad` flag is set.
trainable_names = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(param_name)

user_embedding.weight
movie_embedding.weight
fc.0.weight
fc.0.bias
fc.3.weight
fc.3.bias
fc.6.weight
fc.6.bias


In [8]:
# Objective: mean squared error between predicted and true ratings.
loss = nn.MSELoss()

# Adam over every model parameter.
optimizer = optim.Adam(params=model.parameters(), lr=0.005)

# Both loaders use a batch size equal to the dataset length, so each one
# yields the whole split as a single batch. Batches are assembled by the
# model's own `collate_fn`.
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=len(train_data),
    collate_fn=model.collate_fn,
    shuffle=True,
)

valid_data_loader = DataLoader(
    dataset=valid_data,
    batch_size=len(valid_data),
    collate_fn=model.collate_fn,
    shuffle=False,
)


In [9]:
# Loader for the test split: one batch holding the whole split, kept in
# dataset order, and collated with the model's test-specific collate function
# (`collate_fn_test` rather than `collate_fn`).
test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    collate_fn=model.collate_fn_test,
    shuffle=False,
)

In [10]:
# Each loader was built with batch_size == len(dataset), so it yields exactly
# one batch containing the whole split. Pull that batch out once so the
# training loop can reuse it without going through the loader every epoch.
# The `None` default mirrors the value the variable would keep if a loader
# produced nothing.
train_data_extracted = next(iter(train_data_loader), None)
valid_data_extracted = next(iter(valid_data_loader), None)

In [11]:
# Every extracted batch is a (ratings, features) pair; split both at once.
(all_train_ratings, all_train_data), (all_valid_ratings, all_valid_data) = (
    train_data_extracted,
    valid_data_extracted,
)

In [12]:
# The training feature bundle holds twelve entries; name each one so it can
# be inspected individually.
(
    users,
    movies,
    genres,
    tags,
    lang,
    budget,
    popularity,
    runtime,
    vote_average,
    vote_count,
    revenue,
    overview_embeddings,
) = all_train_data

In [13]:
# Inspect the last entry of the training feature bundle; as the final
# expression of the cell, the notebook renders its repr.
overview_embeddings

tensor([[10.2400,  0.0671,  4.1502,  ..., 11.4720,  1.3777,  2.4046],
        [10.0204,  0.1261,  4.4729,  ..., 11.9630,  1.2093,  3.8267],
        [10.4805,  0.2603,  4.1470,  ..., 12.0793,  1.9890,  3.3217],
        ...,
        [ 9.6916, -2.9879,  3.2266,  ..., 11.8166,  1.4274,  1.1359],
        [ 9.1674, -2.9744,  3.0649,  ..., 11.8258,  1.3218,  1.2727],
        [10.6505,  0.4716,  4.4205,  ..., 11.8701,  1.7616,  2.4487]],
       device='mps:0')

In [14]:
# Training loop
#
# Full-batch training: `all_train_data` / `all_valid_data` each hold an entire
# split (the loaders were built with batch_size == len(dataset)), so every
# epoch is exactly one optimizer step followed by one validation pass.
from tqdm import tqdm

epochs = 2200
pbar = tqdm(range(epochs))

# Start from NaN so the formatted messages below stay valid even when no
# epoch runs (epochs == 0); a non-numeric placeholder would break `:.3f`.
last_train_loss = float("nan")
last_valid_loss = float("nan")

for epoch in pbar:
    # --- optimisation step on the training split ---
    model.train()
    optimizer.zero_grad()
    predictions = model(all_train_data)
    loss_value = loss(predictions, all_train_ratings)
    loss_value.backward()
    optimizer.step()
    last_train_loss = loss_value.item()

    # --- validation RMSE, without tracking gradients ---
    model.eval()
    with torch.no_grad():
        predictions = model(all_valid_data)
        # Take the square root of the MSE ourselves instead of passing
        # `squared=False`, which newer scikit-learn releases no longer accept.
        valid_loss = float(
            mean_squared_error(all_valid_ratings.cpu(), predictions.cpu()) ** 0.5
        )
        last_valid_loss = valid_loss

    # Iterating over `pbar` already advances the bar once per epoch, so no
    # manual `pbar.update()` is needed (it would count every epoch twice).
    pbar.set_description(f"Train MSE Loss: {last_train_loss:.3f}, Valid RMSE Loss: {last_valid_loss:.3f}")

print(f"Final Train MSE Loss: {last_train_loss:.3f}, Valid RMSE Loss: {last_valid_loss:.3f}")

Train MSE Loss: 0.646, Valid RMSE Loss: 0.827: 100%|██████████| 2200/2200 [02:10<00:00, 16.92it/s]

Final Train MSE Loss: 0.646, Valid RMSE Loss: 0.827





In [15]:
# Now let's do the predictions on the test set
model.eval()
predictions = {}  # A dict with row_id as key and rating as value
# Every prediction is stored under its own row id, so the resulting mapping
# does not depend on the order in which the loader yields rows.
with torch.no_grad():
    for batch in tqdm(test_data_loader, leave=True, desc="Predictions"):
        # The test collate function yields (row ids, model inputs).
        row_ids, inputs = batch
        # Move the outputs to the CPU once, so the per-element `.item()` calls
        # below do not each have to read from the accelerator.
        predictions_batch = model(inputs).cpu()
        if isinstance(row_ids, torch.Tensor):
            row_ids = row_ids.cpu()
        # Use a distinct name for the per-row id so the batch-level container
        # is not overwritten while it is being iterated.
        for row_id, prediction in zip(row_ids, predictions_batch):
            predictions[row_id.item()] = prediction.item()

Predictions: 100%|██████████| 1/1 [00:10<00:00, 10.57s/it]


In [16]:
# Turn the {row_id: rating} mapping into a one-column frame: the dict keys
# become the index (named 'Id') and the values fill the 'rating' column.
submission_df = pd.DataFrame.from_dict(
    data=predictions,
    orient='index',
    columns=['rating'],
)
submission_df.index.name = 'Id'

# Preview the first rows.
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,3.008922
1,3.263643
2,3.111709
3,3.69247
4,3.401341


In [17]:
# Preview the submission frame before it is written to disk.
# `submission_df` is already built by the preceding cell from the same
# `predictions` dict, so this cell only displays it instead of rebuilding an
# identical frame a second time.
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,3.008922
1,3.263643
2,3.111709
3,3.69247
4,3.401341


In [19]:
# Write the submission to CSV in the working directory. The index is written
# too (the default), so the 'Id' index becomes the first column of the file.
submission_df.to_csv(
    path_or_buf='submission_new_approach14.csv',
    index=True,
)