In [1]:
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error
from torch_utils import MovieRatingDataset, Recommender, device
import numpy as np
import torch
import pandas as pd

In [2]:
device = torch.device('cpu')
device

device(type='cpu')

In [3]:
tokenizer = get_tokenizer('basic_english')
tag_vocab = MovieRatingDataset.build_tag_vocab(tokenizer)

In [4]:
train_data = MovieRatingDataset(
    tag_vocab=tag_vocab,
    tokenizer=tokenizer,
    train='train'
)

valid_data = MovieRatingDataset(
    tag_vocab=tag_vocab,
    tokenizer=tokenizer,
    train='valid'
)

test_data = MovieRatingDataset(
    tag_vocab=tag_vocab,
    tokenizer=tokenizer,
    train='test'
)

In [5]:
tag_embeds = 50
vec = GloVe(name='6B', dim=tag_embeds)
embeddings = vec.get_vecs_by_tokens(train_data.vocab.get_itos(), lower_case_backup=True)

In [6]:
for tag_string in train_data.vocab.get_itos():
    vec_embed = vec[tag_string]
    embed_embed = embeddings[train_data.vocab[tag_string]]
    assert np.allclose(vec_embed, embed_embed)

In [7]:
model = Recommender(
    num_users=len(train_data.data_df['user_label'].unique()),
    num_movies=train_data.movies.shape[0],
    num_genres=len(train_data.genres_df.columns),
    num_tags=len(train_data.vocab),
    user_movie_embed=10,
    tag_embed=tag_embeds,
    tag_weights=embeddings,
    device=device,
    freeze=True,
)

In [8]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

user_embedding.weight
movie_embedding.weight
fc.0.weight
fc.0.bias
fc.3.weight
fc.3.bias


In [9]:
# Define our Loss
loss = nn.MSELoss()

# Define our optimizer
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Define our data loaders
train_data_loader = DataLoader(
    train_data,
    batch_size=len(train_data),
    shuffle=True,
    collate_fn=model.collate_fn
)

valid_data_loader = DataLoader(
    valid_data,
    batch_size=len(valid_data),
    shuffle=False,
    collate_fn=model.collate_fn
)


In [10]:
test_data_loader = DataLoader(
    test_data,
    batch_size=len(test_data),
    shuffle=False,
    collate_fn=model.collate_fn_test
)

In [11]:
train_data_extracted = None
for batch in train_data_loader:
    train_data_extracted = batch

valid_data_extracted = None
for batch in valid_data_loader:
    valid_data_extracted = batch

In [12]:
all_train_ratings, all_train_data = train_data_extracted
all_valid_ratings, all_valid_data = valid_data_extracted

In [13]:
users, movies, genres, tags, lang, budget, popularity, runtime, vote_average, vote_count, revenue, overview_embeddings = all_train_data

In [14]:
overview_embeddings

tensor([[ 9.8602, -0.2087,  4.7406,  ..., 12.1812,  0.9682,  2.9556],
        [10.4047,  0.1298,  3.9398,  ..., 11.4039,  1.8472,  3.3035],
        [10.4192,  0.3383,  4.2487,  ..., 12.2143,  2.1025,  2.5916],
        ...,
        [ 9.5527, -0.4575,  4.7554,  ..., 11.8770,  0.5459,  2.9962],
        [ 8.9806, -1.3935,  3.7656,  ..., 12.0576,  1.1271,  2.5512],
        [ 9.6700, -0.5782,  4.1318,  ..., 12.1648,  1.5044,  2.7012]])

In [15]:
# Training loop
from tqdm import tqdm
epochs = 2200
pbar = tqdm(range(epochs))
last_valid_loss = "???"
for epoch in pbar:
    model.train()
    optimizer.zero_grad()
    predictions = model(all_train_data)
    loss_value = loss(predictions, all_train_ratings)
    loss_value.backward()
    optimizer.step()
    model.eval()
    with torch.no_grad():
        # Calculate RMSE
        predictions = model(all_valid_data)
        valid_loss = mean_squared_error(predictions.cpu(), all_valid_ratings.cpu(), squared=False)
        last_valid_loss = valid_loss
    pbar.set_description(f"Train MSE Loss: {loss_value.item():.3f}, Valid RMSE Loss: {last_valid_loss:.3f}")
    pbar.update()

print(f"Final Train MSE Loss: {loss_value.item():.3f}, Valid RMSE Loss: {last_valid_loss:.3f}")

  0%|          | 0/2200 [00:00<?, ?it/s]

Train MSE Loss: 0.669, Valid RMSE Loss: 0.832: 100%|██████████| 2200/2200 [08:35<00:00,  4.27it/s]

Final Train MSE Loss: 0.669, Valid RMSE Loss: 0.832





In [16]:
# Now let's do the predictions on the test set
model.eval()
predictions = {} # A dict with row_id as key and rating as value
# Since the test_data_loader has shuffle=False, 
with torch.no_grad():
    for batch in tqdm(test_data_loader, leave=True, desc="Predictions"):
        row_id, inputs = batch
        predictions_batch = model(inputs)
        for row_id, prediction in zip(row_id, predictions_batch):
            predictions[row_id.item()] = prediction.item()

Predictions: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]


In [17]:
# From the predictions dict, we build a dataframe
submission_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['rating'])
submission_df.index.name = 'Id'
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,3.191563
1,3.224806
2,2.997659
3,3.84013
4,3.431867


In [None]:
# From the predictions dict, we build a dataframe
submission_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['rating'])
submission_df.index.name = 'Id'
submission_df.head()

Unnamed: 0_level_0,rating
Id,Unnamed: 1_level_1
0,3.025945
1,3.435218
2,2.572485
3,3.854323
4,3.353108


In [18]:
# Save the dataframe to a csv file
submission_df.to_csv('submission_new_approach13.csv')