In [1]:
!pip install --upgrade torch==1.7.1 torchtext==0.8.1 torchvision==0.8.2



In [2]:
import torch, torchtext, numpy as np
import pandas as pd, csv
from torch import nn, optim
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pdb
# Seed the global torch and numpy generators with one shared value so a
# fresh run of the notebook draws the same random numbers.
SEED = 291
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the three pre-processed pickles, in the same order as before.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
ingr_map, recipes, interactions = map(
    pd.read_pickle,
    (
        "../datasets/our_ingr_map.pkl",
        "../datasets/our_recipes.pkl",
        "../datasets/our_interactions.pkl",
    ),
)

In [4]:
class RecipeDataset(torch.utils.data.Dataset):
    """Map-style dataset of (user, recipe) index pairs with their ratings.

    Raw ``user_id`` / ``recipe_id`` values are re-encoded as contiguous
    integers starting at 0, numbered in order of first appearance, so they
    can be used directly as embedding indices. The input frame is only read;
    it is never modified.

    Args:
        df: DataFrame with ``user_id``, ``recipe_id`` and ``rating`` columns.

    Attributes:
        coords: int64 tensor of shape ``(len(df), 2)``; column 0 is the
            encoded user, column 1 the encoded recipe.
        ratings: float32 tensor of shape ``(len(df),)``.
        n_users: number of distinct users.
        n_movies: number of distinct recipes (name kept for existing callers).
        n_recipes: alias of ``n_movies``.
        user_to_idx: dict mapping raw ``user_id`` -> encoded index.
        recipe_to_idx: dict mapping raw ``recipe_id`` -> encoded index.
    """

    def __init__(self, df):
        # Raw id -> contiguous index, kept on the instance so predictions can
        # be translated back to the original ids later.
        self.user_to_idx = {u: n for n, u in enumerate(df['user_id'].unique())}
        self.recipe_to_idx = {r: n for n, r in enumerate(df['recipe_id'].unique())}

        # Series.map builds new Series; nothing is written back into `df`, so
        # the caller's frame keeps its original ids.
        user_idx = df['user_id'].map(self.user_to_idx).to_numpy(dtype='int64')
        recipe_idx = df['recipe_id'].map(self.recipe_to_idx).to_numpy(dtype='int64')

        # torch.tensor always copies, so the tensors never alias the frame.
        self.coords = torch.stack(
            [torch.tensor(user_idx), torch.tensor(recipe_idx)], dim=1
        )
        self.ratings = torch.tensor(df['rating'].to_numpy(dtype='float32'))

        self.n_users = len(self.user_to_idx)
        self.n_movies = len(self.recipe_to_idx)
        self.n_recipes = self.n_movies

    def __len__(self):
        """Return the number of interactions."""
        return len(self.coords)

    def __getitem__(self, i):
        """Return ``(coords[i], ratings[i])`` for interaction ``i``."""
        return (self.coords[i], self.ratings[i])

In [5]:
# Preview the first ten interaction rows.
interactions.iloc[:10]

Unnamed: 0,user_id,recipe_id,date,rating,u
0,2046,517,2000-02-25,5.0,22095
1,868626,517,2009-07-24,5.0,11611
2,1773,7435,2000-03-13,5.0,24732
3,16346,7435,2001-08-23,0.0,15023
4,10649,7435,2001-12-06,3.0,17766
5,35414,7435,2002-03-26,4.0,7489
6,26652,7435,2004-03-18,5.0,1690
7,122001,7435,2004-03-19,5.0,3214
8,121581,7435,2005-02-15,5.0,2942
9,161717,7435,2005-03-22,5.0,17514


In [6]:
# Hand the dataset its own copy of the frame so `interactions` keeps its raw
# ids for later lookups, whatever re-encoding the dataset performs.
ds_full = RecipeDataset(interactions.copy())

In [7]:
# 80/20 split; the test side takes the remainder so both sizes always add up
# to len(ds_full).
n_train = int(len(ds_full) * 0.8)
n_test = len(ds_full) - n_train

# A dedicated, seeded generator for the split.
rng = torch.Generator()
rng.manual_seed(291)
ds_train, ds_test = torch.utils.data.random_split(
    ds_full, lengths=[n_train, n_test], generator=rng
)

In [8]:
class RecipeRecs(nn.Module):
    """Biased matrix-factorisation model.

    The score for a (user, recipe) pair is the dot product of their
    embeddings plus a per-user and a per-recipe bias.

    Args:
        n_users: number of rows in the user embedding/bias tables.
        n_recipes: number of rows in the recipe embedding/bias tables.
        emb_dim: width of the user and recipe embeddings.
    """

    def __init__(self, n_users, n_recipes, emb_dim):
        super().__init__()
        # Creation order is kept as-is: it fixes both parameter order and the
        # sequence in which the random generator is consumed.
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.recipe_emb = nn.Embedding(n_recipes, emb_dim)
        self.recipe_bias = nn.Embedding(n_recipes, 1)

        # Xavier-uniform for the factor tables, zeros for the biases.
        for factors in (self.user_emb, self.recipe_emb):
            nn.init.xavier_uniform_(factors.weight)
        for bias in (self.user_bias, self.recipe_bias):
            nn.init.zeros_(bias.weight)

    def forward(self, samples):
        """Score a batch of index pairs.

        Args:
            samples: integer tensor indexed as ``samples[:, 0]`` (user index)
                and ``samples[:, 1]`` (recipe index).

        Returns:
            One score per row of ``samples``.
        """
        user_idx = samples[:, 0]
        recipe_idx = samples[:, 1]

        interaction = (self.user_emb(user_idx) * self.recipe_emb(recipe_idx)).sum(1)
        # Bias lookups come back with a trailing size-1 axis; drop it so they
        # line up with the dot product.
        bias_u = self.user_bias(user_idx).squeeze(1)
        bias_r = self.recipe_bias(recipe_idx).squeeze(1)
        return interaction + bias_u + bias_r

In [9]:
# Device that the model, loss and every batch are moved to; the training and
# evaluation helpers read this module-level name directly.
device = torch.device('cpu')

def run_test(model, ldr, crit):
    """Evaluate ``model`` over every batch of ``ldr`` with gradients disabled.

    Puts the model in eval mode and leaves it there.

    Args:
        model: module called as ``model(coords)`` to produce predictions.
        ldr: iterable yielding ``(coords, labels)`` batches.
        crit: loss callable invoked as ``crit(preds, labels)``.

    Returns:
        Sum over batches of ``loss * batch_size`` divided by the total number
        of samples seen.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0
    progress = tqdm(ldr, leave=False, desc='test iter')
    with torch.no_grad():
        for batch_coords, batch_labels in progress:
            batch_coords = batch_coords.to(device)
            batch_labels = batch_labels.to(device)
            batch_loss = crit(model(batch_coords), batch_labels)
            # Weight each batch by its size so a short final batch does not
            # skew the average.
            batch_size = batch_labels.size(0)
            loss_sum += batch_loss.item() * batch_size
            n_seen += batch_size
            progress.set_postfix({'loss': loss_sum / n_seen}, refresh=True)
    return loss_sum / n_seen

def run_train(model, ldr, crit, opt, sched):
    """Run one optimisation pass over ``ldr``.

    Puts the model in train mode. The scheduler is stepped once per batch,
    right after the optimizer step.

    Args:
        model: module called as ``model(coords)`` to produce predictions.
        ldr: iterable yielding ``(coords, labels)`` batches.
        crit: loss callable invoked as ``crit(preds, labels)``.
        opt: optimizer providing ``zero_grad()`` and ``step()``.
        sched: learning-rate scheduler providing ``step()``.

    Returns:
        Sum over batches of ``loss * batch_size`` divided by the total number
        of samples seen.
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0
    progress = tqdm(ldr, leave=False, desc='train iter')
    for batch_coords, batch_labels in progress:
        opt.zero_grad()
        batch_coords = batch_coords.to(device)
        batch_labels = batch_labels.to(device)

        batch_loss = crit(model(batch_coords), batch_labels)
        batch_loss.backward()
        opt.step()
        sched.step()

        # Weight each batch by its size so a short final batch does not skew
        # the running average.
        batch_size = batch_labels.size(0)
        loss_sum += batch_loss.item() * batch_size
        n_seen += batch_size
        progress.set_postfix({'loss': loss_sum / n_seen}, refresh=True)
    return loss_sum / n_seen

def run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs=10):
    """Train and evaluate for ``n_epochs`` epochs, logging losses per epoch.

    Each epoch runs ``run_train`` then ``run_test`` and writes one summary
    line. The epochs progress bar shows the best test loss so far (``bL``)
    and the epoch it occurred in (``bE``).

    Args:
        model: model passed through to ``run_train`` / ``run_test``.
        ldr_train: training batches.
        ldr_test: evaluation batches.
        crit: loss callable.
        opt: optimizer.
        sched: learning-rate scheduler (stepped inside ``run_train``).
        n_epochs: number of epochs to run.

    Returns:
        List with one ``(train_loss, test_loss)`` tuple per epoch, in epoch
        order, so the curves can be plotted or compared after the run.
        Previously these values were only printed and then lost.
    """
    history = []
    best_loss = np.inf
    tq_epochs = tqdm(range(n_epochs), desc='epochs', unit='ep')
    for epoch in tq_epochs:
        train_loss = run_train(model, ldr_train, crit, opt, sched)
        test_loss = run_test(model, ldr_test, crit)
        history.append((train_loss, test_loss))
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}    test loss {test_loss:.6f}')
        if test_loss < best_loss:
            best_loss = test_loss
            tq_epochs.set_postfix({'bE': epoch, 'bL': best_loss}, refresh=True)
    return history

In [15]:
# Matrix-factorisation model with 20-dimensional embeddings, sized from the
# full dataset so every encoded user/recipe index has a row.
model = RecipeRecs(ds_full.n_users, ds_full.n_movies, emb_dim=20).to(device)

# Only the training loader shuffles; evaluation order does not matter.
ldr_train = torch.utils.data.DataLoader(ds_train, batch_size=32, shuffle=True)
ldr_test = torch.utils.data.DataLoader(ds_test, batch_size=32, shuffle=False)

n_epochs = 5

crit = nn.MSELoss().to(device)
# NOTE(review): OneCycleLR is expected to drive the learning rate from its own
# schedule (peaking at max_lr), which would make lr=1e-6 a placeholder --
# confirm against the PyTorch docs.
opt = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9)
# The schedule length must match the total number of optimizer steps:
# batches per epoch times number of epochs.
sched = optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.4,
    epochs=n_epochs,
    steps_per_epoch=len(ldr_train),
)

run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs=n_epochs)

HBox(children=(HTML(value='epochs'), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value='train iter'), FloatProgress(value=0.0, max=10509.0), HTML(value='')))

HBox(children=(HTML(value='test iter'), FloatProgress(value=0.0, max=2628.0), HTML(value='')))

epoch 0   train loss 5.890546    test loss 2.462677


HBox(children=(HTML(value='train iter'), FloatProgress(value=0.0, max=10509.0), HTML(value='')))

HBox(children=(HTML(value='test iter'), FloatProgress(value=0.0, max=2628.0), HTML(value='')))

epoch 1   train loss 1.625816    test loss 1.638718


HBox(children=(HTML(value='train iter'), FloatProgress(value=0.0, max=10509.0), HTML(value='')))

HBox(children=(HTML(value='test iter'), FloatProgress(value=0.0, max=2628.0), HTML(value='')))

epoch 2   train loss 0.924948    test loss 1.495614


HBox(children=(HTML(value='train iter'), FloatProgress(value=0.0, max=10509.0), HTML(value='')))

HBox(children=(HTML(value='test iter'), FloatProgress(value=0.0, max=2628.0), HTML(value='')))

epoch 3   train loss 0.524092    test loss 1.422464


HBox(children=(HTML(value='train iter'), FloatProgress(value=0.0, max=10509.0), HTML(value='')))

HBox(children=(HTML(value='test iter'), FloatProgress(value=0.0, max=2628.0), HTML(value='')))

epoch 4   train loss 0.270138    test loss 1.412836

