In [1]:
# Pinned package versions for this notebook. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install --upgrade torch==1.7.1 torchtext==0.8.1 torchvision==0.8.2



In [1]:
import torch, torchtext, numpy as np
import pandas as pd, csv
from torch import nn, optim
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pdb
# One seed for both global RNGs (torch and numpy) so that repeated runs match.
SEED = 291
torch.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
# Folder holding the pickled frames, relative to the notebook's working directory.
DATASET_DIR = "../datasets"

# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
ingr_map = pd.read_pickle(f"{DATASET_DIR}/our_ingr_map.pkl")
recipes = pd.read_pickle(f"{DATASET_DIR}/our_recipes.pkl")
interactions = pd.read_pickle(f"{DATASET_DIR}/our_interactions.pkl")

In [3]:
class RecipeDataset(torch.utils.data.Dataset):
    """Torch dataset of (user, recipe) index pairs with their ratings.

    Raw `user_id` / `recipe_id` values are re-numbered to consecutive integers
    in order of first appearance, so they can index embedding tables.

    Attributes:
        u2n: dict mapping raw user_id -> consecutive user index.
        r2n: dict mapping raw recipe_id -> consecutive recipe index.
        coords: int64 tensor with one [user index, recipe index] row per interaction.
        ratings: float32 tensor with one rating per interaction.
        n_users / n_recipes: number of distinct users / recipes.

    Side effect: the columns `user_id_n` and `recipe_id_n` are written onto the
    frame passed in (in place), so the caller's frame gains them as well.
    Assumes the id columns contain no missing values.
    """

    def __init__(self, df):
        """Build the index mappings and tensors from a frame with
        `user_id`, `recipe_id` and `rating` columns."""
        self.u2n = {u: n for n, u in enumerate(df['user_id'].unique())}
        self.r2n = {r: n for n, r in enumerate(df['recipe_id'].unique())}

        # Vectorised dictionary lookup; deliberately stored on the caller's frame.
        df['user_id_n'] = df['user_id'].map(self.u2n)
        df['recipe_id_n'] = df['recipe_id'].map(self.r2n)

        self.coords = torch.tensor(
            df[['user_id_n', 'recipe_id_n']].values, dtype=torch.long
        )
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

        # Each mapping has exactly one entry per distinct id.
        self.n_users = len(self.u2n)
        self.n_recipes = len(self.r2n)

    def __len__(self):
        """Number of interactions."""
        return len(self.coords)

    def __getitem__(self, i):
        """Return `(coords[i], ratings[i])` for interaction `i`."""
        return (self.coords[i], self.ratings[i])

In [4]:
# Peek at the first ten interaction records.
interactions.head(10)

Unnamed: 0,user_id,recipe_id,date,rating
0,2046,517,2000-02-25,5.0
1,868626,517,2009-07-24,5.0
2,1773,7435,2000-03-13,5.0
3,16346,7435,2001-08-23,0.0
4,10649,7435,2001-12-06,3.0
5,35414,7435,2002-03-26,4.0
6,26652,7435,2004-03-18,5.0
7,122001,7435,2004-03-19,5.0
8,121581,7435,2005-02-15,5.0
9,161717,7435,2005-03-22,5.0


In [5]:
# Build the dataset over every interaction. RecipeDataset.__init__ also writes
# `user_id_n` / `recipe_id_n` columns onto `interactions` itself.
ds_full = RecipeDataset(interactions)

## Let's look into similarities here

* One idea is to fill in the entire recommendation matrix so that every user has a (predicted) rating for every recipe; because the matrix is then no longer sparse, cosine similarity becomes much easier to compute.

Instead, let's try RF-Rec recommendation.

In [6]:
# `df` is a second name for the same frame as `interactions` (plain assignment, no copy).
df= interactions

In [7]:
# Every interaction recorded for raw user id 2046.
df[df["user_id"]==2046]

Unnamed: 0,user_id,recipe_id,date,rating,user_id_n,recipe_id_n
0,2046,517,2000-02-25,5.0,0,0
39,2046,3431,2000-04-07,5.0,0,3
47,2046,13307,2000-05-21,5.0,0,4


In [9]:
# Rows whose rating is 0.
# NOTE(review): 0 lies outside the 1-5 scale used by `possibleRatings` further down;
# confirm whether 0 means "no rating given" before treating it as a score.
df[df["rating"] == 0]

Unnamed: 0,user_id,recipe_id,date,rating,user_id_n,recipe_id_n
3,16346,7435,2001-08-23,0.0,3,1
36,862099,278,2010-08-22,0.0,35,2
53,45815,13307,2003-05-18,0.0,49,4
66,191220,13307,2005-09-15,0.0,62,4
121,2325224,13307,2012-07-11,0.0,116,4
...,...,...,...,...,...,...
4211,38195,324905,2010-01-07,0.0,9144,104937
4215,11613,20596,2002-03-22,0.0,2761,104940
4230,62031,63808,2009-09-02,0.0,18827,104955
4233,808894,61330,2008-04-03,0.0,24249,104958


#### RF-Rec Recommendations
[This paper](http://ceur-ws.org/Vol-606/paper6.pdf) outlines the RF-Rec algorithm, which can predict missing ratings with a high degree of accuracy, even on very sparse datasets.

In [14]:
# Small hand-written ratings table (rows = users, columns = items I1..I5,
# None = not rated) used to try out the RF-Rec formulas.
# Note: this rebinds the name `df`.
df = pd.DataFrame(
    [
        ["Alice", 1, 1, None, 5, 4],
        ["U1", 2, None, 5, 5, 5],
        ["U2", None, None, 1, 1, None],
        ["U3", None, 5, 1, 1, 2],
    ],
    columns=["I0", "I1", "I2", "I3", "I4", "I5"],
).set_index("I0")
df

Unnamed: 0_level_0,I1,I2,I3,I4,I5
I0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,1.0,1.0,,5,4.0
U1,2.0,,5.0,5,5.0
U2,,,1.0,1,
U3,,5.0,1.0,1,2.0


In [45]:
def freqUser(u, r, ratings=None):
    """Count how many times user `u` gave the rating `r`.

    Args:
        u: row label of the user in the ratings table.
        r: rating value to count.
        ratings: user-by-item ratings frame (missing = NaN). Defaults to the
            notebook-level `df`, as before.

    Returns:
        int: number of entries in the user's row equal to `r` (0 if none).
    """
    table = df if ratings is None else ratings
    # NaN never compares equal, so unrated items are not counted.
    return int((table.loc[u] == r).sum())

def freqItem(i, r, ratings=None):
    """Count how many times item `i` received the rating `r`.

    Args:
        i: column label of the item in the ratings table.
        r: rating value to count.
        ratings: user-by-item ratings frame (missing = NaN). Defaults to the
            notebook-level `df`, as before.

    Returns:
        int: number of entries in the item's column equal to `r` (0 if none).
    """
    table = df if ratings is None else ratings
    # NaN never compares equal, so missing ratings are not counted.
    return int((table[i] == r).sum())

def ind_avg_user(u, r, ratings=None):
    """Indicator: does user `u`'s rounded average rating equal `r`?

    Args:
        u: row label of the user in the ratings table.
        r: candidate rating.
        ratings: user-by-item ratings frame (missing = NaN). Defaults to the
            notebook-level `df`, as before.

    Returns:
        bool: True when the user's mean rating, rounded half-up, equals `r`;
        False otherwise, including when the user has no ratings at all.
    """
    table = df if ratings is None else ratings
    avg = table.loc[u].mean()
    if pd.isna(avg):
        # No ratings -> no average to compare (round(nan) would raise).
        return False
    # Round half-up; the built-in round() rounds halves to the nearest even
    # integer, so an average of 2.5 would become 2.
    return bool(np.floor(avg + 0.5) == r)

def ind_avg_item(i, r, ratings=None):
    """Indicator: does item `i`'s rounded average rating equal `r`?

    Args:
        i: column label of the item in the ratings table.
        r: candidate rating.
        ratings: user-by-item ratings frame (missing = NaN). Defaults to the
            notebook-level `df`, as before.

    Returns:
        bool: True when the item's mean rating, rounded half-up, equals `r`;
        False otherwise, including when the item has no ratings at all.
    """
    table = df if ratings is None else ratings
    avg = table[i].mean()
    if pd.isna(avg):
        # No ratings -> no average to compare (round(nan) would raise).
        return False
    # Round half-up; the built-in round() rounds halves to the nearest even
    # integer, so an average of 2.5 would become 2.
    return bool(np.floor(avg + 0.5) == r)

In [38]:
# Sanity check: how many users rated item I4 with a 4.
freqItem("I4", 4)

0

In [50]:
def predictForR(u, i, r):
    """Score of candidate rating `r` for user `u` and item `i`.

    The score is the product of a user term and an item term, each being
    (frequency of `r`) + 1 + (1 if the rounded average equals `r`, else 0).
    The +1 keeps a rating that was never given from zeroing the product.
    """
    user_term = freqUser(u, r) + 1 + ind_avg_user(u, r)
    item_term = freqItem(i, r) + 1 + ind_avg_item(i, r)
    return user_term * item_term

possibleRatings = [1,2,3,4,5]
# Predicted rating = candidate with the highest score (np.argmax keeps the first, i.e. lowest, on ties).
possibleRatings[np.argmax([predictForR("Alice", "I4", r) for r in possibleRatings])]

1

## Back to the training...

In [67]:
# 80/20 random split of the full dataset; the dedicated, seeded generator makes
# the split repeatable independently of the global torch RNG.
n_train = int(0.8 * len(ds_full))
n_test = len(ds_full) - n_train
rng = torch.Generator()
rng.manual_seed(291)
ds_train, ds_test = torch.utils.data.random_split(
    ds_full, lengths=[n_train, n_test], generator=rng
)

In [68]:
class RecipeRecs(nn.Module):
    """Matrix-factorisation rating model.

    The predicted rating for a (user, recipe) pair is the dot product of the
    two embedding vectors plus a learned per-user and per-recipe bias.

    Args:
        n_users: number of rows in the user embedding/bias tables.
        n_recipes: number of rows in the recipe embedding/bias tables.
        emb_dim: length of each embedding vector.
    """

    def __init__(self, n_users, n_recipes, emb_dim):
        super().__init__()
        # Layer creation order is kept as-is: each constructor draws from the
        # global RNG, so reordering would change seeded initial weights.
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.recipe_emb = nn.Embedding(n_recipes, emb_dim)
        self.recipe_bias = nn.Embedding(n_recipes, 1)
        # Embeddings start Xavier-uniform, biases start at zero.
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.recipe_emb.weight)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.recipe_bias.weight)

    def forward(self, samples):
        """Predict ratings for a batch.

        Args:
            samples: integer tensor indexed as `samples[:, 0]` (user index) and
                `samples[:, 1]` (recipe index), i.e. one pair per row.

        Returns:
            1-D tensor with one predicted rating per row of `samples`.
        """
        user_idx, recipe_idx = samples[:, 0], samples[:, 1]
        dot = (self.user_emb(user_idx) * self.recipe_emb(recipe_idx)).sum(dim=1)
        # squeeze(1) drops only the size-1 bias axis; a bare squeeze() would
        # also drop the batch axis when the batch holds a single row.
        user_b = self.user_bias(user_idx).squeeze(1)
        recipe_b = self.recipe_bias(recipe_idx).squeeze(1)
        return dot + user_b + recipe_b

In [69]:
# Device that batches, the model and the loss are moved to via `.to(device)`; hard-coded to CPU.
device = torch.device('cpu')

def run_test(model, ldr, crit):
    """Evaluate `model` on every batch of `ldr` without tracking gradients.

    Args:
        model: callable module mapping a coords batch to predictions.
        ldr: iterable yielding `(coords, labels)` batches.
        crit: loss function called as `crit(preds, labels)`.

    Returns:
        Average loss, with each batch's loss weighted by its number of labels
        (presumably `crit` returns a per-batch mean - confirm for other losses).

    Uses the notebook-level `device` and `tqdm`; puts the model in eval mode.
    """
    model.eval()
    loss_sum = 0
    n_seen = 0
    progress = tqdm(ldr, leave=False, desc='test iter')
    with torch.no_grad():
        for batch_coords, batch_labels in progress:
            batch_coords = batch_coords.to(device)
            batch_labels = batch_labels.to(device)
            batch_loss = crit(model(batch_coords), batch_labels)
            batch_n = batch_labels.size(0)
            loss_sum += batch_loss.item() * batch_n
            n_seen += batch_n
            # Show the running average on the progress bar.
            progress.set_postfix({'loss': loss_sum / n_seen}, refresh=True)
    return loss_sum / n_seen

def run_train(model, ldr, crit, opt, sched):
    """Run one training pass over `ldr`.

    For every batch: clear gradients, forward, backward, optimizer step and
    scheduler step (so the scheduler advances once per batch, not per epoch).

    Args:
        model: module being trained.
        ldr: iterable yielding `(coords, labels)` batches.
        crit: loss function called as `crit(preds, labels)`.
        opt: optimizer over the model's parameters.
        sched: learning-rate scheduler stepped after each optimizer step.

    Returns:
        Average training loss, with each batch's loss weighted by its number of labels.

    Uses the notebook-level `device` and `tqdm`; puts the model in train mode.
    """
    model.train()
    loss_sum = 0
    n_seen = 0
    progress = tqdm(ldr, leave=False, desc='train iter')
    for batch in progress:
        batch_coords, batch_labels = batch
        opt.zero_grad()
        batch_coords = batch_coords.to(device)
        batch_labels = batch_labels.to(device)
        batch_loss = crit(model(batch_coords), batch_labels)
        batch_loss.backward()
        opt.step()
        sched.step()
        batch_n = batch_labels.size(0)
        loss_sum += batch_loss.item() * batch_n
        n_seen += batch_n
        # Show the running average on the progress bar.
        progress.set_postfix({'loss': loss_sum / n_seen}, refresh=True)
    return loss_sum / n_seen

def run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs=10):
    """Alternate one training pass and one evaluation pass for `n_epochs` epochs.

    Prints the train/test loss after each epoch and shows the best test loss so
    far (and the epoch it occurred in) on the epoch progress bar. Returns nothing.
    """
    best_loss = np.inf
    epoch_bar = tqdm(range(n_epochs), desc='epochs', unit='ep')
    for epoch in epoch_bar:
        train_loss = run_train(model, ldr_train, crit, opt, sched)
        test_loss = run_test(model, ldr_test, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}    test loss {test_loss:.6f}')
        if not (test_loss < best_loss):
            continue
        # New best: remember it and surface it on the bar.
        best_loss = test_loss
        epoch_bar.set_postfix({'bE': epoch, 'bL': best_loss}, refresh=True)

In [95]:
# Fresh model with one embedding row per user/recipe of the FULL dataset
# (train and test share the same index space) and 20-dimensional embeddings.
model = RecipeRecs(ds_full.n_users, ds_full.n_recipes, 20)
model.to(device)

# Only the training loader is shuffled.
ldr_train = torch.utils.data.DataLoader(ds_train, batch_size=32, shuffle=True)
ldr_test = torch.utils.data.DataLoader(ds_test, batch_size=32)

n_epochs = 5

crit = nn.MSELoss().to(device)
# NOTE(review): the OneCycleLR scheduler below drives the learning rate from max_lr,
# so lr=1e-6 here is presumably only a placeholder - confirm against the PyTorch docs.
opt = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9)
# Sized in batches (steps_per_epoch * epochs) because run_train steps the scheduler
# once per batch; n_epochs here must match the value passed to run_all.
sched = optim.lr_scheduler.OneCycleLR(opt, max_lr=0.4, steps_per_epoch=len(ldr_train), epochs=n_epochs)

run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs)

epochs:   0%|          | 0/5 [00:00<?, ?ep/s]

train iter:   0%|          | 0/10509 [00:00<?, ?it/s]

test iter:   0%|          | 0/2628 [00:00<?, ?it/s]

epoch 0   train loss 5.887074    test loss 2.460643


train iter:   0%|          | 0/10509 [00:00<?, ?it/s]

test iter:   0%|          | 0/2628 [00:00<?, ?it/s]

epoch 1   train loss 1.633300    test loss 1.653874


train iter:   0%|          | 0/10509 [00:00<?, ?it/s]

test iter:   0%|          | 0/2628 [00:00<?, ?it/s]

epoch 2   train loss 0.927808    test loss 1.495855


train iter:   0%|          | 0/10509 [00:00<?, ?it/s]

test iter:   0%|          | 0/2628 [00:00<?, ?it/s]

epoch 3   train loss 0.523735    test loss 1.420870


train iter:   0%|          | 0/10509 [00:00<?, ?it/s]

test iter:   0%|          | 0/2628 [00:00<?, ?it/s]

epoch 4   train loss 0.270169    test loss 1.410772


In [92]:
def get_recommendations_for_user(model, dataset, user_id, batch_size=32):
    """Score every recipe in `dataset` for one user and rank them best-first.

    Args:
        model: trained module mapping a batch of [user index, recipe index]
            rows to one predicted rating per row.
        dataset: object exposing `u2n` (raw user id -> index), `r2n`
            (raw recipe id -> index) and `n_recipes`.
        user_id: raw user id; must be a key of `dataset.u2n`.
        batch_size: number of recipes scored per forward pass.

    Returns:
        List of `(raw recipe id, predicted rating)` tuples covering all
        recipes, sorted by predicted rating in descending order. Recipes the
        user has already rated are included.

    Raises:
        KeyError: if `user_id` is not in `dataset.u2n`.

    Side effect: leaves `model` in eval mode.
    """
    user_n = dataset.u2n[user_id]
    n2r = {n: recipe_id for recipe_id, n in dataset.r2n.items()}

    # Score on whichever device the model lives on, rather than relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # One [user_n, recipe_n] row per recipe, built without Python-level loops.
    recipe_ns = torch.arange(dataset.n_recipes, dtype=torch.long)
    all_coords = torch.stack((torch.full_like(recipe_ns, user_n), recipe_ns), dim=1)

    ratings = []
    model.eval()
    with torch.no_grad():
        for coords in all_coords.split(batch_size):
            # reshape(-1) keeps a 1-D result even if the model returns a scalar.
            preds = model(coords.to(model_device)).reshape(-1).tolist()
            recipe_ids = (n2r[n] for n in coords[:, 1].tolist())
            ratings.extend(zip(recipe_ids, preds))
    return sorted(ratings, key=lambda pair: pair[1], reverse=True)

In [94]:
# Ten highest-scoring recipes for raw user id 2046.
# NOTE(review): this cell's execution count (94) precedes the training cell's (95),
# so the output shown was presumably produced by an earlier model state - re-run after training.
get_recommendations_for_user(model, ds_full, 2046)[:10]

[(495577, 4.656217098236084),
 (5083, 4.089728832244873),
 (4065, 4.083194255828857),
 (495152, 3.9110963344573975),
 (145478, 3.8353042602539062),
 (125515, 3.7945594787597656),
 (108231, 3.777906894683838),
 (518229, 3.701632499694824),
 (110139, 3.696901321411133),
 (154304, 3.695032835006714)]

In [96]:
# Last ten interaction records.
# NOTE(review): the output includes a column `u` that no cell visible here creates -
# likely leftover kernel state; it would be missing after Restart & Run All.
interactions.tail(10)

Unnamed: 0,user_id,recipe_id,date,rating,u,user_id_n,recipe_id_n
4255,1012958,226362,2010-01-06,5.0,24707,22871,104980
4256,378828,315583,2008-08-29,5.0,24797,20530,104981
4257,1341416,384116,2009-08-19,5.0,24839,23808,104982
4258,77850,469932,2012-09-16,4.0,24846,21689,104983
4259,1608591,31043,2010-09-28,4.0,24848,9165,104984
4260,1468267,192266,2012-04-17,4.0,24877,22623,104985
4261,142796,256343,2007-10-29,5.0,24914,22651,104986
4262,557416,247915,2007-10-25,5.0,25006,23337,104987
4263,218411,116676,2005-09-21,3.0,25008,20817,104988
4264,587445,206493,2008-07-16,5.0,25009,20500,104989


In [58]:
# Interactions of the user with index 0. `user_id_n` only exists because
# RecipeDataset.__init__ wrote it onto `interactions`.
interactions.query("user_id_n == 0")

Unnamed: 0,user_id,recipe_id,date,rating,u,user_id_n,recipe_id_n
0,2046,517,2000-02-25,5.0,22095,0,0
39,2046,3431,2000-04-07,5.0,22095,0,3
47,2046,13307,2000-05-21,5.0,22095,0,4
