In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def unpack_dataset():
    ! wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
    ! mkdir data
    ! mv Gro* data

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
unpack_dataset()


--2020-06-24 06:09:47--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 146631394 (140M) [application/octet-stream]
Saving to: ‘Grocery_and_Gourmet_Food_5.json.gz’


2020-06-24 06:09:56 (17.3 MB/s) - ‘Grocery_and_Gourmet_Food_5.json.gz’ saved [146631394/146631394]



In [4]:
data = getDF('data/Grocery_and_Gourmet_Food_5.json.gz')


In [6]:
data = data.loc[(data.reviewTime.str.contains('2018'))] #| (data.reviewTime.str.contains('2016')) | (data.reviewTime.str.contains('2017')) | (data.reviewTime.str.contains('2018'))]


In [7]:
data.shape

(98253, 12)

In [8]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
50,5.0,True,"03 20, 2018",A1U32SY2BN1I4T,9742356831,R. Barath,Use very sparingly unless you love very spicy ...,Five Stars,1521504000,,,
51,5.0,True,"03 15, 2018",A2ULESH6GADUEW,9742356831,Stetson and Megan Jenkins,I LOVE this curry paste. It beats Thai Kitche...,"Best curry paste HANDS DOWN, plus, it'll last ...",1521072000,,,
52,3.0,True,"03 13, 2018",A1TI6UAU422P4Y,9742356831,E. Scott,"Has good flavor, but flavor should be stronger...",Three Stars,1520899200,,,
53,5.0,True,"01 11, 2018",A3VG446SHJH37O,9742356831,PLee,Absolutely LOVE this!!! It's spicy but sooooo ...,LOVE!,1515628800,,,
509,5.0,True,"04 12, 2018",A2ZKFKG2PMJA3A,B00008RCN8,tr@st,tastes good,freshhhh,1523491200,,,


In [10]:
# now use user, item, reviewText & ratings
df = data[['asin', 'reviewerID', 'reviewText', 'overall']]
df.columns = ['place', 'user', 'review', 'rating']

In [11]:
del data

In [12]:
df.head()

Unnamed: 0,place,user,review,rating
50,9742356831,A1U32SY2BN1I4T,Use very sparingly unless you love very spicy ...,5.0
51,9742356831,A2ULESH6GADUEW,I LOVE this curry paste. It beats Thai Kitche...,5.0
52,9742356831,A1TI6UAU422P4Y,"Has good flavor, but flavor should be stronger...",3.0
53,9742356831,A3VG446SHJH37O,Absolutely LOVE this!!! It's spicy but sooooo ...,5.0
509,B00008RCN8,A2ZKFKG2PMJA3A,tastes good,5.0


## Split Train/Validation

In [13]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((78602, 4), (19651, 4))

In [14]:
del df

## Encode Places and Users

In [15]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [16]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [17]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(32931, 19020)

In [18]:
train_enc.head()

Unnamed: 0,place,user,review,rating
951119,0,0,very bitter,3.0
1141375,1,1,The best of Nature Valley flavors. Good for br...,5.0
40189,2,2,This is yummy with 8-12 oz of almond milk,5.0
1137552,3,3,I love this bread! Nice and fresh!,5.0
1006747,4,4,"Very good Quinoa - fully pre rinsed, I just co...",5.0


## Get text from reviews, create vocab

In [19]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [20]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(19020, 19020)

In [21]:
# 261 places without any reviews our ot 10476

In [28]:
place_reviews = place_reviews.apply(lambda x: ' '.join([str(rev) for rev in x]))

In [29]:
place_reviews.head(2)

place
0    very bitter SUPER !!!!!! not bold enought for ...
1    The best of Nature Valley flavors. Good for br...
Name: review, dtype: object

In [30]:
import re

In [31]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [32]:
tok(place_reviews.iloc[0])[:20]

['very',
 'bitter',
 'super',
 '!',
 '!',
 '!',
 '!',
 '!',
 '!',
 'not',
 'bold',
 'enought',
 'for',
 'me',
 'great',
 'coffee',
 '.',
 'best',
 'drip',
 'coffee']

In [33]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [34]:
len(all_vocab)

2357689

In [35]:
word_cts = Counter(all_vocab)
len(word_cts)

28121

In [36]:
del all_vocab

In [37]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [38]:
del word_cts, common

In [39]:
len(vocab2idx)

9430

## Review to Vector

In [40]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [41]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

12498

In [42]:
del lens

In [43]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [44]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 1419)

## Dataset and Data Loader

In [45]:
class AmazonDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [46]:
train_ds = AmazonDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = AmazonDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [47]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 1419, 3.0)

## Load Pre-Trained Glove

In [48]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove*.zip
    ! mv gl* data

    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [49]:
glove = loadGloveModel()

--2020-06-24 06:12:56--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-06-24 06:12:56--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-06-24 06:12:57--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [50]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float16')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [51]:
max([v for k, v in vocab2idx.items()])

9429

In [52]:
W = create_embedding_matrix(glove, vocab2idx)

In [53]:
del glove

## Model

In [54]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V = torch.mean(V, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        
        return (U*V).sum(1)

## Training Functions

In [55]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [56]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [57]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [58]:
def val_metrics(model, valid_dl):
    with torch.no_grad(): # reduce memory
        model.eval()
        correct = 0
        total = 0
        sum_loss = 0.0
        for u, v, v_rev, l, y in valid_dl:
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            #y_pred = y_hat > 0
            #correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train with Glove

In [59]:
len(train_enc)

78602

In [60]:
batch_size = 2_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [61]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(19020, 32931, 9430)

In [62]:
torch.cuda.empty_cache()

In [63]:
model_glove = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False, glove_weights=W).cuda()

In [64]:
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01, weight_decay=1e-5)

In [65]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=5)

train loss 18.620 valid loss 13.990
train loss 11.792 valid loss 8.230
train loss 6.819 valid loss 4.851
train loss 4.052 valid loss 3.099
train loss 2.591 valid loss 2.197


In [66]:
# unfreezing the embeddings
model_glove.embeddings.weight.requires_grad = True
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())

In [67]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=30)

train loss 1.805 valid loss 1.938
train loss 1.556 valid loss 1.745
train loss 1.360 valid loss 1.593
train loss 1.196 valid loss 1.484
train loss 1.075 valid loss 1.398
train loss 0.976 valid loss 1.336
train loss 0.902 valid loss 1.289
train loss 0.838 valid loss 1.254
train loss 0.791 valid loss 1.226
train loss 0.754 valid loss 1.204
train loss 0.723 valid loss 1.193
train loss 0.694 valid loss 1.185
train loss 0.679 valid loss 1.172
train loss 0.663 valid loss 1.164
train loss 0.651 valid loss 1.163
train loss 0.638 valid loss 1.157
train loss 0.631 valid loss 1.155
train loss 0.627 valid loss 1.150
train loss 0.620 valid loss 1.147
train loss 0.609 valid loss 1.147
train loss 0.605 valid loss 1.148
train loss 0.607 valid loss 1.151
train loss 0.600 valid loss 1.149
train loss 0.598 valid loss 1.146
train loss 0.594 valid loss 1.144
train loss 0.589 valid loss 1.146
train loss 0.589 valid loss 1.143
train loss 0.585 valid loss 1.143
train loss 0.580 valid loss 1.142
train loss 0.5