In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def unpack_dataset():
    ! wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
    ! mkdir data
    ! mv Gro* data

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
unpack_dataset()


--2020-06-24 05:30:41--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 146631394 (140M) [application/octet-stream]
Saving to: ‘Grocery_and_Gourmet_Food_5.json.gz’


2020-06-24 05:30:49 (18.8 MB/s) - ‘Grocery_and_Gourmet_Food_5.json.gz’ saved [146631394/146631394]

mkdir: cannot create directory ‘data’: File exists


In [4]:
data = getDF('data/Grocery_and_Gourmet_Food_5.json.gz')

In [5]:
data.shape

(1143860, 12)

In [6]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,
2,5.0,True,"11 21, 2015",A32RD6L701BIGP,4639725183,Krystal Clifton,"If you like strong tea, this is for you. It mi...",Strong,1448064000,,,
3,5.0,True,"08 12, 2015",A2UY1O1FBGKIE6,4639725183,U. Kane,Love the tea. The flavor is way better than th...,Great tea,1439337600,,,
4,5.0,True,"05 28, 2015",A3QHVBQYDV7Z6U,4639725183,The Nana,I have searched everywhere until I browsed Ama...,This is the tea I remembered!,1432771200,,,


In [7]:
np.unique(data.reviewTime.str[-4:])#.str.contains('2014')

array(['2000', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'], dtype=object)

In [8]:
data = data.loc[(data.reviewTime.str.contains('2018'))] #| (data.reviewTime.str.contains('2016')) | (data.reviewTime.str.contains('2017')) | (data.reviewTime.str.contains('2018'))]

In [9]:
data.shape

(98253, 12)

In [10]:
# now use user, item, reviewText & ratings
df = data[['reviewerID', 'asin', 'reviewText', 'overall']]
df.columns = ['place', 'user', 'review', 'rating']

In [11]:
del data

In [12]:
df.head()

Unnamed: 0,place,user,review,rating
50,A1U32SY2BN1I4T,9742356831,Use very sparingly unless you love very spicy ...,5.0
51,A2ULESH6GADUEW,9742356831,I LOVE this curry paste. It beats Thai Kitche...,5.0
52,A1TI6UAU422P4Y,9742356831,"Has good flavor, but flavor should be stronger...",3.0
53,A3VG446SHJH37O,9742356831,Absolutely LOVE this!!! It's spicy but sooooo ...,5.0
509,A2ZKFKG2PMJA3A,B00008RCN8,tastes good,5.0


## Split Train/Validation

In [13]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((78602, 4), (19651, 4))

In [14]:
del df

## Encode Places and Users

In [15]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [16]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [17]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(19077, 32844)

In [18]:
train_enc.head()

Unnamed: 0,place,user,review,rating
1065889,0,0,I was very disappointed in this mix! I have u...,3.0
264018,1,1,"These aren't as yummy as regular granola bars,...",4.0
891366,2,2,"My bought it as a gift for my in laws,and she ...",5.0
1098870,3,3,Taste great,5.0
1138760,4,4,I bought ONE ORGANIC stevia to compare to thre...,5.0


## Get text from reviews, create vocab

In [19]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [20]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(32844, 32844)

In [21]:
# 261 places without any reviews our ot 10476
' '.join(place_reviews[0])

"I was very disappointed in this mix!  I have used other Krusteaz products, and enjoyed them, but this just was a big let down.  I got a new waffle maker and was all jazzed to try it out with this mix.  After a couple of tries, I got the perfect golden brown waffle, and took a taste.  The waffle was crisp, but there really was no taste to it.  It was like eating cardboard, albeit crisp and hot.  I recommend that you go to allrecipes, etc. and find a REAL recipe for waffles, because this was a BIG disappointment. I found this coffee to be a little bitter, but that is just my opinion.  Taste is subjective, and I like the House Blend better.  That said, if you like darker roast coffee, this may be perfect for you. I love this added to scrambled eggs to make them just a little more moist.  I gather my own farm-raised eggs, so this is a big deal to me.  I also use it to keep my lips from drying out.  This is really a versatile product--you can use it for frying, baking, etc. and also as a b

In [22]:
place_reviews = place_reviews.apply(lambda x: ' '.join([str(rev) for rev in x]))

In [23]:
place_reviews.head(2)

place
0    I was very disappointed in this mix!  I have u...
1    These aren't as yummy as regular granola bars,...
Name: review, dtype: object

In [24]:
import re

In [25]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [26]:
tok(place_reviews.iloc[0])[:20]

['i',
 'was',
 'very',
 'disappointed',
 'in',
 'this',
 'mix',
 '!',
 'i',
 'have',
 'used',
 'other',
 'krusteaz',
 'products',
 ',',
 'and',
 'enjoyed',
 'them',
 ',',
 'but']

In [27]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [28]:
len(all_vocab)

2353649

In [29]:
word_cts = Counter(all_vocab)
len(word_cts)

28075

In [30]:
del all_vocab

In [31]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [32]:
del word_cts, common

In [33]:
len(vocab2idx)

9413

## Review to Vector

In [34]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [35]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

4158

In [36]:
del lens

In [37]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [38]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 432)

## Dataset and Data Loader

In [39]:
class AmazonDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [40]:
train_ds = AmazonDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = AmazonDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [41]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 432, 3.0)

## Load Pre-Trained Glove

In [42]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove*.zip
    ! mv gl* data

    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [43]:
#glove = loadGloveModel()

In [44]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [45]:
#max([v for k, v in vocab2idx.items()])

In [46]:
#W = create_embedding_matrix(glove, vocab2idx)

In [47]:
#del glove

## Model

In [48]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V_ = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V_ = torch.mean(V_, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V_).sum(1) +  b_u  + b_v
        
        return (U*V_).sum(1)

## Training Functions

In [49]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [50]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [51]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [52]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, v_rev, l, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        review_vec = v_rev.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places, review_vec, l)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [53]:
len(train_enc)

78602

In [54]:
batch_size = 1_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [55]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(32844, 19077, 9413)

In [56]:
torch.cuda.empty_cache()

In [57]:
model = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False).cuda()

In [58]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [59]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=20)

train loss 16.747 valid loss 11.602
train loss 9.965 valid loss 7.312
train loss 6.725 valid loss 5.149
train loss 4.863 valid loss 3.866
train loss 3.664 valid loss 3.037
train loss 2.853 valid loss 2.483
train loss 2.288 valid loss 2.101
train loss 1.881 valid loss 1.829
train loss 1.589 valid loss 1.640
train loss 1.373 valid loss 1.502
train loss 1.216 valid loss 1.406
train loss 1.097 valid loss 1.334
train loss 1.009 valid loss 1.285
train loss 0.942 valid loss 1.245
train loss 0.892 valid loss 1.217
train loss 0.847 valid loss 1.195
train loss 0.819 valid loss 1.180
train loss 0.793 valid loss 1.172
train loss 0.772 valid loss 1.166
train loss 0.758 valid loss 1.156


In [60]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.001, epochs=10)

train loss 0.722 valid loss 1.154
train loss 0.716 valid loss 1.152
train loss 0.714 valid loss 1.150
train loss 0.713 valid loss 1.149
train loss 0.709 valid loss 1.148
train loss 0.710 valid loss 1.147
train loss 0.701 valid loss 1.146
train loss 0.703 valid loss 1.145
train loss 0.700 valid loss 1.145
train loss 0.700 valid loss 1.144
