In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def unpack_dataset():
    ! wget http://jmcauley.ucsd.edu/data/googlelocal/kcore/NY_5.json.gz
    ! mkdir data
    ! mv NY* data

def readGz(fname):
    gz = gzip.open(fname, 'rb')
    f = io.BufferedReader(gz)
    data = []
        
    for l in f.readlines():
        data.append(eval(l))
        
    gz.close()
    return data

In [3]:
unpack_dataset()


--2020-06-24 05:48:34--  http://jmcauley.ucsd.edu/data/googlelocal/kcore/NY_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11983311 (11M) [application/x-gzip]
Saving to: ‘NY_5.json.gz’


2020-06-24 05:48:35 (8.68 MB/s) - ‘NY_5.json.gz’ saved [11983311/11983311]

mkdir: cannot create directory ‘data’: File exists


In [4]:
ny5 = readGz('data/NY_5.json.gz')

In [5]:
data = pd.DataFrame(ny5)

In [6]:
del ny5

In [7]:
data.shape

(121878, 8)

In [8]:
data.head()

Unnamed: 0,rating,reviewerName,reviewText,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId
0,5.0,Chris Johnson,,"[American Restaurant, Bar]",103654778391814923896,1311510000.0,"Jul 24, 2011",100000524810171549476
1,5.0,Chris Johnson,"service is amazing, the line goes so fast.",[Grocery Store],105947477166033397439,1316105000.0,"Sep 15, 2011",100000524810171549476
2,5.0,Chris Johnson,Get the chicken green salad. Yum.,[Chicken Restaurant],107098981103934500500,1311509000.0,"Jul 24, 2011",100000524810171549476
3,5.0,Chris Johnson,Never had a falafel bar before. Yum. +1 for su...,"[Vegetarian Restaurant, Mediterranean Restaura...",108585910849109169666,1311509000.0,"Jul 24, 2011",100000524810171549476
4,5.0,Chris Johnson,Incredible short rib hash for breakfast,"[French Restaurant, American Restaurant, Wine ...",115453471525181677863,1320496000.0,"Nov 5, 2011",100000524810171549476


In [9]:
# now use user, item, reviewText & ratings
df = data[['gPlusPlaceId', 'gPlusUserId', 'reviewText', 'rating']]
df.columns = ['place', 'user', 'review', 'rating']

In [10]:
del data

In [11]:
df.head()

Unnamed: 0,place,user,review,rating
0,103654778391814923896,100000524810171549476,,5.0
1,105947477166033397439,100000524810171549476,"service is amazing, the line goes so fast.",5.0
2,107098981103934500500,100000524810171549476,Get the chicken green salad. Yum.,5.0
3,108585910849109169666,100000524810171549476,Never had a falafel bar before. Yum. +1 for su...,5.0
4,115453471525181677863,100000524810171549476,Incredible short rib hash for breakfast,5.0


## Split Train/Validation

In [12]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((97502, 4), (24376, 4))

In [13]:
del df

## Encode Places and Users

In [14]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [15]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [16]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(6104, 7230)

In [17]:
train_enc.head()

Unnamed: 0,place,user,review,rating
6345,0,0,"Tiny, and there's often a line so you need to ...",4.0
909,1,1,This is an NYC gem. What could be better than ...,5.0
87628,2,2,When did all these great places appear in Quee...,5.0
46535,3,3,Somehow the Starbucks on Court Street is much ...,3.0
67471,4,4,I have not found another place in the city tha...,5.0


## Get text from reviews, create vocab

In [18]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [19]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(7230, 7182)

In [20]:
# 261 places without any reviews our ot 10476

In [21]:
place_reviews = place_reviews.apply(lambda x: ' '.join([rev for rev in x if rev is not None]))

In [22]:
place_reviews.head(2)

place
0    Tiny, and there's often a line so you need to ...
1    This is an NYC gem. What could be better than ...
Name: review, dtype: object

In [23]:
import re

In [24]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [25]:
tok(place_reviews.iloc[0])[:20]

['tiny',
 ',',
 'and',
 "there's",
 'often',
 'a',
 'line',
 'so',
 'you',
 'need',
 'to',
 'time',
 'this',
 'one',
 'right',
 '.',
 'but',
 "there's",
 'a',
 'line']

In [26]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [27]:
len(all_vocab)

3113883

In [28]:
word_cts = Counter(all_vocab)
len(word_cts)

44971

In [29]:
del all_vocab

In [30]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [31]:
del word_cts, common

In [32]:
len(vocab2idx)

13026

## Review to Vector

In [33]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [34]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

8447

In [35]:
del lens

In [36]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [37]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 789)

## Dataset and Data Loader

In [38]:
class GoogleDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [39]:
train_ds = GoogleDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = GoogleDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [40]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 789, 4.0)

## Load Pre-Trained Glove

In [41]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove*.zip
    ! mv gl* data

    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [42]:
glove = loadGloveModel()

--2020-06-24 05:48:56--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-06-24 05:48:56--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-06-24 05:48:56--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [43]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float16')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [44]:
max([v for k, v in vocab2idx.items()])

13025

In [45]:
W = create_embedding_matrix(glove, vocab2idx)

In [46]:
del glove

## Model

In [47]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V = torch.mean(V, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        
        return (U*V).sum(1)

## Training Functions

In [48]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [49]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [50]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [51]:
def val_metrics(model, valid_dl):
    with torch.no_grad(): # reduce memory
        model.eval()
        correct = 0
        total = 0
        sum_loss = 0.0
        for u, v, v_rev, l, y in valid_dl:
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            #y_pred = y_hat > 0
            #correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train with Glove

In [52]:
len(train_enc)

97502

In [53]:
batch_size = 2_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [54]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(7230, 6104, 13026)

In [55]:
torch.cuda.empty_cache()

In [56]:
model_glove = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False, glove_weights=W).cuda()

In [57]:
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01, weight_decay=1e-5)

In [58]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=5)

train loss 10.459 valid loss 3.109
train loss 2.129 valid loss 1.236
train loss 0.993 valid loss 0.821
train loss 0.757 valid loss 0.703
train loss 0.694 valid loss 0.662


In [59]:
# unfreezing the embeddings
model_glove.embeddings.weight.requires_grad = True
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())

In [60]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=30)

train loss 0.665 valid loss 0.639
train loss 0.647 valid loss 0.630
train loss 0.643 valid loss 0.626
train loss 0.635 valid loss 0.622
train loss 0.631 valid loss 0.620
train loss 0.629 valid loss 0.617
train loss 0.632 valid loss 0.615
train loss 0.624 valid loss 0.614
train loss 0.623 valid loss 0.613
train loss 0.624 valid loss 0.613
train loss 0.626 valid loss 0.611
train loss 0.624 valid loss 0.610
train loss 0.618 valid loss 0.610
train loss 0.622 valid loss 0.608
train loss 0.614 valid loss 0.605
train loss 0.617 valid loss 0.605
train loss 0.614 valid loss 0.606
train loss 0.615 valid loss 0.605
train loss 0.616 valid loss 0.604
train loss 0.614 valid loss 0.604
train loss 0.612 valid loss 0.603
train loss 0.614 valid loss 0.601
train loss 0.615 valid loss 0.602
train loss 0.612 valid loss 0.601
train loss 0.610 valid loss 0.599
train loss 0.610 valid loss 0.599
train loss 0.611 valid loss 0.599
train loss 0.611 valid loss 0.600
train loss 0.611 valid loss 0.599
train loss 0.6