In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def readGz(fname):
    gz = gzip.open(fname, 'rb')
    f = io.BufferedReader(gz)
    data = []
        
    for l in f.readlines():
        data.append(eval(l))
        
    gz.close()
    return data

In [3]:
ca5 = readGz('data/CA_5.json.gz')

In [4]:
data = pd.DataFrame(ca5)

In [5]:
del ca5

In [6]:
data.shape

(163408, 8)

In [7]:
data.head()

Unnamed: 0,rating,reviewerName,reviewText,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId
0,4.0,Mary Gainza,"Gap always has a jean that fits awesomely, i w...","[Clothing Store, Women's Clothing Store, Child...",100556368174926958612,1355437000.0,"Dec 13, 2012",100000715097692381911
1,4.0,Mary Gainza,"Madewell girls are always nice and smiley, i r...",[Women's Clothing Store],101139208042040218757,1355437000.0,"Dec 13, 2012",100000715097692381911
2,4.0,Mary Gainza,I like the wide range macy's has in their stor...,"[Department Store, Men's Clothing Store]",103732895939896117386,1355437000.0,"Dec 13, 2012",100000715097692381911
3,5.0,Mary Gainza,"I love the guys that work at this store, they ...",[Juice Shop],105777991601449500815,1355437000.0,"Dec 13, 2012",100000715097692381911
4,5.0,Mary Gainza,I love Specialty's they have the best semi-swe...,[Cafe],109907393708736192804,1355437000.0,"Dec 13, 2012",100000715097692381911


In [8]:
# now use user, item, reviewText & ratings
df = data[['gPlusPlaceId', 'gPlusUserId', 'reviewText', 'rating']]
df.columns = ['place', 'user', 'review', 'rating']

In [9]:
del data

In [10]:
df.head()

Unnamed: 0,place,user,review,rating
0,100556368174926958612,100000715097692381911,"Gap always has a jean that fits awesomely, i w...",4.0
1,101139208042040218757,100000715097692381911,"Madewell girls are always nice and smiley, i r...",4.0
2,103732895939896117386,100000715097692381911,I like the wide range macy's has in their stor...,4.0
3,105777991601449500815,100000715097692381911,"I love the guys that work at this store, they ...",5.0
4,109907393708736192804,100000715097692381911,I love Specialty's they have the best semi-swe...,5.0


## Split Train/Validation

In [11]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((130726, 4), (32682, 4))

In [12]:
del df

## Encode Places and Users

In [13]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [14]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [15]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(9177, 10477)

In [16]:
train_enc.head()

Unnamed: 0,place,user,review,rating
83341,0,0,the best british pub in la,4.0
12398,1,1,,5.0
41876,2,2,,4.0
139598,3,3,"Can you say unlimiited soup, salad and breadst...",4.0
72156,4,4,,4.0


## Get text from reviews, create vocab

In [17]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [18]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(10477, 10232)

In [19]:
# 261 places without any reviews our ot 10476

In [20]:
place_reviews = place_reviews.apply(lambda x: ' '.join([rev for rev in x if rev is not None]))

In [21]:
place_reviews.head(2)

place
0    the best british pub in la Good pub Great pub ...
1    What an amazing view of the City!  I brought m...
Name: review, dtype: object

In [22]:
import re

In [23]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [24]:
tok(place_reviews.iloc[0])[:20]

['the',
 'best',
 'british',
 'pub',
 'in',
 'la',
 'good',
 'pub',
 'great',
 'pub',
 'experience',
 '.',
 'its',
 'open',
 ',',
 'friendly',
 'and',
 'never',
 'crowded',
 '.']

In [25]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [26]:
len(all_vocab)

2993907

In [27]:
word_cts = Counter(all_vocab)
len(word_cts)

45623

In [28]:
del all_vocab

In [29]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [30]:
del word_cts, common

In [31]:
len(vocab2idx)

13049

## Review to Vector

In [32]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [33]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

6237

In [34]:
del lens

In [35]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [36]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 191)

## Dataset and Data Loader

In [37]:
class GoogleDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [38]:
train_ds = GoogleDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = GoogleDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [39]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 191, 4.0)

## Load Pre-Trained Glove

In [40]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [41]:
glove = loadGloveModel()

In [42]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [43]:
max([v for k, v in vocab2idx.items()])

13048

In [44]:
W = create_embedding_matrix(glove, vocab2idx)

In [45]:
del glove

## Model

In [46]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V_ = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V_ = torch.mean(V_, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V_).sum(1) +  b_u  + b_v
        
        return (U*V_).sum(1)

## Training Functions

In [47]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [48]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [49]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [50]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, v_rev, l, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        review_vec = v_rev.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places, review_vec, l)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [51]:
len(train_enc)

130726

In [52]:
batch_size = 2_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [53]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(10477, 9177, 13049)

In [54]:
torch.cuda.empty_cache()

In [55]:
model = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False).cuda()

In [56]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [None]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=20)

train loss 11.897 valid loss 6.826
train loss 4.623 valid loss 3.264
train loss 2.355 valid loss 1.969
train loss 1.481 valid loss 1.379
train loss 1.092 valid loss 1.082
