In [6]:
from tqdm import tqdm 
import os 

import numpy as np 
import pandas as pd

import scipy.sparse  as sp 

from sklearn.model_selection import train_test_split 

import torch 
from torch import nn, optim 
from torch.utils.data import Dataset, DataLoader

In [7]:
class args:
    seed = 42
    num_layers = 4
    batch_size= 4096
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    SAVE_PATH = 'Parameters'

In [31]:
d_set = pd.read_csv('dataset/Yelp2018/Yelp2018.csv', encoding='utf-8-sig')

In [54]:
d_train, d_test = train_test_split(d_set, train_size=0.6, random_state=args.seed)
d_valid, d_test = train_test_split(d_test, train_size=0.5, random_state=args.seed)

In [55]:
d_train = d_train.astype({'user_id':'category', 'business_id':'category'})
d_valid = d_valid.astype({'user_id':'category', 'business_id':'category'})
d_test = d_test.astype({'user_id':'category', 'business_id':'category'})

In [56]:
u_cat = d_train.user_id.cat.categories
b_cat = d_train.business_id.cat.categories

In [57]:
d_valid.user_id = d_valid.user_id.cat.set_categories(u_cat)
d_valid.business_id = d_valid.business_id.cat.set_categories(b_cat)

d_test.user_id = d_test.user_id.cat.set_categories(u_cat)
d_test.business_id = d_test.business_id.cat.set_categories(b_cat)

In [58]:
d_train.user_id = d_train.user_id.cat.codes
d_train.business_id = d_train.business_id.cat.codes 

d_valid.user_id = d_valid.user_id.cat.codes
d_valid.business_id = d_valid.business_id.cat.codes 

d_test.user_id = d_test.user_id.cat.codes
d_test.business_id = d_test.business_id.cat.codes 

In [59]:
d_train = d_train.dropna()
d_valid = d_valid.dropna()
d_test = d_test.dropna()

d_train.reset_index(drop=True, inplace=True)
d_valid.reset_index(drop=True, inplace=True)
d_test.reset_index(drop=True, inplace=True)

In [60]:
d_train = d_train.astype({'user_id': int, 'business_id': int})
d_valid = d_valid.astype({'user_id': int, 'business_id': int})
d_test = d_test.astype({'user_id': int, 'business_id': int})

In [61]:
args.num_users = d_train.user_id.max() + 1
args.num_items = d_train.business_id.max() + 1
args.latent_dim = 64
args.num_epochs = 15

In [62]:
class GNNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GNNLayer, self).__init__()
        self.in_feats = in_feats
        self.out_feats = out_feats 

        self.W1 = nn.Linear(in_feats, out_feats)
        self.W2 = nn.Linear(in_feats, out_feats)

        self._init_weight()

    def forward(self, L, SelfLoop, feats):
        # (L+I)EW_1
        sf_L = L + SelfLoop
        L = L.cuda()
        sf_L = sf_L.cuda()
        sf_E = torch.sparse.mm(sf_L, feats)
        left_part = self.W1(sf_E) # left part

        # EL odot EW_2, odot indicates element-wise product 
        LE = torch.sparse.mm(L, feats)
        E = torch.mul(LE, feats)
        right_part = self.W2(E)

        return left_part + right_part 

    def _init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.01)

# class GNNLayer(nn.Module):
#     def __init__(self, in_feats, out_feats):
#         super(GNNLayer, self).__init__()
#         self.in_feats = in_feats 
#         self.out_feats = out_feats 
#         self.linear1 = nn.Linear(self.in_feats, self.out_feats)
#         self.linear2 = nn.Linear(self.in_feats, self.out_feats)
        
#     def forward(self, L, selfLoop, features):
#         L1 = L + selfLoop
#         L2 = L.cuda()
#         L1 = L1.cuda()
#         inter_feats = torch.sparse.mm(L2, features)
#         inter_feats = torch.mul(inter_feats, features)
        
#         inter_part1 = self.linear1(torch.sparse.mm(L1, features))
#         inter_part2 = self.linear2(torch.sparse.mm(L2, inter_feats))
        
#         return inter_part1 + inter_part2 

class NGCF(nn.Module):
    def __init__(self, args, matrix):
        super(NGCF, self).__init__()
        self.num_users = args.num_users 
        self.num_items = args.num_items 
        self.latent_dim = args.latent_dim 
        self.device = args.device

        self.user_emb = nn.Embedding(self.num_users, self.latent_dim)
        self.item_emb = nn.Embedding(self.num_items, self.latent_dim)

        self.num_layers = args.num_layers
        self.L = self.LaplacianMatrix(matrix)
        self.I = self.SelfLoop(self.num_users + self.num_items)

        self.leakyrelu = nn.LeakyReLU()
        self.GNNLayers = nn.ModuleList()
        # self.sigmoid = nn.Sigmoid()

        for i in range(self.num_layers-1):
            self.GNNLayers.append(GNNLayer(self.latent_dim, self.latent_dim))

        self.fc_layer = nn.Sequential(
            nn.Linear(self.latent_dim * self.num_layers * 2, 64), 
            nn.ReLU(), 
            nn.Linear(64, 32), 
            nn.ReLU(), 
            nn.Linear(32, 1)
        )

    def _init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)
            
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.01)


    def SelfLoop(self, num):
        i = torch.LongTensor([[k for k in range(0, num)], [j for j in range(0, num)]])
        val = torch.FloatTensor([1]*num)
        return torch.sparse.FloatTensor(i, val)

    def LaplacianMatrix(self, ratings):
        iids = ratings['business_id'] + self.num_users 
        matrix = sp.coo_matrix((ratings['stars'], (ratings['user_id'], ratings['business_id'])))
        
        upper_matrix = sp.coo_matrix((ratings['stars'], (ratings['user_id'], iids)))
        lower_matrix = matrix.transpose()
        lower_matrix.resize((self.num_items, self.num_users + self.num_items))

        A = sp.vstack([upper_matrix, lower_matrix])
        row_sum = (A > 0).sum(axis=1)
        row_sum = np.array(row_sum).flatten()
        # diag = list(np.array(row_sum.flatten())[0])
        D = np.power(row_sum, -0.5)
        D = sp.diags(D)
        L = D * A * D
        L = sp.coo_matrix(L)
        row = L.row 
        col = L.col
        idx = np.stack([row, col])
        idx = torch.LongTensor(idx)
        data = torch.FloatTensor(L.data)
        SparseL = torch.sparse.FloatTensor(idx, data)
        return SparseL 

    def FeatureMatrix(self):
        uids = torch.LongTensor([i for i in range(self.num_users)]).to(self.device)
        iids = torch.LongTensor([i for i in range(self.num_items)]).to(self.device)
        user_emb = self.user_emb(uids)
        item_emb = self.item_emb(iids)
        features = torch.cat([user_emb, item_emb], dim=0)
        return features

    def forward(self, uids, iids):
        iids = self.num_users + iids 

        features = self.FeatureMatrix()
        final_emb = features.clone()

        for gnn in self.GNNLayers:
            features = gnn(self.L, self.I, features)
            features = self.leakyrelu(features)
            final_emb = torch.concat([final_emb, features],dim=-1)

        user_emb = final_emb[uids]
        item_emb = final_emb[iids]

        inputs = torch.concat([user_emb, item_emb], dim=-1)
        outs = self.fc_layer(inputs)
        # outs = self.sigmoid(outs)
        return outs.flatten()

In [63]:
class GraphDataset(Dataset):
    def __init__(self, dataframe):
        super(Dataset, self).__init__()
        
        self.uid = list(dataframe['user_id'])
        self.iid = list(dataframe['business_id'])
        self.ratings = list(dataframe['stars'])
    
    def __len__(self):
        return len(self.uid)
    
    def __getitem__(self, idx):
        uid = self.uid[idx]
        iid = self.iid[idx]
        rating = self.ratings[idx]
        
        return (uid, iid, rating)

In [64]:
def get_loader(args, dataset, num_workers):
    d_set = GraphDataset(dataset)
    return DataLoader(d_set, batch_size=args.batch_size, num_workers=num_workers)

In [65]:
train_loader = get_loader(args, d_train, 4)
valid_loader = get_loader(args, d_valid, 4)
test_loader = get_loader(args, d_test, 4)

In [66]:
def graph_evaluate(args, model, test_loader, criterion):
    output = []
    test_loss = 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='evaluating...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            gold_y = batch[2].float()
            
            pred_y = model(**inputs)
            output.append(pred_y)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            test_loss += loss.item()
    test_loss /= len(test_loader)
    return test_loss, output

In [67]:
def graph_train(args, model, train_loader, valid_loader, optimizer, criterion):
    best_loss = float('inf')
    train_losses, valid_losses = [], []
    for epoch in range(1, args.num_epochs + 1):
        train_loss = 0.0

        model.train()
        for batch in tqdm(train_loader, desc='training...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            
            gold_y = batch[2].float()
            

            pred_y = model(**inputs)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        valid_loss , outputs = graph_evaluate(args, model, valid_loader, criterion)
        valid_losses.append(valid_loss)
        

        print(f'Epoch: [{epoch}/{args.num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}\tValid Loss: {valid_loss:.4f}')

        if best_loss > valid_loss:
            best_loss = valid_loss
            if not os.path.exists(args.SAVE_PATH):
                os.makedirs(args.SAVE_PATH)
            torch.save(model.state_dict(), os.path.join(args.SAVE_PATH, f'{model._get_name()}_parameters.pt'))

    return {
        'train_loss': train_losses, 
        'valid_loss': valid_losses
    }, outputs

In [68]:
models = NGCF(args, d_train).to(args.device)

optimizer = optim.Adam(models.parameters(), lr = 1e-4)
criterion = nn.L1Loss()

In [69]:
results = graph_train(args, models, train_loader, valid_loader, optimizer, criterion)

training...: 100%|██████████| 177/177 [00:38<00:00,  4.56it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.46it/s]


Epoch: [1/15]
Train Loss: 1.4006	Valid Loss: 0.9906


training...: 100%|██████████| 177/177 [00:39<00:00,  4.49it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.45it/s]


Epoch: [2/15]
Train Loss: 0.9325	Valid Loss: 0.9288


training...: 100%|██████████| 177/177 [00:38<00:00,  4.55it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.45it/s]


Epoch: [3/15]
Train Loss: 0.9168	Valid Loss: 0.9246


training...: 100%|██████████| 177/177 [00:37<00:00,  4.67it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.54it/s]


Epoch: [4/15]
Train Loss: 0.9139	Valid Loss: 0.9227


training...: 100%|██████████| 177/177 [00:38<00:00,  4.65it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.46it/s]


Epoch: [5/15]
Train Loss: 0.9115	Valid Loss: 0.9209


training...: 100%|██████████| 177/177 [00:38<00:00,  4.57it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.39it/s]


Epoch: [6/15]
Train Loss: 0.9088	Valid Loss: 0.9192


training...: 100%|██████████| 177/177 [00:38<00:00,  4.56it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.45it/s]


Epoch: [7/15]
Train Loss: 0.9056	Valid Loss: 0.9172


training...: 100%|██████████| 177/177 [00:38<00:00,  4.58it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.49it/s]


Epoch: [8/15]
Train Loss: 0.9020	Valid Loss: 0.9152


training...: 100%|██████████| 177/177 [00:39<00:00,  4.53it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.50it/s]


Epoch: [9/15]
Train Loss: 0.8983	Valid Loss: 0.9134


training...: 100%|██████████| 177/177 [00:39<00:00,  4.49it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.13it/s]


Epoch: [10/15]
Train Loss: 0.8946	Valid Loss: 0.9117


training...: 100%|██████████| 177/177 [00:39<00:00,  4.45it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.08it/s]


Epoch: [11/15]
Train Loss: 0.8910	Valid Loss: 0.9105


training...: 100%|██████████| 177/177 [00:39<00:00,  4.47it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.19it/s]


Epoch: [12/15]
Train Loss: 0.8878	Valid Loss: 0.9092


training...: 100%|██████████| 177/177 [00:39<00:00,  4.51it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.40it/s]


Epoch: [13/15]
Train Loss: 0.8848	Valid Loss: 0.9082


training...: 100%|██████████| 177/177 [00:39<00:00,  4.52it/s]
evaluating...: 100%|██████████| 59/59 [00:09<00:00,  6.18it/s]


Epoch: [14/15]
Train Loss: 0.8820	Valid Loss: 0.9072


training...: 100%|██████████| 177/177 [00:41<00:00,  4.23it/s]
evaluating...: 100%|██████████| 59/59 [00:10<00:00,  5.88it/s]


Epoch: [15/15]
Train Loss: 0.8794	Valid Loss: 0.9062


In [70]:
inference_results = graph_evaluate(args, models, test_loader, criterion)

evaluating...: 100%|██████████| 59/59 [00:10<00:00,  5.73it/s]


In [71]:
outs, yhat = inference_results
yhat = torch.concat(yhat, dim=0).detach().cpu()

In [72]:
d_test.loc[:, 'yhat'] = yhat 

In [73]:
def sentiment_score(x):
    if x >= 3.5 : return 1
    elif x < 3.5 : return 0

In [74]:
d_test.loc[:, 'stars'] = d_test.loc[:, 'stars'].apply(sentiment_score)

In [75]:
def dcg(label, k):
    label = np.asfarray(label)[:k]
    if label.size:
        return label[0] + np.sum(label[1:] / np.log2(np.arange(2, label.size + 1)))

    return 0

def ndcg(dataframe, k):
    ndcg_list = []
    for uid in dataframe.user_id.unique():
        label_temp = dataframe.loc[dataframe.user_id == uid]['stars'].tolist()

        idcg = dcg(sorted(label_temp, reverse=True), k)

        if not idcg:
            return 0 

        ndcg_list.append(dcg(label_temp, k) / idcg)
    return np.mean(ndcg_list)

In [76]:
def sa_metrics(dataframe, top_k):
    # metrics for Sentiment Analysis
    item = dataframe.groupby(['user_id'])['stars'].sum()
    precision_k, recall_k, f1_k, ndcg_k = [], [], [], []
    for k in top_k:
        precision, recall, f1_score, ndcg_score = [], [], [], []
        for uid in tqdm(dataframe.loc[:, 'user_id'].unique(), desc=f'Tok@{k} evaluating..'):
            new_df = dataframe.loc[dataframe.loc[:, 'user_id'] == uid].copy()

            new_df = new_df.sort_values(by = ['yhat'], ascending=False).head(k)
            pr_temp = sum(new_df.loc[:, 'stars']) / k 
            re_temp = sum(new_df.loc[:, 'stars']) / item[uid] if item[uid] != 0 else 0 
            pr_re = pr_temp + re_temp 
            f1_temp = (2 * pr_temp * re_temp) / pr_re if pr_re != 0 else 0
            precision.append(pr_temp)
            recall.append(re_temp)
            f1_score.append(f1_temp)
            ndcg_score.append(ndcg(new_df, k))
        
        precision_k.append(np.mean(precision))
        recall_k.append(np.mean(recall))
        f1_k.append(np.mean(f1_score))
        ndcg_k.append(np.mean(ndcg_score))

    outputs = pd.DataFrame({
        'recall': recall_k, 
        'precision': precision_k, 
        'f1_score': f1_k, 
        'ndcg': ndcg_k
    }, index=top_k)
    return outputs 

In [77]:
ngcf_results = sa_metrics(d_test, [10, 20, 40, 50, 80, 100])

Tok@10 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1267.22it/s]
Tok@20 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1282.35it/s]
Tok@40 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1273.14it/s]
Tok@50 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1274.96it/s]
Tok@80 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1267.26it/s]
Tok@100 evaluating..: 100%|██████████| 25307/25307 [00:19<00:00, 1270.00it/s]


In [78]:
ngcf_results

Unnamed: 0,recall,precision,f1_score,ndcg
10,0.889048,0.501308,0.576639,0.893569
20,0.956685,0.302053,0.414979,0.892741
40,0.974539,0.163428,0.257015,0.892621
50,0.976297,0.132321,0.215552,0.892611
80,0.977881,0.083916,0.14533,0.892603
100,0.978195,0.067402,0.119485,0.892601
