Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# This notebook uses the pre-processed Reddit user-behavior data to train ELAND

## Table of contents

1. Loading data
2. Setting up the model trainer
3. Model training

In [1]:
import os
import pickle
import sys
import json
import math
import logging
import pickle as pk
from collections import Counter
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix, coo_matrix
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import MSELoss, CosineEmbeddingLoss
from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score, roc_auc_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


### 1. Loading data 

### User labels

In [2]:
user_label = pd.read_csv("../../data/02_intermediate/user_behavior/user_labels.csv")

In [3]:
user_label.head(10)

Unnamed: 0,author,label
0,ultimatt42,0
1,jonknee,0
2,dons,0
3,Jedravent,0
4,burtonmkz,0
5,pavel_lishin,0
6,sblinn,0
7,WebZen,0
8,doodahdei,0
9,Tack122,0


#### User and subreddit topic index

In [4]:
with open("../../data/02_intermediate/user_behavior/u2index.pkl","rb") as f:
    u2index = pickle.load(f)

In [5]:
with open("../../data/02_intermediate/user_behavior/p2index.pkl","rb") as f:
    p2index = pickle.load(f)

#### Edge list data 

In [6]:
edgelist_df = pd.read_csv("../../data/02_intermediate/user_behavior/edge_list.csv")

In [7]:
edgelist_df.head(10)

Unnamed: 0,author,subreddit,retrieved_on
0,ultimatt42,science,1425846806
1,jonknee,programming,1425846807
2,burtonmkz,science,1425846810
3,pavel_lishin,reddit.com,1425846810
4,pavel_lishin,reddit.com,1425846810
5,sblinn,politics,1425846810
6,dons,programming,1425846811
7,Jedravent,politics,1425846811
8,WebZen,politics,1425846811
9,doodahdei,politics,1425846812


In [8]:
from scipy.sparse import csr_matrix, coo_matrix
def process_edgelist(edge_list, u2index, p2index):
    """Build a weighted user-by-subreddit interaction matrix from an edge list.

    Each row of ``edge_list`` supplies an ``author``, a ``subreddit`` and a
    ``retrieved_on`` value. Repeated (author, subreddit) pairs are counted and
    the count becomes the matrix entry at
    ``(u2index[author], p2index[subreddit])``.

    Returns a ``csr_matrix`` of shape ``(len(u2index), len(p2index))``.
    """
    pair_counts = Counter()

    for label, record in edge_list.iterrows():
        author, subreddit, stamp = record['author'], record['subreddit'], record['retrieved_on']

        # Echo rows whose index label is below 1 as a quick sanity check.
        if label < 1:
            print(author, subreddit, stamp)
        pair_counts[(u2index[author], p2index[subreddit])] += 1

    # Unzip the counted pairs into the coordinate lists scipy expects.
    user_ids = [pair[0] for pair in pair_counts]
    item_ids = [pair[1] for pair in pair_counts]
    weights = list(pair_counts.values())

    return csr_matrix(
        (weights, (user_ids, item_ids)),
        shape=(len(u2index), len(p2index)),
    )

In [9]:
graph = process_edgelist(edgelist_df, u2index, p2index)

ultimatt42 science 1425846806


In [10]:
type(graph)

scipy.sparse.csr.csr_matrix

#### Train/validation/test id split

In [11]:
with open("../../data/02_intermediate/user_behavior/data_tvt.pkl","rb") as f:
    tvt_idx = pickle.load(f)

In [12]:
idx_train, idx_val, idx_test = tvt_idx

In [13]:
idx_train.shape, idx_val.shape, idx_test.shape

((195,), (198,), (393,))

#### Convert label format (to numpy array)

In [14]:
def process_label(labels: pd.DataFrame, user_index=None) -> np.ndarray:
    """Convert a user-label table into a dense 0/1 label vector.

    Args:
        labels: DataFrame with an ``author`` column and a ``label`` column.
            Rows with label 1 are positive users; rows with label 0 or 1
            count as labeled users.
        user_index: Mapping from author name to row position in the output
            vector. When omitted, the notebook-level ``u2index`` mapping is
            used, which keeps the original one-argument call working.

    Returns:
        Integer array of length ``len(user_index)`` holding 1 at the position
        of every positive user present in the mapping and 0 elsewhere.
    """
    if user_index is None:
        # Backward-compatible fallback to the mapping loaded earlier in the notebook.
        user_index = u2index

    authors = labels['author']
    flags = labels['label']
    pos_uids = set(authors[flags == 1])
    labeled_uids = pos_uids | set(authors[flags == 0])
    print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')

    encoded = np.zeros(len(user_index), dtype=int)
    for uid in pos_uids:
        # Positive users missing from the mapping are ignored, as before.
        position = user_index.get(uid)
        if position is not None:
            encoded[position] = 1
    return encoded

In [15]:
labels = process_label(user_label)

loaded labels, total of 327 positive users and 787 labeled users


In [16]:
# Report size and class balance for each of the train/val/test splits.
split_reports = (
    ('Train: total of {:5} users with {:5} pos users and {:5} neg users', idx_train),
    ('Val:   total of {:5} users with {:5} pos users and {:5} neg users', idx_val),
    ('Test:  total of {:5} users with {:5} pos users and {:5} neg users', idx_test),
)
for template, split_idx in split_reports:
    n_total = len(split_idx)
    n_pos = np.sum(labels[split_idx])
    print(template.format(n_total, n_pos, n_total - n_pos))

Train: total of   195 users with    75 pos users and   120 neg users
Val:   total of   198 users with    87 pos users and   111 neg users
Test:  total of   393 users with   165 pos users and   228 neg users


In [17]:
user_features = np.load("../../data/02_intermediate/user_behavior/user2vec_npy.npz")
print(user_features['data'].shape)

(787, 300)


In [18]:
item_features = np.load("../../data/02_intermediate/user_behavior/prod2vec_npy.npz")
print(item_features['data'].shape)

(47, 300)


### 2. Setting up the model trainer 

In [19]:
sys.path.append('../../src/')

In [20]:
from anomaly_detection_spatial_temporal_data.model.data_loader import DynamicGraphWNFDataSet, DynamicGraphWNodeFeatDatasetLoader
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Eland_e2e
from anomaly_detection_spatial_temporal_data.model.model_config import ElandConfig

#### Set up dataloader

In [21]:
# Bundle the raw inputs (label frame, user/subreddit index maps, edge list,
# train/val/test split and the two feature matrices) into the project loader.
# Arguments are positional, so their order must match the loader's signature.
data_loader = DynamicGraphWNodeFeatDatasetLoader(
    user_label, 
    u2index, 
    p2index, 
    edgelist_df, 
    tvt_idx, 
    user_features['data'], 
    item_features['data']
)

# Sequential data loader: wraps the dataset built from the subreddit index,
# item features and edge list in a torch DataLoader with batches of up to 300.
# NOTE(review): batch_size is hard-coded here; the YAML config loaded later in
# this notebook also carries batch_size 300 — confirm they should stay in sync.
dataset = DynamicGraphWNFDataSet(p2index, item_features['data'], edgelist_df)
lstm_dataloader = DataLoader(dataset, batch_size=300)
    

loaded labels, total of 327 positive users and 787 labeled users
Train: total of   195 users with    75 pos users and   120 neg users
Val:   total of   198 users with    87 pos users and   111 neg users
Test:  total of   393 users with   165 pos users and   228 neg users


In [22]:
data_dict = {
        'graph': data_loader.graph, 
        'lstm_dataloader': lstm_dataloader,
        'user_features': data_loader.user_features,
        'item_features': data_loader.item_features,
        'labels': data_loader.labels,
        'tvt_nids': data_loader.tvt_idx,
        'u2index': data_loader.u2index,
        'p2index': data_loader.p2index
    }


#### Load model config

In [23]:
import yaml

In [24]:
model_config_file = '../../conf/base/parameters/eland.yml'

In [25]:
# Parse the model configuration. A malformed file must stop the notebook here:
# swallowing the error would leave `mode_config` undefined (or stale from an
# earlier run of this cell) and surface later as a less obvious failure.
with open(model_config_file, "r") as stream:
    try:
        mode_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        raise
print(mode_config)

{'eland_data_load_options': {'dataset': 'reddit', 'baseline': 'store_true', 'batch_size': 300}, 'eland_model_options': {'dim_feats': 300, 'cuda': 0, 'hidden_size': 128, 'n_layers': 2, 'epochs': 50, 'batch_size': 300, 'seed': -1, 'lr': 0.0001, 'log': True, 'weight_decay': 1e-06, 'dropout': 0.4, 'tensorboard': False, 'name': 'debug', 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'pretrain_bm': 25, 'pretrain_nc': 200, 'alpha': 0.05, 'bmloss_type': 'mse', 'device': 'cpu', 'base_pred': 400, 'save_directory': 'data/07_model_output/user_behavior'}}


In [26]:
# Create a log directory for this notebook's training session
from pathlib import Path
log_dir = Path('logs/')
log_dir.mkdir(parents=True, exist_ok=True)

In [27]:
eland_config = ElandConfig(mode_config['eland_model_options'])

#### Adjust model directory for notebook 

In [28]:
eland_config.save_directory

'data/07_model_output/user_behavior'

In [29]:
eland_config.save_directory = '../../data/07_model_output/user_behavior/'
eland_config.epochs = 10 # reduce to 10 epochs in notebooks for demonstration 

In [30]:
# exist_ok=True makes this cell idempotent and avoids the check-then-create race.
os.makedirs(eland_config.save_directory, exist_ok=True)

### 3. Model training

In [31]:
# Build the end-to-end ELAND trainer from the prepared inputs and run training.
# Arguments are passed positionally, so the order must match Eland_e2e's signature.
model_obj = Eland_e2e(
    data_dict['graph'], 
    data_dict['lstm_dataloader'], 
    data_dict['user_features'],
    data_dict['item_features'], 
    data_dict['labels'], 
    data_dict['tvt_nids'], 
    data_dict['u2index'],
    data_dict['p2index'], 
    # NOTE(review): item_features is passed a second time here (it is also the
    # 4th argument) — confirm against Eland_e2e's signature that this is intended.
    data_dict['item_features'], 
    eland_config
)
# train() returns two values: the per-epoch result dict and, presumably, the
# path of the saved model (TODO confirm against the trainer implementation).
training_result,save_model_path = model_obj.train()

2022-08-12 22:26:32,177 - Parameters: {'dim_feats': 300, 'hidden_size': 128, 'n_layers': 2, 'lr': 0.0001, 'weight_decay': 1e-06, 'dropout': 0.4, 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'bmloss_type': 'mse'}
2022-08-12 22:26:38,437 - BM Module pretrain, Epoch 1/25: loss 104.97600428
2022-08-12 22:26:43,544 - BM Module pretrain, Epoch 2/25: loss 99.27821954
2022-08-12 22:26:48,775 - BM Module pretrain, Epoch 3/25: loss 91.91539001
2022-08-12 22:26:53,843 - BM Module pretrain, Epoch 4/25: loss 81.15819422
2022-08-12 22:26:58,970 - BM Module pretrain, Epoch 5/25: loss 67.96443526
2022-08-12 22:27:04,149 - BM Module pretrain, Epoch 6/25: loss 56.07752895
2022-08-12 22:27:09,218 - BM Module pretrain, Epoch 7/25: loss 47.38684654
2022-08-12 22:27:14,355 - BM Module pretrain, Epoch 8/25: loss 40.11199252
2022-08-12 22:27:19,463 - BM Module pretrain, Epoch 9/25: loss 33.8731308
2022-08-12 22:27:24,491 - BM Module pretrain, Epoch 10/25: loss 28.78942426
2022-08-12 22:27:29,604 - BM Module pr

In [32]:
training_result

{1: {'train_auc': 0.9702222222222223, 'val_auc': 0.983224603914259},
 2: {'train_auc': 0.9674444444444443, 'val_auc': 0.9853991922957441},
 3: {'train_auc': 0.9654444444444445, 'val_auc': 0.9841565703634669},
 4: {'train_auc': 0.9637777777777778, 'val_auc': 0.9855027441234339},
 5: {'train_auc': 0.9658888888888889, 'val_auc': 0.9869524697110903},
 6: {'train_auc': 0.9634444444444444, 'val_auc': 0.9872631251941596},
 7: {'train_auc': 0.9686666666666668, 'val_auc': 0.9857098477788133},
 8: {'train_auc': 0.9657777777777777, 'val_auc': 0.9843636740188465},
 9: {'train_auc': 0.975111111111111, 'val_auc': 0.9874702288495392},
 10: {'train_auc': 0.9673333333333334, 'val_auc': 0.9822926374650512}}

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.

Tong Zhao, Bo Ni, Wenhao Yu, Zhichun Guo, Neil Shah, and Meng Jiang. 2021. Action Sequence Augmentation for Early Graph-based Anomaly Detection.