# This notebook shows ELAND model training results

In [84]:
import os
import pickle
import sys
import json
import math
import logging
import pickle as pk
from collections import Counter
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix, coo_matrix
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import MSELoss, CosineEmbeddingLoss
from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score, roc_auc_score, f1_score

## loading data 

### user label

In [85]:
# Per-user labels. The CSV carries a saved index column, which pandas surfaces as
# `Unnamed: 0` next to the `author` and `label` columns.
user_label = pd.read_csv("../../data/02_intermediate/user_behavior/user_label.csv")

In [86]:
# Preview the first 10 labelled authors.
user_label.head(10)

Unnamed: 0,author,label
0,ultimatt42,0
1,jonknee,0
2,dons,0
3,Jedravent,0
4,burtonmkz,0
5,pavel_lishin,0
6,sblinn,0
7,WebZen,0
8,doodahdei,0
9,Tack122,0


## user and subreddit (topic) index mappings

In [87]:
# Author-name -> integer-index mapping.
# NOTE: unpickling can execute arbitrary code; only load files produced by this
# project's own pipeline.
with open("../../data/02_intermediate/user_behavior/u2index.pkl", "rb") as index_file:
    u2index = pickle.loads(index_file.read())

In [88]:
# Show the mapping from author name to integer index (the whole dict is displayed).
u2index

{'0_o': 0,
 '138': 1,
 '13ren': 2,
 '1812overture': 3,
 '1esproc': 4,
 '315was_an_inside_job': 5,
 '43P04T34': 6,
 '7oby': 7,
 'AAjax': 8,
 'ABabyAteMyDingo': 9,
 'ANSICL': 10,
 'AbouBenAdhem': 11,
 'Aerik': 12,
 'Ajenthavoc': 13,
 'AliasHandler': 14,
 'AmericanGoyBlog': 15,
 'AngelaMotorman': 16,
 'AngledLuffa': 17,
 'Anonymous7777': 18,
 'AnteChronos': 19,
 'ApostrophePosse': 20,
 'ArcticCelt': 21,
 'Bagel': 22,
 'Battleloser': 23,
 'BedtimeForSheeple': 24,
 'BeetleB': 25,
 'Benny_Lava': 26,
 'Bensch': 27,
 'Bixie': 28,
 'Bloodlustt': 29,
 'Bloody_Eye': 30,
 'BlueBeard': 31,
 'BobGaffney': 32,
 'BraveSirRobin': 33,
 'BrianBoyko': 34,
 'Browzer': 35,
 'Burlapin': 36,
 'Busybyeski': 37,
 'CampusTour': 38,
 'CannedMango': 39,
 'Captain-Obliviouss': 40,
 'Chirp08': 41,
 'ChunkyLaFunga': 42,
 'Ciserus': 43,
 'Clothos': 44,
 'CodeMonkey1': 45,
 'Codebender': 46,
 'ColdSnickersBar': 47,
 'Cookie': 48,
 'CrackIsGoodForYou': 49,
 'CrimsonSun99': 50,
 'D-Style': 51,
 'DCGaymer': 52,
 'DOGA': 5

In [89]:
# Subreddit-name -> integer-index mapping.
# NOTE: unpickling can execute arbitrary code; only load files produced by this
# project's own pipeline.
with open("../../data/02_intermediate/user_behavior/p2index.pkl", "rb") as index_file:
    p2index = pickle.loads(index_file.read())

In [90]:
# Show the mapping from subreddit name to integer index (47 entries, 0..46).
p2index

{'AskReddit': 0,
 'Drugs': 1,
 'Economics': 2,
 'Music': 3,
 'WTF': 4,
 'apple': 5,
 'area51': 6,
 'atheism': 7,
 'bestof': 8,
 'business': 9,
 'canada': 10,
 'cogsci': 11,
 'comics': 12,
 'entertainment': 13,
 'environment': 14,
 'funny': 15,
 'gadgets': 16,
 'gaming': 17,
 'geek': 18,
 'happy': 19,
 'lgbt': 20,
 'linux': 21,
 'lolcats': 22,
 'math': 23,
 'netsec': 24,
 'nsfw': 25,
 'obama': 26,
 'offbeat': 27,
 'philosophy': 28,
 'photography': 29,
 'pics': 30,
 'politics': 31,
 'programming': 32,
 'psychology': 33,
 'reddit.com': 34,
 'science': 35,
 'scifi': 36,
 'self': 37,
 'sex': 38,
 'software': 39,
 'sports': 40,
 'technology': 41,
 'videos': 42,
 'web_design': 43,
 'worldnews': 44,
 'xkcd': 45,
 'yourweek': 46}

## edge list data 

In [91]:
# Interaction records with `author`, `subreddit` and `retrieved_on` columns; the saved
# index column shows up as `Unnamed: 0`.
edgelist_df = pd.read_csv("../../data/02_intermediate/user_behavior/edge_list.csv")

In [92]:
# Preview the first 10 interactions (`retrieved_on` looks like a Unix timestamp; TODO confirm).
edgelist_df.head(10)

Unnamed: 0,author,subreddit,retrieved_on
0,ultimatt42,science,1425846806
1,jonknee,programming,1425846807
2,burtonmkz,science,1425846810
3,pavel_lishin,reddit.com,1425846810
4,pavel_lishin,reddit.com,1425846810
5,sblinn,politics,1425846810
6,dons,programming,1425846811
7,Jedravent,politics,1425846811
8,WebZen,politics,1425846811
9,doodahdei,politics,1425846812


In [93]:
from scipy.sparse import csr_matrix, coo_matrix
def process_edgelist(edge_list, u2index, p2index):
    """Build the user x subreddit interaction-count matrix from an edge list.

    Args:
        edge_list: DataFrame with ``author``, ``subreddit`` and ``retrieved_on``
            columns, one row per interaction.
        u2index: mapping from author name to row index of the matrix.
        p2index: mapping from subreddit name to column index of the matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(len(u2index), len(p2index))``
        whose entry ``(i, j)`` is the number of edge-list rows linking user ``i``
        to subreddit ``j``.

    Raises:
        KeyError: if a row names an author or subreddit missing from the mappings,
            or if a required column is absent.
    """
    # Echo the first record as a quick sanity check. It is selected by position, so
    # it is printed exactly once regardless of how the frame is indexed.
    if len(edge_list) > 0:
        first = edge_list.iloc[0]
        print(first['author'], first['subreddit'], first['retrieved_on'])

    # Count repeated (user, subreddit) pairs; the count becomes the edge weight.
    # Iterating the two columns directly avoids building a Series per row.
    edge_counts = Counter(
        (u2index[author], p2index[subreddit])
        for author, subreddit in zip(edge_list['author'], edge_list['subreddit'])
    )

    # Unpack the counter into the (data, (row, col)) triplet form csr_matrix expects.
    rows = [user_idx for user_idx, _ in edge_counts]
    cols = [item_idx for _, item_idx in edge_counts]
    weights = list(edge_counts.values())

    return csr_matrix(
        (weights, (rows, cols)),
        shape=(len(u2index), len(p2index))
    )

In [94]:
# Build the sparse user x subreddit interaction-count matrix from the edge list.
graph = process_edgelist(edgelist_df, u2index, p2index)

ultimatt42 science 1425846806


In [95]:
# Confirm the graph is stored as a SciPy CSR sparse matrix.
type(graph)

scipy.sparse.csr.csr_matrix

## train/validation/test id split

In [96]:
# Pre-computed train/validation/test user indices.
# NOTE: unpickling can execute arbitrary code; only load files produced by this
# project's own pipeline.
with open("../../data/02_intermediate/user_behavior/data_tvt.pkl", "rb") as split_file:
    tvt_idx = pickle.loads(split_file.read())

In [97]:
# The pickle unpacks into three index arrays: train, validation and test, in that order.
idx_train, idx_val, idx_test = tvt_idx

In [98]:
# Sizes of the train / validation / test index arrays.
tuple(split.shape for split in (idx_train, idx_val, idx_test))

((314,), (79,), (393,))

### convert label format (to numpy array)

In [99]:
def process_label(labels: pd.DataFrame, user_index=None) -> np.ndarray:
    """Turn the per-user label table into a 0/1 vector aligned with the user index.

    Args:
        labels: DataFrame with an ``author`` column and a ``label`` column; a label
            of 1 marks a positive user and 0 a negative one.
        user_index: optional mapping from author name to position in the output
            vector. When omitted, the notebook-level ``u2index`` mapping is used,
            which is what the original single-argument call relied on.

    Returns:
        Integer array of length ``len(user_index)`` holding 1 at the position of
        every positive author and 0 everywhere else. Authors that appear in
        ``labels`` but not in the index are ignored.
    """
    # Resolve the mapping lazily so the global is only touched when no explicit
    # mapping was supplied.
    index_map = u2index if user_index is None else user_index

    label_col = labels['label']
    pos_uids = set(labels.loc[label_col == 1, 'author'])
    # "Labeled" means the label is either 0 or 1; any other value is not counted.
    labeled_uids = set(labels.loc[(label_col == 1) | (label_col == 0), 'author'])
    print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')

    label_vector = np.zeros(len(index_map), dtype=int)
    for user, position in index_map.items():
        if user in pos_uids:
            label_vector[position] = 1
    return label_vector

In [100]:
# Dense 0/1 label vector whose positions follow the notebook-level `u2index` mapping.
labels = process_label(user_label)

loaded labels, total of 327 positive users and 787 labeled users


In [101]:
# Report size and class balance per split. `labels` holds 1 for positive users, so
# summing it over a split's indices counts that split's positives.
split_reports = (
    ('Train: total of {:5} users with {:5} pos users and {:5} neg users', idx_train),
    ('Val:   total of {:5} users with {:5} pos users and {:5} neg users', idx_val),
    ('Test:  total of {:5} users with {:5} pos users and {:5} neg users', idx_test),
)
for template, split_idx in split_reports:
    n_users = len(split_idx)
    n_pos = np.sum(labels[split_idx])
    print(template.format(n_users, n_pos, n_users - n_pos))

Train: total of   314 users with   131 pos users and   183 neg users
Val:   total of    79 users with    38 pos users and    41 neg users
Test:  total of   393 users with   157 pos users and   236 neg users


In [102]:
# User feature archive; the feature matrix is stored under the 'data' key.
user_features = np.load("../../data/02_intermediate/user_behavior/user2vec_npy.npz")

In [103]:
# Expect one 300-dimensional row per user (787 users).
user_features['data'].shape #787 users

(787, 300)

In [104]:
# Subreddit (item) feature archive; the feature matrix is stored under the 'data' key.
item_features = np.load("../../data/02_intermediate/user_behavior/prod2vec_npy.npz")

In [105]:
# Expect one 300-dimensional row per subreddit topic (47 topics).
item_features['data'].shape #47 topics

(47, 300)

## setting up the model trainer 

In [106]:
# Make the project's `src/` directory importable so that the
# `anomaly_detection_spatial_temporal_data` package can be imported below.
# The notebook reads its data and config from `../../`, so `src/` is expected at
# `../../src` (NOTE(review): confirm the repository layout). The original SageMaker
# location is kept as a fallback so existing environments keep working.
for src_dir in (
    os.path.abspath(os.path.join('..', '..', 'src')),
    '/home/ec2-user/SageMaker/anomaly-detection-spatial-temporal-data/src/',
):
    # Skip missing directories and avoid duplicate entries when the cell is re-run.
    if os.path.isdir(src_dir) and src_dir not in sys.path:
        sys.path.append(src_dir)

In [107]:
from anomaly_detection_spatial_temporal_data.model.data_loader import DynamicGraphWNFDataSet, DynamicGraphWNodeFeatDatasetLoader
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Eland_e2e
from anomaly_detection_spatial_temporal_data.model.model_config import ElandConfig


### set up dataloader

In [122]:
# Bundle the label table, index mappings, edge list, split indices and feature
# matrices into the project's dataset loader. Constructing it prints the label and
# split summaries shown below.
# NOTE(review): presumably it repeats the label/graph processing done by hand above;
# confirm in data_loader.py.
data_loader = DynamicGraphWNodeFeatDatasetLoader(
    user_label, 
    u2index, 
    p2index, 
    edgelist_df, 
    tvt_idx, 
    user_features['data'], 
    item_features['data']
)

# Sequence dataset built from the subreddit index, item features and edge list,
# served in batches of 300.
# NOTE(review): 300 equals `batch_size` in eland.yml but is hard-coded here rather
# than read from the config.
dataset = DynamicGraphWNFDataSet(p2index, item_features['data'], edgelist_df)
lstm_dataloader = DataLoader(dataset, batch_size=300)
    

loaded labels, total of 327 positive users and 787 labeled users
Train: total of   314 users with   131 pos users and   183 neg users
Val:   total of    79 users with    38 pos users and    41 neg users
Test:  total of   393 users with   157 pos users and   236 neg users


In [123]:
# Collect every model input under a descriptive key; all values except the LSTM
# data loader come from the project's dataset loader.
data_dict = dict(
    graph=data_loader.graph,
    lstm_dataloader=lstm_dataloader,
    user_features=data_loader.user_features,
    item_features=data_loader.item_features,
    labels=data_loader.labels,
    tvt_nids=data_loader.tvt_idx,
    u2index=data_loader.u2index,
    p2index=data_loader.p2index,
)


### load model config

In [124]:
import yaml

In [125]:
# Path to the ELAND parameter file, relative to this notebook's directory.
model_config_file = '../../conf/base/parameters/eland.yml'

In [126]:
# Parse the ELAND parameter file. A malformed file is reported and then re-raised:
# swallowing the error would leave `mode_config` undefined and only defer the failure
# to the cell that builds the config object.
with open(model_config_file, "r") as stream:
    try:
        mode_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        raise
print(mode_config)

{'eland_data_load_options': {'dataset': 'reddit', 'baseline': 'store_true'}, 'eland_model_options': {'dim_feats': 300, 'cuda': 0, 'hidden_size': 128, 'n_layers': 2, 'epochs': 50, 'batch_size': 300, 'seed': -1, 'lr': 0.0001, 'log': True, 'weight_decay': 1e-06, 'dropout': 0.4, 'tensorboard': False, 'name': 'debug', 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'pretrain_bm': 25, 'pretrain_nc': 200, 'alpha': 0.05, 'bmloss_type': 'mse', 'device': 'cpu', 'base_pred': 400, 'save_directory': 'data/07_model_output/user_behavior'}}


In [127]:
# Make sure a local `logs/` directory exists for this notebook training session
# (no-op when it is already there).
# NOTE(review): presumably the trainer writes its log files here; confirm.
from pathlib import Path
log_dir = Path('logs/')
os.makedirs(log_dir, exist_ok=True)

In [130]:
# Wrap the `eland_model_options` section of the YAML in the project's config object.
eland_config = ElandConfig(mode_config['eland_model_options'])

#### adjust model directory for notebook 

In [131]:
# Save directory as read from the YAML (a path without the `../../` prefix).
eland_config.save_directory

'data/07_model_output/user_behavior'

In [133]:
# Re-point the save directory with the same `../../` prefix this notebook uses for its
# data and config paths, since the YAML value has no such prefix.
eland_config.save_directory = '../../data/07_model_output/user_behavior/'

In [134]:
# Confirm the override took effect.
eland_config.save_directory

'../../data/07_model_output/user_behavior/'

## kick off model training

In [135]:
# Build the end-to-end ELAND trainer from the prepared inputs and the config object.
# NOTE(review): `data_dict['item_features']` is passed twice (4th and 9th positional
# arguments); confirm against the Eland_e2e signature that the 9th argument is meant
# to receive the item feature matrix.
model_obj = Eland_e2e(
    data_dict['graph'], 
    data_dict['lstm_dataloader'], 
    data_dict['user_features'],
    data_dict['item_features'], 
    data_dict['labels'], 
    data_dict['tvt_nids'], 
    data_dict['u2index'],
    data_dict['p2index'], 
    data_dict['item_features'], 
    eland_config
)
# train() returns two values; the first is shown below as a per-epoch dict of
# train/validation AUC.
# NOTE(review): the second is presumably the path of the saved model; confirm.
training_result,save_model_path = model_obj.train()

2022-07-22 19:53:31,316 - Parameters: {'dim_feats': 300, 'hidden_size': 128, 'n_layers': 2, 'lr': 0.0001, 'weight_decay': 1e-06, 'dropout': 0.4, 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'bmloss_type': 'mse'}
2022-07-22 19:53:36,365 - BM Module pretrain, Epoch 1/25: loss 105.70482286
2022-07-22 19:53:40,729 - BM Module pretrain, Epoch 2/25: loss 99.68473085
2022-07-22 19:53:45,593 - BM Module pretrain, Epoch 3/25: loss 91.29374441
2022-07-22 19:53:49,726 - BM Module pretrain, Epoch 4/25: loss 79.55009143
2022-07-22 19:53:53,783 - BM Module pretrain, Epoch 5/25: loss 65.8869578
2022-07-22 19:53:58,441 - BM Module pretrain, Epoch 6/25: loss 53.72576078
2022-07-22 19:54:02,598 - BM Module pretrain, Epoch 7/25: loss 45.002189
2022-07-22 19:54:08,097 - BM Module pretrain, Epoch 8/25: loss 38.43296417
2022-07-22 19:54:12,472 - BM Module pretrain, Epoch 9/25: loss 32.70000505
2022-07-22 19:54:16,636 - BM Module pretrain, Epoch 10/25: loss 27.95659669
2022-07-22 19:54:21,846 - BM Module pret

In [136]:
# Per-epoch train/validation AUC from the notebook training run above.
training_result

{1: {'train_auc': 0.9672131147540983, 'val_auc': 0.98395378690629},
 2: {'train_auc': 0.9746798481625162, 'val_auc': 0.9845956354300385},
 3: {'train_auc': 0.9728861636007176, 'val_auc': 0.9845956354300385},
 4: {'train_auc': 0.9773078046135234, 'val_auc': 0.98395378690629},
 5: {'train_auc': 0.9788094940140991, 'val_auc': 0.9878048780487805},
 6: {'train_auc': 0.9816877320318691, 'val_auc': 0.9845956354300385},
 7: {'train_auc': 0.9796437659033079, 'val_auc': 0.9858793324775352},
 8: {'train_auc': 0.9742209986234514, 'val_auc': 0.9845956354300385},
 9: {'train_auc': 0.97601468318525, 'val_auc': 0.9852374839537869},
 10: {'train_auc': 0.9764735327243148, 'val_auc': 0.98395378690629},
 11: {'train_auc': 0.9754306928628039, 'val_auc': 0.9852374839537869},
 12: {'train_auc': 0.9762649647520126, 'val_auc': 0.9845956354300385},
 13: {'train_auc': 0.9782255036916531, 'val_auc': 0.9845956354300385},
 14: {'train_auc': 0.97985233387561, 'val_auc': 0.9858793324775352},
 15: {'train_auc': 0.9766

## read in kedro pipeline training history

In [137]:
# Training history stored as a pickle in the model output directory.
# NOTE: unpickling can execute arbitrary code; only load files produced by this
# project's own pipeline.
with open("../../data/07_model_output/user_behavior/train_result.pkl", "rb") as history_file:
    train_result_loaded = pickle.loads(history_file.read())

In [138]:
# Per-epoch train/validation AUC loaded from the saved training history.
train_result_loaded

{1: {'train_auc': 0.9734284403287031, 'val_auc': 0.98395378690629},
 2: {'train_auc': 0.9772243774246026, 'val_auc': 0.9826700898587932},
 3: {'train_auc': 0.9738455762733075, 'val_auc': 0.9826700898587932},
 4: {'train_auc': 0.9714261877946022, 'val_auc': 0.98395378690629},
 5: {'train_auc': 0.9789346347974806, 'val_auc': 0.9813863928112965},
 6: {'train_auc': 0.9754724064572644, 'val_auc': 0.98395378690629},
 7: {'train_auc': 0.9765152463187752, 'val_auc': 0.98395378690629},
 8: {'train_auc': 0.9707587702832354, 'val_auc': 0.98395378690629},
 9: {'train_auc': 0.9767238142910775, 'val_auc': 0.9826700898587933},
 10: {'train_auc': 0.976682100696617, 'val_auc': 0.9826700898587932},
 11: {'train_auc': 0.9733032995453218, 'val_auc': 0.9813863928112965},
 12: {'train_auc': 0.977057523046761, 'val_auc': 0.98395378690629},
 13: {'train_auc': 0.9786843532307179, 'val_auc': 0.9833119383825417},
 14: {'train_auc': 0.9766403871021566, 'val_auc': 0.982028241335045},
 15: {'train_auc': 0.973720435

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.

Tong Zhao, Bo Ni, Wenhao Yu, Zhichun Guo, Neil Shah, and Meng Jiang. 2021. Action Sequence Augmentation for Early Graph-based Anomaly Detection.