Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# This notebook walks through ELAND model training and shows the training results

In [1]:
import os
import pickle
import sys
import json
import math
import logging
import pickle as pk
from collections import Counter
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix, coo_matrix
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import MSELoss, CosineEmbeddingLoss
from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score, roc_auc_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


## loading data 

### user label

In [2]:
# Load the per-user labels (columns: author, label).
# The CSV also carries a saved index column, which pandas reads back as "Unnamed: 0".
user_label = pd.read_csv("../../data/02_intermediate/user_behavior/user_label.csv")

In [3]:
user_label.head(10)

Unnamed: 0,author,label
0,ultimatt42,0
1,jonknee,0
2,dons,0
3,Jedravent,0
4,burtonmkz,0
5,pavel_lishin,0
6,sblinn,0
7,WebZen,0
8,doodahdei,0
9,Tack122,0


## user and subreddit topic index

In [4]:
# Load the author-name -> integer-index mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/02_intermediate/user_behavior/u2index.pkl","rb") as f:
    u2index = pickle.load(f)

In [5]:
u2index

{'0_o': 0,
 '138': 1,
 '13ren': 2,
 '1812overture': 3,
 '1esproc': 4,
 '315was_an_inside_job': 5,
 '43P04T34': 6,
 '7oby': 7,
 'AAjax': 8,
 'ABabyAteMyDingo': 9,
 'ANSICL': 10,
 'AbouBenAdhem': 11,
 'Aerik': 12,
 'Ajenthavoc': 13,
 'AliasHandler': 14,
 'AmericanGoyBlog': 15,
 'AngelaMotorman': 16,
 'AngledLuffa': 17,
 'Anonymous7777': 18,
 'AnteChronos': 19,
 'ApostrophePosse': 20,
 'ArcticCelt': 21,
 'Bagel': 22,
 'Battleloser': 23,
 'BedtimeForSheeple': 24,
 'BeetleB': 25,
 'Benny_Lava': 26,
 'Bensch': 27,
 'Bixie': 28,
 'Bloodlustt': 29,
 'Bloody_Eye': 30,
 'BlueBeard': 31,
 'BobGaffney': 32,
 'BraveSirRobin': 33,
 'BrianBoyko': 34,
 'Browzer': 35,
 'Burlapin': 36,
 'Busybyeski': 37,
 'CampusTour': 38,
 'CannedMango': 39,
 'Captain-Obliviouss': 40,
 'Chirp08': 41,
 'ChunkyLaFunga': 42,
 'Ciserus': 43,
 'Clothos': 44,
 'CodeMonkey1': 45,
 'Codebender': 46,
 'ColdSnickersBar': 47,
 'Cookie': 48,
 'CrackIsGoodForYou': 49,
 'CrimsonSun99': 50,
 'D-Style': 51,
 'DCGaymer': 52,
 'DOGA': 5

In [6]:
# Load the subreddit-name -> integer-index mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/02_intermediate/user_behavior/p2index.pkl","rb") as f:
    p2index = pickle.load(f)

In [7]:
p2index

{'AskReddit': 0,
 'Drugs': 1,
 'Economics': 2,
 'Music': 3,
 'WTF': 4,
 'apple': 5,
 'area51': 6,
 'atheism': 7,
 'bestof': 8,
 'business': 9,
 'canada': 10,
 'cogsci': 11,
 'comics': 12,
 'entertainment': 13,
 'environment': 14,
 'funny': 15,
 'gadgets': 16,
 'gaming': 17,
 'geek': 18,
 'happy': 19,
 'lgbt': 20,
 'linux': 21,
 'lolcats': 22,
 'math': 23,
 'netsec': 24,
 'nsfw': 25,
 'obama': 26,
 'offbeat': 27,
 'philosophy': 28,
 'photography': 29,
 'pics': 30,
 'politics': 31,
 'programming': 32,
 'psychology': 33,
 'reddit.com': 34,
 'science': 35,
 'scifi': 36,
 'self': 37,
 'sex': 38,
 'software': 39,
 'sports': 40,
 'technology': 41,
 'videos': 42,
 'web_design': 43,
 'worldnews': 44,
 'xkcd': 45,
 'yourweek': 46}

## edge list data 

In [8]:
# Load the interaction edge list (columns: author, subreddit, retrieved_on).
# The CSV also carries a saved index column, which pandas reads back as "Unnamed: 0".
edgelist_df = pd.read_csv("../../data/02_intermediate/user_behavior/edge_list.csv")

In [9]:
edgelist_df.head(10)

Unnamed: 0,author,subreddit,retrieved_on
0,ultimatt42,science,1425846806
1,jonknee,programming,1425846807
2,burtonmkz,science,1425846810
3,pavel_lishin,reddit.com,1425846810
4,pavel_lishin,reddit.com,1425846810
5,sblinn,politics,1425846810
6,dons,programming,1425846811
7,Jedravent,politics,1425846811
8,WebZen,politics,1425846811
9,doodahdei,politics,1425846812


In [10]:
from scipy.sparse import csr_matrix, coo_matrix
def process_edgelist(edge_list, u2index, p2index):
    """Build a weighted user-by-subreddit interaction matrix from an edge list.

    Args:
        edge_list: DataFrame with 'author', 'subreddit' and 'retrieved_on'
            columns, one row per interaction.
        u2index: mapping from author name to row index.
        p2index: mapping from subreddit name to column index.

    Returns:
        A scipy.sparse.csr_matrix of shape (len(u2index), len(p2index)) whose
        entry (i, j) is the number of edge-list rows linking user i to
        subreddit j.

    Raises:
        KeyError: if an author or subreddit in the edge list is missing from
            the corresponding index mapping.
    """
    if len(edge_list) > 0:
        # Preview the first record. It is selected by position, so the preview
        # does not depend on the DataFrame's index labels.
        first = edge_list.iloc[0]
        print(first['author'], first['subreddit'], first['retrieved_on'])
        # Column-wise lookups avoid the per-row Series that iterrows() builds.
        user_idx = [u2index[u] for u in edge_list['author']]
        item_idx = [p2index[p] for p in edge_list['subreddit']]
    else:
        user_idx, item_idx = [], []

    # Count repeated (user, subreddit) pairs; the count becomes the edge weight.
    edges = Counter(zip(user_idx, item_idx))

    # Construct the graph
    row = [i for i, _ in edges]
    col = [j for _, j in edges]
    entry = list(edges.values())
    graph = csr_matrix(
        (entry, (row, col)),
        shape=(len(u2index), len(p2index))
    )
    return graph

In [11]:
graph = process_edgelist(edgelist_df, u2index, p2index)

ultimatt42 science 1425846806


In [12]:
type(graph)

scipy.sparse.csr.csr_matrix

## train/validation/test id split

In [13]:
# Load the pre-computed split; it is unpacked in the next cell into
# train / validation / test index arrays.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/02_intermediate/user_behavior/data_tvt.pkl","rb") as f:
    tvt_idx = pickle.load(f)

In [14]:
idx_train, idx_val, idx_test = tvt_idx

In [15]:
idx_train.shape, idx_val.shape, idx_test.shape

((195,), (198,), (393,))

### convert label format (to numpy array)

In [16]:
def process_label(labels: pd.DataFrame, user_index=None) -> np.ndarray:
    """Convert the label DataFrame into a 0/1 integer array aligned with the user index.

    Args:
        labels: DataFrame with an 'author' column and a 'label' column.
        user_index: optional mapping from author name to array position.
            When omitted, the notebook-level ``u2index`` mapping is used,
            which matches the original behaviour.

    Returns:
        Integer array of length ``len(user_index)``; position ``user_index[a]``
        is 1 when author ``a`` has label 1 in ``labels`` and 0 otherwise.
    """
    if user_index is None:
        # Fall back to the mapping loaded earlier in the notebook.
        user_index = u2index

    authors = labels['author']
    flags = labels['label']
    is_pos = flags == 1
    # Only rows labelled exactly 0 or 1 count as "labeled".
    is_labeled = is_pos | (flags == 0)
    pos_uids = set(authors[is_pos])
    labeled_uids = set(authors[is_labeled])
    print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')

    label_array = np.zeros(len(user_index), dtype=int)
    for user, position in user_index.items():
        if user in pos_uids:
            label_array[position] = 1
    return label_array

In [17]:
labels = process_label(user_label)

loaded labels, total of 327 positive users and 787 labeled users


In [18]:
# Report the size and the positive / negative user counts of each split.
split_reports = (
    ('Train: total of {:5} users with {:5} pos users and {:5} neg users', idx_train),
    ('Val:   total of {:5} users with {:5} pos users and {:5} neg users', idx_val),
    ('Test:  total of {:5} users with {:5} pos users and {:5} neg users', idx_test),
)
for template, split_idx in split_reports:
    n_users = len(split_idx)
    n_pos = np.sum(labels[split_idx])
    print(template.format(n_users, n_pos, n_users - n_pos))

Train: total of   195 users with    79 pos users and   116 neg users
Val:   total of   198 users with    88 pos users and   110 neg users
Test:  total of   393 users with   159 pos users and   234 neg users


In [19]:
user_features = np.load("../../data/02_intermediate/user_behavior/user2vec_npy.npz")

In [20]:
user_features['data'].shape #787 users

(787, 300)

In [21]:
item_features = np.load("../../data/02_intermediate/user_behavior/prod2vec_npy.npz")

In [22]:
item_features['data'].shape #47 topics

(47, 300)

## setting up the model trainer 

In [23]:
# Make the project's `src` directory importable. It is resolved relative to the
# notebook's working directory (the same ../../ root used for the data and conf
# paths in this notebook) instead of a machine-specific absolute path.
SRC_DIR = os.path.abspath(os.path.join('..', '..', 'src'))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [24]:
from anomaly_detection_spatial_temporal_data.model.data_loader import DynamicGraphWNFDataSet, DynamicGraphWNodeFeatDatasetLoader
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Eland_e2e
from anomaly_detection_spatial_temporal_data.model.model_config import ElandConfig

### set up dataloader

In [25]:
# Build the project data loader from the label frame, the two index maps,
# the edge list, the train/val/test split and the user / item feature matrices.
data_loader = DynamicGraphWNodeFeatDatasetLoader(
    user_label, 
    u2index, 
    p2index, 
    edgelist_df, 
    tvt_idx, 
    user_features['data'], 
    item_features['data']
)

# Sequential data loader built from the edge list and the item features.
# NOTE(review): batch_size=300 is hard-coded here; the YAML config loaded later in
# this notebook also lists batch_size: 300 -- confirm the two are meant to stay in sync.
dataset = DynamicGraphWNFDataSet(p2index, item_features['data'], edgelist_df)
lstm_dataloader = DataLoader(dataset, batch_size=300)
    

loaded labels, total of 327 positive users and 787 labeled users
Train: total of   195 users with    79 pos users and   116 neg users
Val:   total of   198 users with    88 pos users and   110 neg users
Test:  total of   393 users with   159 pos users and   234 neg users


In [26]:
# Bundle everything the trainer consumes into a single dictionary, keyed by role.
data_dict = dict(
    graph=data_loader.graph,
    lstm_dataloader=lstm_dataloader,
    user_features=data_loader.user_features,
    item_features=data_loader.item_features,
    labels=data_loader.labels,
    tvt_nids=data_loader.tvt_idx,
    u2index=data_loader.u2index,
    p2index=data_loader.p2index,
)


### load model config

In [27]:
import yaml

In [28]:
model_config_file = '../../conf/base/parameters/eland.yml'

In [29]:
# Read the model options from the YAML config file.
# The variable keeps the name `mode_config` because later cells reference it.
with open(model_config_file, "r") as stream:
    try:
        mode_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parse error, then re-raise: continuing would leave
        # `mode_config` undefined and fail later with a less helpful NameError.
        print(exc)
        raise
print(mode_config)

{'eland_data_load_options': {'dataset': 'reddit', 'baseline': 'store_true', 'batch_size': 300}, 'eland_model_options': {'dim_feats': 300, 'cuda': 0, 'hidden_size': 128, 'n_layers': 2, 'epochs': 50, 'batch_size': 300, 'seed': -1, 'lr': 0.0001, 'log': True, 'weight_decay': 1e-06, 'dropout': 0.4, 'tensorboard': False, 'name': 'debug', 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'pretrain_bm': 25, 'pretrain_nc': 200, 'alpha': 0.05, 'bmloss_type': 'mse', 'device': 'cpu', 'base_pred': 400, 'save_directory': 'data/07_model_output/user_behavior'}}


In [30]:
# Create a log directory, relative to the current working directory, for this notebook training session.
from pathlib import Path
log_dir = Path('logs/')
# parents/exist_ok: create missing parent directories and do not fail if it already exists.
log_dir.mkdir(parents=True, exist_ok=True)

In [31]:
eland_config = ElandConfig(mode_config['eland_model_options'])

#### adjust model directory for notebook 

In [32]:
eland_config.save_directory

'data/07_model_output/user_behavior'

In [33]:
eland_config.save_directory = '../../data/07_model_output/user_behavior/'

In [34]:
eland_config.save_directory

'../../data/07_model_output/user_behavior/'

## kick off model training

In [35]:
# Instantiate the end-to-end ELAND model from the bundled inputs and the config.
# NOTE(review): data_dict['item_features'] is passed twice (4th and 9th positional
# arguments) -- confirm against the Eland_e2e signature that this is intended.
model_obj = Eland_e2e(
    data_dict['graph'], 
    data_dict['lstm_dataloader'], 
    data_dict['user_features'],
    data_dict['item_features'], 
    data_dict['labels'], 
    data_dict['tvt_nids'], 
    data_dict['u2index'],
    data_dict['p2index'], 
    data_dict['item_features'], 
    eland_config
)
# train() returns two values: the training history (a per-epoch dict of
# train/val AUC, displayed in the next cell) and what is presumably the path
# of the saved model -- verify against Eland_e2e.train.
training_result,save_model_path = model_obj.train()

2022-07-29 05:31:09,794 - Parameters: {'dim_feats': 300, 'hidden_size': 128, 'n_layers': 2, 'lr': 0.0001, 'weight_decay': 1e-06, 'dropout': 0.4, 'gnnlayer_type': 'gcn', 'rnn_type': 'lstm', 'bmloss_type': 'mse'}
2022-07-29 05:31:14,642 - BM Module pretrain, Epoch 1/25: loss 104.74526374
2022-07-29 05:31:18,809 - BM Module pretrain, Epoch 2/25: loss 98.92015425
2022-07-29 05:31:22,629 - BM Module pretrain, Epoch 3/25: loss 91.10625966
2022-07-29 05:31:26,521 - BM Module pretrain, Epoch 4/25: loss 79.72125181
2022-07-29 05:31:30,115 - BM Module pretrain, Epoch 5/25: loss 65.84392325
2022-07-29 05:31:33,951 - BM Module pretrain, Epoch 6/25: loss 53.09317303
2022-07-29 05:31:37,540 - BM Module pretrain, Epoch 7/25: loss 43.98713462
2022-07-29 05:31:41,065 - BM Module pretrain, Epoch 8/25: loss 37.43398778
2022-07-29 05:31:44,768 - BM Module pretrain, Epoch 9/25: loss 32.04992612
2022-07-29 05:31:48,674 - BM Module pretrain, Epoch 10/25: loss 27.67630625
2022-07-29 05:31:52,118 - BM Module p

In [36]:
training_result

{1: {'train_auc': 0.9658446093408992, 'val_auc': 0.9755165289256199},
 2: {'train_auc': 0.9659537319947622, 'val_auc': 0.9740702479338843},
 3: {'train_auc': 0.9692274116106504, 'val_auc': 0.9774793388429752},
 4: {'train_auc': 0.9632256656481886, 'val_auc': 0.9735537190082645},
 5: {'train_auc': 0.9673723264949804, 'val_auc': 0.9759297520661157},
 6: {'train_auc': 0.9714098646879092, 'val_auc': 0.9755165289256199},
 7: {'train_auc': 0.9743561763422086, 'val_auc': 0.9755165289256198},
 8: {'train_auc': 0.9792666957660412, 'val_auc': 0.9756198347107438},
 9: {'train_auc': 0.9668267132256656, 'val_auc': 0.9729338842975207},
 10: {'train_auc': 0.9673723264949804, 'val_auc': 0.9747933884297522},
 11: {'train_auc': 0.9630074203404626, 'val_auc': 0.9770661157024795},
 12: {'train_auc': 0.9620253164556961, 'val_auc': 0.9727272727272727},
 13: {'train_auc': 0.9689000436490616, 'val_auc': 0.9753099173553719},
 14: {'train_auc': 0.9636621562636404, 'val_auc': 0.975},
 15: {'train_auc': 0.9685726

## read in kedro pipeline training history

In [37]:
# Load the training history saved by the kedro pipeline run (epoch -> train/val AUC).
# NOTE(review): every val_auc shown in the saved output below is exactly 1.0, unlike the
# in-notebook run above -- worth verifying the validation split used by the pipeline.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/07_model_output/user_behavior/train_result.pkl","rb") as f:
    train_result_loaded = pickle.load(f)

In [38]:
train_result_loaded

{1: {'train_auc': 0.9604743083003953, 'val_auc': 1.0},
 2: {'train_auc': 0.9727849143610012, 'val_auc': 1.0},
 3: {'train_auc': 0.9725790513833992, 'val_auc': 1.0},
 4: {'train_auc': 0.9724967061923584, 'val_auc': 1.0},
 5: {'train_auc': 0.9749258893280633, 'val_auc': 1.0},
 6: {'train_auc': 0.9697381422924901, 'val_auc': 1.0},
 7: {'train_auc': 0.9713850461133071, 'val_auc': 1.0},
 8: {'train_auc': 0.9783432147562583, 'val_auc': 1.0},
 9: {'train_auc': 0.9783843873517788, 'val_auc': 1.0},
 10: {'train_auc': 0.9778491436100131, 'val_auc': 1.0},
 11: {'train_auc': 0.9665266798418972, 'val_auc': 1.0},
 12: {'train_auc': 0.9749258893280633, 'val_auc': 1.0},
 13: {'train_auc': 0.9724143610013175, 'val_auc': 1.0},
 14: {'train_auc': 0.9739789196310936, 'val_auc': 1.0},
 15: {'train_auc': 0.9762434123847168, 'val_auc': 1.0},
 16: {'train_auc': 0.9762845849802372, 'val_auc': 1.0},
 17: {'train_auc': 0.9747200263504611, 'val_auc': 1.0},
 18: {'train_auc': 0.973731884057971, 'val_auc': 1.0},
 1

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.

Tong Zhao, Bo Ni, Wenhao Yu, Zhichun Guo, Neil Shah, and Meng Jiang, 2021. Action Sequence Augmentation for Early Graph-based Anomaly Detection.