In [1]:
import pandas as pd
import numpy as np
import pickle as pk
import os

In [5]:
!pip install torch==1.6.0

Collecting torch==1.6.0
  Downloading torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8 MB)
     |████████████████████████████████| 748.8 MB 4.3 kB/s             
Installing collected packages: torch
Successfully installed torch-1.6.0


In [6]:
import sys

# Put the local ELAND checkout on the import path; the project imports in the
# next cell (`model.*`, `Datasets`) are presumably resolved from there.
# Guarded so that re-running this cell does not pile up duplicate entries.
if "../../Eland/" not in sys.path:
    sys.path.append("../../Eland/")

In [7]:

from model.Eland_e2e import Eland_e2e
from model.Eland_e2e_unsup import Eland_e2e_uns
from Datasets import MyDataSet

In [8]:
from torch.utils.data import DataLoader

# Examine the reddit data from the ELAND repo to understand what data it expects

In [9]:
# Directory of the ELAND reddit dataset. load_data() reads `userlabels`, the
# `*.pkl` index maps, the `*.npy` feature arrays and the edgelist file(s) from
# here. The path is relative, so it depends on the notebook's working directory.
data_dir = "../../Eland/data/reddit/"

In [10]:
from collections import Counter
from scipy.sparse import csr_matrix, coo_matrix
def load_data(data_dir, graph_num):
    """Load labels, features, index maps and the user-product graph of a dataset.

    The following files are read from ``data_dir``:

    * ``userlabels``   - CSV lines ``<user>,<label>``; only the labels
      ``anomaly`` and ``benign`` are counted as labeled.
    * ``u2index.pkl`` / ``p2index.pkl`` - pickled mappings from user / product
      id to row / column index.
    * ``user2vec.npy`` / ``prod2vec.npy`` - user / product feature arrays.
    * ``tvt_idx.pkl``  - pickled ``(idx_train, idx_val, idx_test)`` triple.
    * ``splitted_edgelist_<n>`` or ``edgelist`` - CSV lines starting with
      ``<user>,<product>``; any further columns are ignored.

    Args:
        data_dir: Directory containing the files listed above.
        graph_num: Selects the edgelist. With ``n = int(graph_num * 10)``,
            ``splitted_edgelist_<n>`` is read when ``n < 10`` and the full
            ``edgelist`` otherwise.

    Returns:
        Tuple ``(u2index, labels, tvt_idx, user_feats, p2index, item_feats,
        graph)``. ``labels`` is an int array indexed like ``u2index`` with 1 for
        users labeled ``anomaly`` and 0 otherwise. ``graph`` is a
        ``csr_matrix`` of shape ``(len(u2index), len(p2index))`` whose entries
        count how often each (user, product) pair occurs in the edgelist.

    Raises:
        KeyError: If the edgelist mentions a user or product that is missing
            from ``u2index`` / ``p2index``.
        IndexError: If a non-blank line has fewer columns than required.
    """
    # Collect the labeled users and the subset flagged as anomalies.
    pos_uids = set()
    labeled_uids = set()
    with open(f'{data_dir}/userlabels', 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            arr = line.strip('\r\n').split(',')
            if arr[1] == 'anomaly':
                pos_uids.add(arr[0])
                labeled_uids.add(arr[0])
            elif arr[1] == 'benign':
                labeled_uids.add(arr[0])
    print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')

    # NOTE(security): pickle.load and np.load(allow_pickle=True) can execute
    # arbitrary code while deserializing; only use data_dir with trusted files.
    # Every file is opened in a `with` block so the handle is closed right away.
    # get users' features
    with open(f'{data_dir}/u2index.pkl', 'rb') as f:
        u2index = pk.load(f)
    with open(f'{data_dir}/user2vec.npy', 'rb') as f:
        user_feats = np.load(f, allow_pickle=True)
    # Get prod features
    with open(f'{data_dir}/p2index.pkl', 'rb') as f:
        p2index = pk.load(f)
    with open(f'{data_dir}/prod2vec.npy', 'rb') as f:
        item_feats = np.load(f, allow_pickle=True)

    # Binary label vector aligned with u2index: 1 = anomaly, 0 = everything else.
    labels = np.zeros(len(u2index), dtype=int)
    for u, idx in u2index.items():
        if u in pos_uids:
            labels[idx] = 1

    with open(f'{data_dir}/tvt_idx.pkl', 'rb') as f:
        tvt_idx = pk.load(f)
    idx_train, idx_val, idx_test = tvt_idx
    print('Train: total of {:5} users with {:5} pos users and {:5} neg users'.format(len(idx_train), np.sum(labels[idx_train]), len(idx_train)-np.sum(labels[idx_train])))
    print('Val:   total of {:5} users with {:5} pos users and {:5} neg users'.format(len(idx_val), np.sum(labels[idx_val]), len(idx_val)-np.sum(labels[idx_val])))
    print('Test:  total of {:5} users with {:5} pos users and {:5} neg users'.format(len(idx_test), np.sum(labels[idx_test]), len(idx_test)-np.sum(labels[idx_test])))

    # Count how many times every (user index, product index) pair occurs.
    edges = Counter()
    n = int(graph_num * 10)
    edgelist_file = f'{data_dir}/splitted_edgelist_{n}' if n < 10 else f'{data_dir}/edgelist'
    with open(edgelist_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            arr = line.strip('\r\n').split(',')
            # Only the user and product columns are needed for the graph.
            edges[(u2index[arr[0]], p2index[arr[1]])] += 1

    # Construct the graph: one weighted entry per distinct (user, product) pair.
    row = [i for i, _ in edges]
    col = [j for _, j in edges]
    entry = list(edges.values())
    graph = csr_matrix((entry, (row, col)), shape=(len(u2index), len(p2index)))
    return u2index, labels, tvt_idx, user_feats, p2index, item_feats, graph

In [11]:
# Chooses which edgelist file is used: with n = int(graph_num * 10), the file
# `splitted_edgelist_<n>` is read when n < 10 and the full `edgelist` otherwise.
# 0.1 therefore selects `splitted_edgelist_1`.
graph_num = 0.1

In [12]:
# Load index maps, labels, train/val/test split, features and the user-product
# graph for the edgelist selected by graph_num.
(
    u2index,
    labels,
    tvt_nids,
    user_features,
    p2index,
    item_features,
    graph,
) = load_data(data_dir=data_dir, graph_num=graph_num)

loaded labels, total of 832 positive users and 6000 labeled users
Train: total of  1200 users with   174 pos users and  1026 neg users
Val:   total of  1200 users with   166 pos users and  1034 neg users
Test:  total of  3600 users with   492 pos users and  3108 neg users


In [13]:
# Forwarded to the model constructor below as its `base_pred` argument.
base_pred = 30

# DataLoader

# Resolve the same edgelist file that load_data() picks for this graph_num:
# a split file while int(graph_num * 10) is below 10, the full edgelist otherwise.
n = int(graph_num * 10)

if n < 10:
    edgelist_file = f'{data_dir}/splitted_edgelist_{n}'
else:
    edgelist_file = f'{data_dir}/edgelist'

# Wrap the edgelist in the project's dataset class and batch it 300 at a time.
dataset = MyDataSet(p2index, item_features, edgelist_file)
lstm_dataloader = DataLoader(dataset, batch_size=300)

57975it [00:00, 309627.18it/s]


In [14]:
# Run configuration consumed by the model-construction and training cells below.

# Passed to the model constructor as `name`.
name='debug'
# When True the training cell calls eland.pretrain_nc_net(n_epochs=30) instead of eland.train().
baseline=False
# Passed to Eland_e2e as `gnnlayer_type`.
gnnlayer_type='gcn'
# Passed to the model constructor as `rnn_type`.
rnnlayer_type='lstm'
# Passed to the model constructor as `device`.
device='cpu'

# 'dominant' or 'deepae' selects Eland_e2e_uns; any other value selects Eland_e2e.
method='gcn'

In [18]:
# Build the ELAND model: Eland_e2e_uns for the 'dominant' / 'deepae' methods,
# Eland_e2e for everything else.
# NOTE(review): item_features is passed twice positionally (4th and 9th
# argument), exactly as before; the constructor signatures are not visible
# in this notebook, so confirm that this is intended.
if method in ('dominant', 'deepae'):
    # `method` is the notebook-level variable. The previous `args.method`
    # referenced an `args` object that is never defined in this notebook and
    # raised NameError whenever this branch was taken.
    eland = Eland_e2e_uns(graph, lstm_dataloader, user_features,
            item_features, labels, tvt_nids, u2index,
            p2index, item_features, lr=0.01, n_layers=2, name=name, pretrain_bm=25,
            pretrain_nc=25, epochs=10, method=method, rnn_type=rnnlayer_type, bmloss_type='mse', device=device, base_pred=base_pred)
else:
    eland = Eland_e2e(graph, lstm_dataloader, user_features,
            item_features, labels, tvt_nids, u2index,
            p2index, item_features, lr=0.01, n_layers=2, name=name, pretrain_bm=25, epochs=10,
            pretrain_nc=300, gnnlayer_type=gnnlayer_type, rnn_type=rnnlayer_type, bmloss_type='mse', device=device, base_pred=base_pred)

2022-05-24 22:57:01,834 - Parameters: {'base_pred': 30, 'device': 'cpu', 'bmloss_type': 'mse', 'alpha': 0.05, 'pretrain_nc': 300, 'pretrain_bm': 25, 'rnn_type': 'lstm', 'gnnlayer_type': 'gcn', 'name': 'debug', 'log': True, 'tensorboard': False, 'dropout': 0.4, 'weight_decay': 1e-05, 'lr': 0.01, 'seed': -1, 'epochs': 10, 'n_layers': 2, 'hidden_size': 128, 'cuda': 0, 'dim_feats': 300}


In [19]:
# !mkdir logs

In [21]:
# Displays the branch condition used by the training cell: True means the
# full eland.train() path is taken rather than eland.pretrain_nc_net().
not baseline

True

In [20]:
# With the baseline flag set only pretrain_nc_net is run (30 epochs);
# otherwise the full train() routine is used. Either result is unpacked
# into `auc` and `ap`.
if baseline:
    auc, ap = eland.pretrain_nc_net(n_epochs=30)
else:
    auc, ap = eland.train()

2022-05-24 22:57:54,187 - BM Module pretrain, Epoch 1/25: loss 324.97962386
2022-05-24 22:58:41,713 - BM Module pretrain, Epoch 2/25: loss 0.87825827
2022-05-24 22:59:31,019 - BM Module pretrain, Epoch 3/25: loss 0.6821715
2022-05-24 23:00:19,954 - BM Module pretrain, Epoch 4/25: loss 0.63952341
2022-05-24 23:01:08,680 - BM Module pretrain, Epoch 5/25: loss 0.6182542
2022-05-24 23:01:57,446 - BM Module pretrain, Epoch 6/25: loss 0.59951662
2022-05-24 23:02:46,338 - BM Module pretrain, Epoch 7/25: loss 0.59130993
2022-05-24 23:03:34,953 - BM Module pretrain, Epoch 8/25: loss 0.58739793
2022-05-24 23:04:24,491 - BM Module pretrain, Epoch 9/25: loss 0.59353044
2022-05-24 23:05:13,173 - BM Module pretrain, Epoch 10/25: loss 0.58320346
2022-05-24 23:06:01,591 - BM Module pretrain, Epoch 11/25: loss 0.57342202
2022-05-24 23:06:49,564 - BM Module pretrain, Epoch 12/25: loss 0.56354089
2022-05-24 23:07:38,563 - BM Module pretrain, Epoch 13/25: loss 0.54526532
2022-05-24 23:08:26,313 - BM Modul