In [4]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/rusty1s/pytorch_geometric.git

# Anomaly Detection in Graphs using Self-Supervised Learning

![CentraleSupelec Logo](https://www.centralesupelec.fr/sites/all/themes/cs_theme/medias/common/images/intro/logo_nouveau.jpg)

This is my end-of-studies project, developed for a large company in association with the French engineering school CentraleSupélec.

Because we signed an NDA, we cannot share the company's data. Instead, the data used here comes from an adapted Kaggle dataset:
```
https://www.kaggle.com/datasets/mkechinov/ecommerce-events-history-in-cosmetics-shop/
```

The idea behind this project is to use the DOMINANT algorithm (**D**eep An**om**aly Detect**i**o**n** on **A**ttributed **N**e**t**works), a novel graph autoencoder framework, and to adapt it to a heterogeneous graph representing the company's ecosystem of users and software.

## Preprocessing the tabular data

Our sample data consists of e-commerce events, which we convert into a **heterogeneous graph** with three types of nodes (*product*, *customer* and *user*) and three types of edges that describe how these nodes relate to each other.

In [1]:
import pandas as pd

In [2]:
# Sample events table (CSV) from which the graph is built.
database_path = "data_out_head_head.csv"

# The first CSV column is used as the row index; every missing cell is
# then replaced by an empty string.
dataframe = pd.read_csv(database_path, index_col=0)
dataframe = dataframe.fillna("")

In [3]:
# Display the loaded events table as the cell output.
dataframe

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,Session_id,Customer_id,Location,License_id,Session_start_datetime,Session_end_datetime,duration,License_start_date,License_end_date
0,2020-01-25 23:46:12,cart,5921712,2115334439910245200,,,5.16,388018099,843d560b-2069-4a0d-68af-f767f5341312,480374496_65446,"[-7.1208, -34.5019]",5921712,2020-01-25 18:23:12,2020-01-26 05:19:12,656.0,2020-01-22 07:56:13,2020-02-27 05:38:51
1,2020-02-15 14:43:37,remove_from_cart,5921712,2115334439910245200,,,5.16,459659126,457cee31-cfd9-4f75-909d-64f17021da9d,552795963_171732,"[55.0342, 6.547499999999999]",5921712,2020-02-15 07:57:37,2020-02-15 15:52:37,475.0,2020-01-22 07:56:13,2020-02-27 05:38:51
2,2020-02-09 20:57:57,remove_from_cart,5921712,2115334439910245200,,,5.16,405986628,a4354a0c-f44a-484c-96b7-b319f81e99de,405986628_283400,"[20.6167, -96.1167]",5921712,2020-02-09 14:17:57,2020-02-10 02:48:57,751.0,2020-01-22 07:56:13,2020-02-27 05:38:51
3,2020-02-05 05:30:46,view,5921712,2115334439910245200,,,5.16,571731968,10ba57c9-187e-454a-b57c-cdc71388cbe5,610461263_109078,"[25.7206, 76.8472]",5921712,2020-02-04 21:41:46,2020-02-05 10:44:46,783.0,2020-01-22 07:56:13,2020-02-27 05:38:51
4,2020-01-28 07:17:14,cart,5921712,2115334439910245200,,,5.16,601508456,201af163-9d3f-45ae-9511-7f64d8e168c1,530951720_21726,"[55.268, 1.476]",5921712,2020-01-27 21:33:14,2020-01-28 14:48:14,1035.0,2020-01-22 07:56:13,2020-02-27 05:38:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2020-02-12 10:30:12,view,4810,1487580006317032337,,,6.67,612193632,a469ba53-4047-4a0a-a22e-05e7c922cb65,561323751_958571,"[51.8844, -94.1464]",4810,2020-02-12 02:29:12,2020-02-12 17:27:12,898.0,2019-09-30 21:07:40,2020-03-01 03:55:29
996,2020-02-15 18:39:37,cart,4810,1487580006317032337,,,6.67,514701786,67748dce-b81f-46a4-b943-2667fc5edb15,514701786_629210,"[13.9833, 125.9]",4810,2020-02-15 16:25:37,2020-02-16 01:54:37,569.0,2019-09-30 21:07:40,2020-03-01 03:55:29
997,2019-10-09 11:13:26,view,4810,1487580006317032337,,,6.67,551061566,41af1dc9-7c2f-4222-9919-18a449341d1b,,"[-11.72, -56.3278]",4810,2019-10-09 10:26:26,2019-10-09 14:38:26,252.0,2019-09-30 21:07:40,2020-03-01 03:55:29
998,2020-02-13 12:17:35,cart,4810,1487580006317032337,,,6.67,465231019,b65bbb4c-769a-45e0-b7db-99cbd6808883,465231019_93796,"[18.822699999999998, 123.3295]",4810,2020-02-13 04:02:35,2020-02-13 16:53:35,771.0,2019-09-30 21:07:40,2020-03-01 03:55:29


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   event_time              1000 non-null   object 
 1   event_type              1000 non-null   object 
 2   product_id              1000 non-null   int64  
 3   category_id             1000 non-null   int64  
 4   category_code           1000 non-null   object 
 5   brand                   1000 non-null   object 
 6   price                   1000 non-null   float64
 7   user_id                 1000 non-null   int64  
 8   Session_id              1000 non-null   object 
 9   Customer_id             1000 non-null   object 
 10  Location                1000 non-null   object 
 11  License_id              1000 non-null   int64  
 12  Session_start_datetime  1000 non-null   object 
 13  Session_end_datetime    1000 non-null   object 
 14  duration                1000 non-null   float6

# Building the graph


In [5]:
from utils import IdentityEncoder, LocationEncoder, SessionIdEncoder, DateTimeEncoder
import torch

# Column -> encoder tables used when building the graph.
# Every encoder is constructed with dtype=torch.float.

# Features of the 'customer' nodes.
customer_encodings = {"Location": LocationEncoder(dtype=torch.float)}

# Features of the 'product' nodes: both columns go through an IdentityEncoder
# (one separate encoder instance per column).
product_encodings = {
    column: IdentityEncoder(dtype=torch.float)
    for column in ("price", "category_id")
}

# Attributes of the (user, opened_session, product) edges.
session_encodings = {
    "Session_id": SessionIdEncoder(dtype=torch.float),
    "Session_start_datetime": DateTimeEncoder(dtype=torch.float),
    "Session_end_datetime": DateTimeEncoder(dtype=torch.float),
    "user_id": IdentityEncoder(dtype=torch.float),
}

# Attributes of the (product, license, customer) edges.
licence_encodings = {
    "License_id": IdentityEncoder(dtype=torch.float),
    "License_start_date": DateTimeEncoder(dtype=torch.float),
    "License_end_date": DateTimeEncoder(dtype=torch.float),
}

In [6]:
from torch_geometric.data import HeteroData
from utils import load_edge_csv, load_node_csv

def generate_graph(dataframe):
    """Build the heterogeneous customer / product / user graph.

    Node features and edge attributes are produced by ``load_node_csv`` and
    ``load_edge_csv`` using the module-level encoder tables
    (``customer_encodings``, ``product_encodings``, ``licence_encodings``,
    ``session_encodings``).

    Args:
        dataframe: Events table handed as-is to the loaders. It is presumably
            a pandas DataFrame holding the ``Customer_id``, ``product_id`` and
            ``user_id`` columns plus every column named in the encoder tables
            (to be confirmed against ``utils``).

    Returns:
        A ``HeteroData`` object with node types ``customer``, ``product`` and
        ``user`` and the edge types ``(customer, has, user)``,
        ``(product, license, customer)`` and ``(user, opened_session, product)``.
    """
    graph = HeteroData()

    # --- Nodes -------------------------------------------------------------
    customer_x, customer_mapping = load_node_csv(
        dataframe, "Customer_id", customer_encodings)
    graph['customer'].x = customer_x

    product_x, product_mapping = load_node_csv(
        dataframe, "product_id", product_encodings)
    graph['product'].x = product_x

    # Users carry no feature matrix here: only the node count is recorded.
    _unused_x, user_mapping = load_node_csv(dataframe, "user_id")
    graph['user'].num_nodes = len(user_mapping)

    # --- Edges -------------------------------------------------------------
    # customer -> user: connectivity only, no edge attributes are stored.
    has_edge = ('customer', 'has', 'user')
    has_index, _unused_attr = load_edge_csv(
        dataframe,
        src_index_col='Customer_id',
        src_mapping=customer_mapping,
        dst_index_col='user_id',
        dst_mapping=user_mapping,
    )
    graph[has_edge].edge_index = has_index

    # product -> customer: attributes come from the licence columns.
    license_edge = ('product', 'license', 'customer')
    license_index, license_attr = load_edge_csv(
        dataframe,
        src_index_col='product_id',
        src_mapping=product_mapping,
        dst_index_col='Customer_id',
        dst_mapping=customer_mapping,
        encoders=licence_encodings,
    )
    graph[license_edge].edge_index = license_index
    graph[license_edge].edge_attr = license_attr

    # user -> product: attributes come from the session columns.
    session_edge = ('user', 'opened_session', 'product')
    session_index, session_attr = load_edge_csv(
        dataframe,
        src_index_col='user_id',
        src_mapping=user_mapping,
        dst_index_col='product_id',
        dst_mapping=product_mapping,
        encoders=session_encodings,
    )
    graph[session_edge].edge_index = session_index
    graph[session_edge].edge_attr = session_attr

    return graph

In [7]:
# Build the heterogeneous graph from the events table and print its summary.
data = generate_graph(dataframe)
print(data)

HeteroData(
  [1mcustomer[0m={ x=[1000, 2] },
  [1mproduct[0m={ x=[1000, 2] },
  [1muser[0m={ num_nodes=581 },
  [1m(customer, has, user)[0m={ edge_index=[2, 1000] },
  [1m(product, license, customer)[0m={
    edge_index=[2, 1000],
    edge_attr=[1000, 3]
  },
  [1m(user, opened_session, product)[0m={
    edge_index=[2, 1000],
    edge_attr=[1000, 4]
  }
)


In [8]:
# Number of nodes for each node type, keyed by node-type name.
data.num_nodes_dict

{'customer': 1000, 'product': 1000, 'user': 581}

# Preprocessing the graph


In [9]:
from torch_geometric.nn import MetaPath2Vec
from torch_sparse import SparseTensor

# Meta-path followed by the random walks. The three edge types chain into a
# cycle: customer -> user -> product -> customer.
metapath = [
    ('customer', 'has', 'user'),
    ('user', 'opened_session', 'product'),
    ('product', 'license', 'customer'),
]

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# MetaPath2Vec model learning 2-dimensional node embeddings over the
# graph's edge types.
model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=2,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
model = model.to(device)

# Batches of positive / negative random walks produced by the model itself.
loader = model.loader(batch_size=32, shuffle=True, num_workers=3)

# The model is built with sparse=True, hence the SparseAdam optimizer.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


def train(epoch, log_steps=100, eval_steps=2000):
    """Run one training epoch of the module-level ``model`` over ``loader``.

    The running mean loss is printed every ``log_steps`` batches. Any
    remaining batches at the end of the epoch are reported too, so an epoch
    shorter than ``log_steps`` batches still produces a log line.

    Args:
        epoch: Epoch number; only used in the log messages.
        log_steps: Number of batches between two log lines.
        eval_steps: Unused; kept so that existing calls keep working.

    Returns:
        The mean loss over all batches of the epoch, or ``nan`` when the
        loader yielded no batch.
    """
    model.train()

    epoch_loss = 0.0      # sum of the batch losses over the whole epoch
    num_batches = 0
    window_loss = 0.0     # sum of the batch losses since the last log line
    window_batches = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        num_batches += 1
        window_loss += batch_loss
        window_batches += 1

        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {window_loss / window_batches:.4f}'))
            window_loss = 0.0
            window_batches = 0

    # Report the tail of the epoch that did not fill a complete logging window.
    if window_batches:
        print((f'Epoch: {epoch}, Step: {num_batches:05d}/{len(loader)}, '
               f'Loss: {window_loss / window_batches:.4f}'))

    return epoch_loss / num_batches if num_batches else float('nan')

        



In [10]:
# Train the MetaPath2Vec embeddings for 9 epochs (epoch numbers 1 to 9).
for epoch in range(1, 10):
  train(epoch)

In [11]:
# Use the learned MetaPath2Vec embeddings as the feature matrix of the 'user'
# nodes. A detached copy is stored so that the features are plain data: they
# carry no autograd history back to the embedding model and do not share
# memory with its parameters.
data['user'].x = model('user').detach().clone()

print(data)

HeteroData(
  [1mcustomer[0m={ x=[1000, 2] },
  [1mproduct[0m={ x=[1000, 2] },
  [1muser[0m={
    num_nodes=581,
    x=[581, 2]
  },
  [1m(customer, has, user)[0m={ edge_index=[2, 1000] },
  [1m(product, license, customer)[0m={
    edge_index=[2, 1000],
    edge_attr=[1000, 3]
  },
  [1m(user, opened_session, product)[0m={
    edge_index=[2, 1000],
    edge_attr=[1000, 4]
  }
)


Saving the graph for further testing

In [12]:
# Serialise the whole graph object to "graph.pt" (path relative to the working directory).
torch.save(data, "graph.pt")

# DOMINANT

![The proposed framework](framework.png)

In [13]:
# Select the compute device for the DOMINANT model: GPU when available, else CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

We apply a few `torch_geometric` transforms to the graph in order to improve the training performance of our neural networks.

In [14]:
import torch_geometric.transforms as T

# Apply the graph transforms one after the other, in this order: make the
# graph undirected, add self-loops, normalise the features, and move
# everything to the selected device.
graph_pipeline = T.Compose([
    T.ToUndirected(),
    T.AddSelfLoops(),
    T.NormalizeFeatures(),
    T.ToDevice(device),
])
data = graph_pipeline(data)

We now train our model.

In [15]:
from torch_geometric.loader import NeighborLoader
from model import Dominant, loss_func_train, loss_func_test
from utils import dense_adj


# DOMINANT model over the heterogeneous graph, moved to the selected device.
model = Dominant(
    feat_size=2,
    hidden_size=64,
    num_nodes_dict=data.num_nodes_dict,
    dropout=0.3,
    metadata=data.metadata(),
)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

# Neighbour-sampling loader seeded on 'customer' nodes: 15 neighbours are
# sampled per edge type for each of 2 iterations, in mini-batches of 32
# seed nodes.
# NOTE(review): the training loop below runs full-batch on `data` and never
# iterates over `train_loader`.
train_loader = NeighborLoader(
    data,
    num_neighbors=[15, 15],
    batch_size=32,
    input_nodes='customer',
)

epochs = 1
X = data.x_dict
adj = dense_adj(data)

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Reconstruct the adjacency and the node features from the full graph.
    A_hat, X_hat = model(data.x_dict, data.edge_index_dict)
    loss, struct_loss, feat_loss = loss_func_train(X, X_hat, adj, A_hat)

    # Reduce the loss to a scalar before back-propagating.
    l = loss.mean()
    l.backward(retain_graph=True)
    optimizer.step()

    print(
        "Epoch:", '%04d' % (epoch),
        "train_loss=", "{:.5f}".format(l.item()),
        "train/feat_loss=", "{:.5f}".format(feat_loss.item()),
    )

Epoch: 0000 train_loss= 3.36127 train/feat_loss= 0.61973


In [16]:
# Score the 'customer' nodes with the trained model: the loss returned by
# loss_func_test on the customer features is used as the score.
model.eval()

# Inference only: disable autograd so that no computation graph is built.
with torch.no_grad():
    A_hat, X_hat = model(data.x_dict, data.edge_index_dict)
    loss, struct_loss, feat_loss = loss_func_test(X['customer'], X_hat['customer'])

score = loss.detach().cpu().numpy()
print("Score = ", score)

Score =  [0.5490602  0.46584153 0.5456789  0.30138168 0.28215313 0.13683842
 0.56195325 0.1568579  0.2823684  0.5904142  0.162146   0.31566143
 0.49102774 0.60641307 0.48135749 0.56343025 0.6026771  0.70979804
 0.5121803  0.24602896 0.36202833 0.722054   0.6650754  0.3425091
 0.3409965  0.3150705  0.5258348  0.54789335 0.07182362 0.44697732
 0.25242344 0.34971362 0.21716468 0.24417703 0.3005462  0.34793943
 0.20913118 0.4185117  0.11430482 0.27168134 0.37676388 0.51085454
 0.37776443 0.23599829 0.10316314 0.47775063 0.10582247 0.4729387
 0.32018968 0.9862269  0.51741534 0.71544814 0.25345492 0.71508497
 0.4228449  0.19055523 0.29404622 0.49799165 0.46902895 0.96300954
 0.5658795  0.7369426  0.65345377 0.39049697 0.56400496 0.37159526
 0.13241121 0.32234725 0.2686462  0.48280028 0.6277204  0.69611996
 0.29928225 0.6646999  0.24685448 0.5337853  0.4334252  0.14657682
 0.8489172  0.2067341  0.26053047 0.48895186 0.4685598  0.56706846
 0.40895364 0.07124864 0.5288472  0.3519657  0.39788744

In [26]:
# Sanity check: number of scores produced (expected to match the number of customer nodes).
len(score)

1000