In [4]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/rusty1s/pytorch_geometric.git

# Preprocessing des données tabulaires

In [5]:
import pandas as pd

In [6]:
database_path = "/content/drive/MyDrive/Project Anomaly Detection/data/data_out_head_10000.csv"

dataframe = pd.read_csv(database_path).fillna("")

In [8]:
dataframe['latitude']=dataframe['Location'].apply(lambda x : float(x.split(',')[0][1:]))
dataframe['longitude']=dataframe['Location'].apply(lambda x : float(x.split(',')[1][:-1]))

dataframe[["session_id1", "session_id2", "session_id3", "session_id4", "session_id5"]] = dataframe.Session_id.str.split('-',expand=True)

def hex_to_dec(id):
  if id == "" or id is None: return 0
  else: return int(id, 16)

dataframe['session_id1'] = dataframe['session_id1'].apply(hex_to_dec)
dataframe['session_id2'] = dataframe['session_id2'].apply(hex_to_dec)
dataframe['session_id3'] = dataframe['session_id3'].apply(hex_to_dec)
dataframe['session_id4'] = dataframe['session_id4'].apply(hex_to_dec)
dataframe['session_id5'] = dataframe['session_id5'].apply(hex_to_dec)


In [9]:
print(dataframe.head())

   Unnamed: 0           event_time        event_type  product_id  \
0           0  2020-01-25 23:46:12              cart     5921712   
1           1  2020-02-15 14:43:37  remove_from_cart     5921712   
2           2  2020-02-09 20:57:57  remove_from_cart     5921712   
3           3  2020-02-05 05:30:46              view     5921712   
4           4  2020-01-28 07:17:14              cart     5921712   

           category_id category_code brand  price    user_id  \
0  2115334439910245200                       5.16  388018099   
1  2115334439910245200                       5.16  459659126   
2  2115334439910245200                       5.16  405986628   
3  2115334439910245200                       5.16  571731968   
4  2115334439910245200                       5.16  601508456   

                             Session_id  ... duration   License_start_date  \
0  843d560b-2069-4a0d-68af-f767f5341312  ...    656.0  2020-01-22 07:56:13   
1  457cee31-cfd9-4f75-909d-64f17021da9d  ...    47

In [10]:
print(dataframe["brand"].unique())

['' 'coifin' 'ingarden' 'concept' 'irisk' 'enas' 'grattol' 'pnb' 'masura'
 'matrix' 'milv' 'kapous' 'roubloff' 'domix' 'runail' 'pole' 'staleks'
 'max' 'lsanic' 'bpw.style' 'estel' 'kaaral' 'beautix' 'petitfee' 'zinger'
 'kosmekka' 'haruyama' 'italwax' 'bluesky' 'uskusi' 'zeitun' 'uno'
 'jessnail' 'farmstay' 'metzger' 'smart' 'levissime' 'shik' 'freedecor'
 'skinlite' 'cnd' 'depilflax' 'polarus' 'cosmoprofi' 'swarovski'
 'nagaraku' 'de.lux' 'provoc' 'art-visage' 'solomeya' 'dermacol'
 'airnails' 'lowence' 'dizao' 'f.o.x' 'dewal' 'benovy' 'beauty-free'
 'tertio' 'artex' 'emil' 'lovely' 'opi' 'thuya']


In [11]:
print(dataframe.columns)

Index(['Unnamed: 0', 'event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'Session_id',
       'Customer_id', 'Location', 'License_id', 'Session_start_datetime',
       'Session_end_datetime', 'duration', 'License_start_date',
       'License_end_date', 'latitude', 'longitude', 'session_id1',
       'session_id2', 'session_id3', 'session_id4', 'session_id5'],
      dtype='object')


# Création du graphe

In [12]:
import torch
from datetime import datetime

def load_node_csv(dataframe, index_col, encoders=None):
    df = dataframe.set_index(index_col)
    mapping = {index: i for i, index in enumerate(df.index.unique())}

    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping


# class SequenceEncoder(object):
#     def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
#         self.device = device
#         self.model = SentenceTransformer(model_name, device=device)

#     @torch.no_grad()
#     def __call__(self, df):
#         x = self.model.encode(df.values, show_progress_bar=True,
#                               convert_to_tensor=True, device=self.device)
          
#         return x.cpu()


class GenresEncoder(object):
    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        genres = set(g for col in df.values for g in col.split(self.sep))
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, col in enumerate(df.values):
            for genre in col.split(self.sep):
                x[i, mapping[genre]] = 1
        return x


class IdentityEncoder(object):
    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        return torch.from_numpy(df.values).view(-1, 1).to(self.dtype)

class DateTimeEncoder(object):
    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        return torch.from_numpy(pd.to_datetime(df, infer_datetime_format=True).apply(lambda x: datetime.timestamp(x)).values).view(-1, 1).to(self.dtype)

class HexIdEncoder(object):
    def __init__(self, dtype=None):
        self.dtype = dtype
    def __call__(self,df):
        return torch.from_numpy(df.apply(hex_to_dec).values).view(-1, 1).to(self.dtype)


def load_edge_csv(dataframe, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None):
    df = dataframe

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
    
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [13]:
customer_encodings = {"latitude": IdentityEncoder(dtype=torch.float),"longitude":IdentityEncoder(dtype=torch.float)}
product_encodings = {  # "brand": SequenceEncoder(),
        "price": IdentityEncoder(dtype=torch.float),
        "category_id": IdentityEncoder(dtype=torch.float),
}
session_encodings = {
        "session_id1": IdentityEncoder(dtype=torch.float),
        "session_id2": IdentityEncoder(dtype=torch.float),
        "session_id3": IdentityEncoder(dtype=torch.float),
        "session_id4": IdentityEncoder(dtype=torch.float),
        "session_id5": IdentityEncoder(dtype=torch.float),
        "Session_start_datetime": DateTimeEncoder(dtype=torch.float),
        "Session_end_datetime": DateTimeEncoder(dtype=torch.float),
        "user_id": IdentityEncoder(dtype=torch.float),
}
licence_encodings = {
        "License_id": IdentityEncoder(dtype=torch.float),
        "License_start_date": DateTimeEncoder(dtype=torch.float),
        "License_end_date": DateTimeEncoder(dtype=torch.float),
}

In [14]:
from torch_geometric.data import HeteroData

def generate_graph(dataframe):
    data = HeteroData()

    # Loading nodes into graph
    data['customer'].x, customer_mapping = load_node_csv(dataframe, "Customer_id",customer_encodings)
    data['product'].x, product_mapping = load_node_csv(dataframe, "product_id", product_encodings)
    _, user_mapping = load_node_csv(dataframe, "user_id")
    data['user'].num_nodes = len(user_mapping)  # user has no features

    print(data)

    # Loading edges into graph
    data['customer', 'has', 'user'].edge_index, _ = load_edge_csv(
        dataframe,
        src_index_col='Customer_id',
        src_mapping=customer_mapping,
        dst_index_col='user_id',
        dst_mapping=user_mapping,
    )

    data['product', 'license','customer'].edge_index, data[
        'product', 'license','customer'].edge_attr = load_edge_csv(
        dataframe,
        src_index_col='product_id',
        src_mapping=product_mapping,
        dst_index_col='Customer_id',
        dst_mapping=customer_mapping,
        encoders=licence_encodings
    )
    data['user', 'opened_session', 'product'].edge_index, data[
        'user', 'opened_session', 'product'].edge_attr = load_edge_csv(
        dataframe,
        src_index_col='user_id',
        src_mapping=user_mapping,
        dst_index_col='product_id',
        dst_mapping=product_mapping,
        encoders=session_encodings
    )
    return data

In [16]:
data = generate_graph(dataframe)
print(data)

HeteroData(
  [1mcustomer[0m={ x=[10000, 2] },
  [1mproduct[0m={ x=[10000, 2] },
  [1muser[0m={ num_nodes=5436 }
)
HeteroData(
  [1mcustomer[0m={ x=[10000, 2] },
  [1mproduct[0m={ x=[10000, 2] },
  [1muser[0m={ num_nodes=5436 },
  [1m(customer, has, user)[0m={ edge_index=[2, 10000] },
  [1m(product, license, customer)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 3]
  },
  [1m(user, opened_session, product)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 8]
  }
)


In [17]:
data.num_nodes_dict

{'customer': 10000, 'product': 10000, 'user': 5436}

# Preprocessing du graphe


In [18]:
from torch_geometric.nn import MetaPath2Vec

metapath = [
    ('customer', 'has', 'user'),
    ('user', 'opened_session', 'product'),
    ('product', 'license','customer')
  
]


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=2,
                     metapath=metapath, walk_length=5, context_size=3,
                     walks_per_node=3, num_negative_samples=1,
                     sparse=True).to(device)

loader = model.loader(batch_size=32, shuffle=True, num_workers=3)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        

In [19]:
for epoch in range(1, 10):
  train(epoch)

Epoch: 1, Step: 00100/161, Loss: 1.6479
Epoch: 2, Step: 00100/161, Loss: 1.5230
Epoch: 3, Step: 00100/161, Loss: 1.4539
Epoch: 4, Step: 00100/161, Loss: 1.4033
Epoch: 5, Step: 00100/161, Loss: 1.3714
Epoch: 6, Step: 00100/161, Loss: 1.3411
Epoch: 7, Step: 00100/161, Loss: 1.3088
Epoch: 8, Step: 00100/161, Loss: 1.2663
Epoch: 9, Step: 00100/161, Loss: 1.1994


In [20]:
data['user'].x=model('user')

print(data)

HeteroData(
  [1mcustomer[0m={ x=[10000, 2] },
  [1mproduct[0m={ x=[10000, 2] },
  [1muser[0m={
    num_nodes=5436,
    x=[5436, 2]
  },
  [1m(customer, has, user)[0m={ edge_index=[2, 10000] },
  [1m(product, license, customer)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 3]
  },
  [1m(user, opened_session, product)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 8]
  }
)


Saving the graph for further testing

In [21]:
torch.save(data, "graph.pt")

# DOMINANT

In [22]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [23]:
import numpy as np

def dense_adj(data):
  adj_dict = {}
  for node_i in data.num_nodes_dict.keys():
    adj_dict[node_i] = {}
    shape = data.num_nodes_dict[node_i]
    for node_j in data.num_nodes_dict.keys():
      adj_dict[node_i][node_j] = torch.from_numpy(np.zeros((data.num_nodes_dict[node_i], data.num_nodes_dict[node_j])))
  for key in data.edge_index_dict.keys():
    a,_,b = key
    for (i,j) in data.edge_index_dict[key].numpy().transpose():
      adj_dict[a][b][i][j] = 1
    return adj_dict
  

On rajoute des transformations à notre graphe, propre à torch_geometric, pour que les performances d'apprentissage de nos réseaux de neurones soient plus hautes.

In [24]:
# data = torch.load("graph.pt")

print(data)

HeteroData(
  [1mcustomer[0m={ x=[10000, 2] },
  [1mproduct[0m={ x=[10000, 2] },
  [1muser[0m={
    num_nodes=5436,
    x=[5436, 2]
  },
  [1m(customer, has, user)[0m={ edge_index=[2, 10000] },
  [1m(product, license, customer)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 3]
  },
  [1m(user, opened_session, product)[0m={
    edge_index=[2, 10000],
    edge_attr=[10000, 8]
  }
)


In [25]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)
data = T.AddSelfLoops()(data)
data = T.NormalizeFeatures()(data)
data = T.ToDevice(device)(data)

On introduit notre modèle DOMINANT, basé sur des couches de convolutions SAGEConv

In [26]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch_geometric.nn import GCNConv, SAGEConv, to_hetero

class Encoder(nn.Module):
    def __init__(self, num_features, hidden_channels, dropout):
        super().__init__()
        self.conv1 = SAGEConv((num_features, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.conv2(x, edge_index).relu()
        return x

class Attribute_Decoder(nn.Module):
    def __init__(self, num_features, hidden_channels, dropout):
        super().__init__()

        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), num_features)
        self.dropout = dropout

    def forward(self, x, adj):

        x = F.relu(self.conv1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.relu(self.conv2(x, adj))

        return x

class Structure_Decoder(nn.Module):
    def __init__(self, num_nodes, hidden_channels, dropout):
        super().__init__()

        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), num_nodes)
        self.dropout = dropout

    def forward(self, x, adj):

        x = F.relu(self.conv1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.relu(self.conv2(x, adj))

        return x.T


class Dominant(nn.Module):
    def __init__(self, feat_size, hidden_size, num_nodes_dict, dropout, metadata):
        super().__init__()
        
        self.shared_encoder = to_hetero(Encoder(feat_size, hidden_size, dropout), metadata, aggr='sum')
        self.attr_decoder = to_hetero(Attribute_Decoder(feat_size, hidden_size, dropout), metadata, aggr='sum')
        self.struct_decoder_dict = {}
        for key in num_nodes_dict.keys():
          self.struct_decoder_dict[key] = to_hetero(Structure_Decoder(num_nodes_dict[key], hidden_size, dropout), metadata, aggr='sum')
    
    def forward(self, x_dict, adj_dict):

        # encode
        x_dict = self.shared_encoder(x_dict, adj_dict)
        # decode feature matrix
        x_hat_dict = self.attr_decoder(x_dict, adj_dict)
        # decode adjacency matrix
        struct_reconstructed_dict={}
        for key in self.struct_decoder_dict.keys():
          struct_reconstructed_dict[key] = self.struct_decoder_dict[key](x_dict, adj_dict)
        # return reconstructed matrices
        return struct_reconstructed_dict, x_hat_dict

We train our model now

In [31]:
def loss_func_train(attrs, X_hat, adj, A_hat, alpha=0.8):
    # Attribute reconstruction loss
    diff_attribute = []
    for key in attrs.keys():
        diff_attribute.append(torch.pow(X_hat[key] - attrs[key], 2))
    diff_attribute = torch.cat(tuple(diff_attribute), 0)
    attribute_reconstruction_errors = torch.sqrt(torch.sum(diff_attribute, 1))
    attribute_cost = torch.mean(attribute_reconstruction_errors)

    # structure reconstruction loss
    diff_structure_all = []
    for key1 in adj.keys():
        structure = []
        for key2 in adj.keys():
            structure.append(torch.pow(A_hat[key1][key2] - adj[key1][key2], 2))
        diff_structure_all.append(torch.cat(tuple(structure), 1))
    diff_structure = torch.cat(tuple(diff_structure_all), 0)
    structure_reconstruction_errors = torch.sqrt(torch.sum(diff_structure, 1))
    structure_cost = torch.mean(structure_reconstruction_errors)

    cost =  alpha * attribute_reconstruction_errors + (1-alpha) * structure_reconstruction_errors

    return cost, structure_cost, attribute_cost

def loss_func_test(attrs, X_hat):
    # Attribute reconstruction loss
    diff_attribute = torch.pow(X_hat - attrs, 2)
    attribute_reconstruction_errors = torch.sqrt(torch.sum(diff_attribute, 1))
    attribute_cost = torch.mean(attribute_reconstruction_errors)

    # structure reconstruction loss
    # diff_structure = torch.pow(A_hat - adj, 2)
    # structure_reconstruction_errors = torch.sqrt(torch.sum(diff_structure, 1))
    # structure_cost = torch.mean(structure_reconstruction_errors)
    structure_cost = 0

    cost =  attribute_reconstruction_errors


    return cost, structure_cost, attribute_cost

model = Dominant(feat_size=2, hidden_size=64, num_nodes_dict = data.num_nodes_dict, dropout=0.3, metadata=data.metadata()).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 5e-3)

In [28]:
from torch_geometric.loader import NeighborLoader

train_loader = NeighborLoader(
    data,
    # Sample 15 neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[15] * 2,
    # Use a batch size of 128 for sampling training nodes of type "paper":
    batch_size=32,
    input_nodes=('customer'),
)

In [29]:
epochs = 1
X = data.x_dict
adj = dense_adj(data)

for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        A_hat, X_hat = model(data.x_dict, data.edge_index_dict)
        loss, struct_loss, feat_loss = loss_func_train(X, X_hat, adj, A_hat)
        l = torch.mean(loss)
        l.backward(retain_graph=True)
        optimizer.step()        
        print("Epoch:", '%04d' % (epoch), "train_loss=", "{:.5f}".format(l.item()),"train/feat_loss=", "{:.5f}".format(feat_loss.item()))

        # if epoch == epochs - 1:
        #     model.eval()
        #     A_hat, X_hat = model(data.x_dict, data.edge_index_dict)
        #     loss, struct_loss, feat_loss = loss_func(X['customer'], X_hat['customer'])
        #     score = loss.detach().cpu().numpy()
        #     print("Score = ", score)

Epoch: 0000 train_loss= 8.76805 train/feat_loss= 0.71963


In [32]:
model.eval()
A_hat, X_hat = model(data.x_dict, data.edge_index_dict)
loss, struct_loss, feat_loss = loss_func_test(X['customer'], X_hat['customer'])
score = loss.detach().cpu().numpy()
print("Score = ", score)

Score =  [0.68097085 0.9017715  1.1054202  ... 0.5713983  0.45627385 0.37311247]


In [33]:
len(score)

10000