In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import random
import sys
import os
from sklearn.metrics import f1_score

In [None]:
# Seed both global random number generators (NumPy's and the stdlib's) with
# the same fixed value so that repeated runs draw the same random numbers.
np.random.seed(0)
random.seed(0)

In [None]:
# Every tunable value used by the notebook lives in this one dictionary.
config = {
    # --- embedding computation ---
    "cleora_n_iter": 5,    # handed to the Cleora binary as `-n`
    "cleora_dim": 1024,    # handed to the Cleora binary as `--dimension`

    # --- dataset preparation ---
    "train_test_split": 0.2,     # used as `test_size` when splitting the labels
    "dataset_name": "fb-pages",  # selects the files under data/<dataset_name>/

    # --- classification training ---
    # Embedding files that get evaluated, one classifier per file.
    "input_embeddings": [
        "output/emb__cluster_id__StarNode.out",
        "output/emb__CliqueNode__CliqueNode.out",
    ],
    "batch_size": 256,        # rows per `partial_fit` call
    "test_batch_size": 1000,  # rows per prediction call
    # Epoch counts after which F1 is reported; training runs for max(epochs).
    "epochs": [20],
}

# Directory passed to Cleora as its output location (`-o`).
output_dir = "output"

# Read and split data

In [None]:
# All inputs of a dataset live in data/<dataset_name>/ and carry the dataset
# name as a suffix, so build the common pieces once.
dataset_name = config['dataset_name']
dataset_dir = f"data/{dataset_name}"

# Inputs for the two Cleora runs (clique and star expansion).
clique_input_filename = f"{dataset_dir}/hyperedges-{dataset_name}.txt"
star_input_filename = f"{dataset_dir}/star-{dataset_name}.txt"

# Node labels; the `id` and `target` columns are used further down.
df = pd.read_csv(f"{dataset_dir}/node-labels-{dataset_name}.csv")

In [None]:
# Text files that hold the "<id> <target>" pairs of the train and test split.
train_filename, test_filename = "output/train_set.txt", "output/test_set.txt"

In [None]:
# Split the labelled nodes into train and test parts. The seed is passed
# explicitly so the split does not depend on how much of NumPy's global random
# state has been consumed before this cell runs (e.g. when the cell is re-run
# on its own).
train_set, test_set = train_test_split(
    df,
    test_size=config['train_test_split'],
    random_state=0,
)

In [None]:
# The target directory may not exist yet on a fresh checkout; create it so
# that `open(..., "w")` does not fail.
os.makedirs(os.path.dirname(train_filename) or ".", exist_ok=True)

# One "<id> <target>" line per training node. Zipping the two columns (instead
# of `iterrows`, which builds one Series per row with a single common dtype)
# writes every value with its own column dtype, so integer ids stay integers.
with open(train_filename, "w") as f_train:
    for node_id, target in zip(train_set['id'], train_set['target']):
        f_train.write(f"{node_id} {target}\n")

In [None]:
# The target directory may not exist yet on a fresh checkout; create it so
# that `open(..., "w")` does not fail.
os.makedirs(os.path.dirname(test_filename) or ".", exist_ok=True)

# One "<id> <target>" line per test node. Zipping the two columns (instead of
# `iterrows`, which builds one Series per row with a single common dtype)
# writes every value with its own column dtype, so integer ids stay integers.
with open(test_filename, "w") as f_test:
    for node_id, target in zip(test_set['id'], test_set['target']):
        f_test.write(f"{node_id} {target}\n")

# Cleora training

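Download an appropriate binary Cleora release from https://github.com/Synerise/cleora/releases.

The `train_cleora` function below invokes the Windows MSVC build (`cleora-master/cleora-v1.1.1-x86_64-pc-windows-msvc`). If you downloaded a different release (e.g. the Linux GNU one), point it at that binary instead — any build will do.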

In [None]:
import subprocess


def columns2output_filename(output_dir, columns):
    """Build the path of the embedding file for a given column specification.

    Each whitespace-separated column keeps only the part after its last `::`
    (i.e. its modifiers are dropped). The names are joined with `__` and
    wrapped as `emb__<names>.out`. A single column whose specification
    contains `reflexive` has its name repeated twice.

    Args:
        output_dir: Directory the file name is joined onto.
        columns: Column specification string, e.g.
            "transient::cluster_id StarNode".

    Returns:
        The joined path as a string.
    """
    specs = columns.split()

    if len(specs) == 1 and 'reflexive' in columns:
        # A lone reflexive column is paired with itself.
        name = columns.split('::')[-1]
        basename = f'emb__{name}__{name}.out'
    else:
        bare_names = (spec.split('::')[-1] for spec in specs)
        basename = 'emb__' + '__'.join(bare_names) + '.out'

    return os.path.join(output_dir, basename)


def train_cleora(dim, n_iter, columns, input_filename, output_dir,
                 cleora_binary='cleora-master/cleora-v1.1.1-x86_64-pc-windows-msvc'):
    """Run the Cleora binary on one input file and return the embedding path.

    Args:
        dim: Embedding dimension, passed as `--dimension`.
        n_iter: Number of iterations, passed as `-n`.
        columns: Column specification, passed as `--columns`.
        input_filename: Path of the input file, passed as `--input`.
        output_dir: Output directory, passed as `-o`.
        cleora_binary: Path of the Cleora executable. Defaults to the Windows
            MSVC build; pass the path of another release (e.g. the Linux GNU
            build) to run on a different platform.

    Returns:
        The path `columns2output_filename` derives from `output_dir` and
        `columns`. The path is computed, not checked for existence.

    Raises:
        subprocess.CalledProcessError: If the binary exits with a non-zero
            status (`check=True`).
    """
    command = [cleora_binary,
               '--columns', columns,
               '--dimension', str(dim),
               '-n', str(n_iter),
               '--input', input_filename,
               '-o', output_dir]
    subprocess.run(command, check=True)
    return columns2output_filename(output_dir, columns)

## Star expansion

In the star input file (`data/fb-pages/star-fb-pages.txt`) the first column is a virtual node. The column specification `"transient::cluster_id StarNode"` marks that first column as `transient`, which means that embeddings will not be created for nodes from this column. This translates to the star expansion scheme.

In [None]:
%%time
cleora_output_star_filename = train_cleora(config['cleora_dim'],
                                           config['cleora_n_iter'],
                                           "transient::cluster_id StarNode",
                                           star_input_filename,
                                           output_dir)

## Clique expansion

The clique input file (`data/fb-pages/hyperedges-fb-pages.txt`) has the structure of an adjacency list. The column specification `"complex::reflexive::CliqueNode"` means that edges will be created for all combinations of nodes from each line. This translates to the clique expansion scheme.

In [None]:
%%time
cleora_output_clique_filename = train_cleora(config['cleora_dim'],
                                             config['cleora_n_iter'],
                                             "complex::reflexive::CliqueNode",
                                             clique_input_filename,
                                             output_dir)

## No expansion

You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c "node1 node2"`.

# Classification

We train a simple multiclass Logistic Regression classifier to predict the class of a node based on its embedding. We assess the quality of the classifier with two metrics: micro-F1 and macro-F1.

In [None]:
def read_embeddings(input_file):
    """Load a space-separated embedding file into a DataFrame.

    The first line of the file is skipped, the first field of every remaining
    line becomes the index, and the field right after it (column label 1) is
    dropped.
    NOTE(review): presumably the skipped line is Cleora's header and the
    dropped column a per-node count; confirm against the Cleora output format.

    Args:
        input_file: Path (or file-like object) of the embedding file.

    Returns:
        A DataFrame indexed by the first field, holding the remaining columns
        (labelled 2, 3, ...).
    """
    raw = pd.read_csv(input_file, sep=" ", header=None, skiprows=[0], index_col=0)
    return raw.drop(columns=[1])

In [None]:
def read_train_test(embeddings):
    """Load the train/test "<id> <target>" files, keeping only embedded nodes.

    The files are read from the module-level `train_filename` and
    `test_filename`. A row is kept when its node id (column 0) is present in
    the index of `embeddings`. The target (column 1) is a class label, not a
    node id, so it is deliberately not matched against the embedding index.

    Args:
        embeddings: DataFrame whose index holds the ids of nodes that have an
            embedding.

    Returns:
        A `(train, test)` tuple of 2-D integer arrays with the columns
        `[id, target]`.
    """
    valid_idx = embeddings.index.to_numpy()

    # ndmin=2 keeps the result two-dimensional even for a one-line file, so
    # the column slicing below (and in callers) always works.
    train = np.loadtxt(train_filename, delimiter=" ", dtype=int, ndmin=2)
    test = np.loadtxt(test_filename, delimiter=" ", dtype=int, ndmin=2)

    # Vectorised membership test on the id column only.
    train = train[np.isin(train[:, 0], valid_idx)]
    test = test[np.isin(test[:, 0], valid_idx)]

    return train, test

In [None]:
# Pull the two batch sizes out of the config for use in the training loop.
batch_size, test_batch_size = config['batch_size'], config['test_batch_size']

In [None]:
# Every label that can occur; `partial_fit` needs the complete set of classes
# because a single mini-batch may not contain all of them.
targets = df['target'].unique()

# Train and evaluate one classifier per embedding file.
for algo in config['input_embeddings']:
    embeddings = read_embeddings(algo)
    train, test = read_train_test(embeddings)

    # Column 0 holds node ids, column 1 the class labels.
    y_train = train[:, 1]
    y_test = test[:, 1]

    clf = SGDClassifier(random_state=0, loss='log_loss', alpha=0.0001)
    for e in tqdm(range(0, max(config['epochs']))):
        # One pass over the training rows in mini-batches. Slicing clamps at
        # the end of the array, so the last batch may simply be shorter.
        for idx in range(0, train.shape[0], batch_size):
            ex = train[idx:idx + batch_size, :]

            ex_emb_in = embeddings.loc[ex[:, 0]].to_numpy()
            ex_y = y_train[idx:idx + batch_size]

            clf.partial_fit(ex_emb_in, ex_y, classes=targets)

        # Report F1 only after the epoch counts listed in the config.
        if e + 1 in config['epochs']:
            y_pred = []
            for idx in range(0, test.shape[0], test_batch_size):
                ex = test[idx:idx + test_batch_size, :]
                ex_emb_in = embeddings.loc[ex[:, 0]].to_numpy()
                pred = clf.predict_proba(ex_emb_in)

                # argmax yields a column position in `pred`; translate it to
                # the actual label through `clf.classes_` so the predictions
                # are comparable with `y_test`.
                classes = clf.classes_[np.argmax(pred, axis=1)]
                y_pred.extend(classes)

            f1_micro = f1_score(y_test, y_pred, average='micro')
            f1_macro = f1_score(y_test, y_pred, average='macro')
            print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e + 1, f1_micro, f1_macro))
