In [25]:
# Libs
import os
import numpy as np
import json
from tqdm import tqdm
from operator import itemgetter
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import warnings

In [26]:
# Reference for the classes: https://github.com/keras-team/keras-io/blob/master/examples/graph/gat_node_classification.py

In [27]:
# Some constants we are gonna use
DATA_INPUT_PATH = 'data'
DATA_INPUT_NAME2 = 'dev.json'
DATA_INPUT_NAME3 = 'train.json'
DATA_INPUT_NAME4 = 'test.json'

In [28]:
# Openning the files provided by the academic community
# https://arxiv.org/abs/2106.13088
with open(os.path.join(DATA_INPUT_PATH, DATA_INPUT_NAME2)) as file_json:
    data_twitter_dev = json.load(file_json)
with open(os.path.join(DATA_INPUT_PATH, DATA_INPUT_NAME3)) as file_json:
    data_twitter_train = json.load(file_json)
with open(os.path.join(DATA_INPUT_PATH, DATA_INPUT_NAME4)) as file_json:
    data_twitter_test = json.load(file_json)

In [29]:
# Creating the dataset from json records and fields
df_data_twitter_all = pd.DataFrame.from_records(data_twitter_train).append(pd.DataFrame.from_records(data_twitter_dev)).append(pd.DataFrame.from_records(data_twitter_test))


In [30]:
# Function to retrieve followings of each user
def retrieve_following(x):
    try:
        f = x['following']
        if f == []:
            f = [-999]
        else:
            f = f
    except:
        f = [-999]
    return f

In [31]:
# Getting the profile features for each user and the label 
df_data_twitter_all_features = pd.DataFrame.from_dict(df_data_twitter_all['profile'].tolist())
df_data_twitter_all_features['label'] =  df_data_twitter_all['label'].reset_index(drop=True)

In [32]:
df_data_twitter_all_features.to_csv('./data/user_features_all.csv')
# Retrieving neigbors for each user
df_neighbors_twitter_all = pd.DataFrame.from_dict(df_data_twitter_all['neighbor'])
df_neighbors_twitter_all['ID'] = pd.DataFrame.from_dict(df_data_twitter_all['ID'])
df_neighbors_twitter_all.dropna(inplace=True)
df_neighbors_twitter_all['neighbor'] = df_neighbors_twitter_all['neighbor'].apply(lambda x: np.array(x['following']))

In [33]:
# Creating a dataframe in wich each row has 2 columns: Source and target. Source is and ID of an user and Target 
# Is the ID of other user which is connected with the source (source follows target)
array_source_target = np.empty((1,2))
for _, user in  df_neighbors_twitter_all.iterrows():
    following = user['neighbor']
    user_ = np.array([user['ID']])
    array_user_following = np.hstack([np.broadcast_to(user_, shape=(len(following), len(user_))), following.reshape(-1,1)])
    #array_user_follower = np.hstack([np.broadcast_to(user_, shape=(len(followers), len(user_))), followers.reshape(-1,1)])
    array_source_target = np.vstack([array_source_target,array_user_following])
df_source_target_all = pd.DataFrame(array_source_target, columns=['source', 'target'])

In [34]:
df_source_target_all.drop(0, axis=0, inplace=True)
df_source_target_all.to_csv('./data/source_target_all.csv')
# Data prep - False:0 and True: 1
df_data_twitter_all_features = df_data_twitter_all_features.replace({'False ':0, 'True ':1})
df_source_target_all = df_source_target_all.astype('int')

In [35]:
# listing the features we are gonna use
list_features_to_convert = ['id', 'protected', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'statuses_count', 'label']
df_data_twitter_all_features[list_features_to_convert] = df_data_twitter_all_features[list_features_to_convert].astype('int')
# Casting all to int
df_data_twitter_all_features_bool = df_data_twitter_all_features.select_dtypes(include=['int'])

In [36]:
# clases
class_values = [0, 1]
# id para as classes
class_idx = {name: id for id, name in enumerate(class_values)}
# id para os papers
user_idx = {int(name): int(idx) for idx, name in enumerate(sorted(df_data_twitter_all_features_bool['id'].tolist()))}
df_data_twitter_all_features_bool['id'] = df_data_twitter_all_features_bool['id'].apply(lambda x: user_idx.get(x, np.nan))
df_source_target_all['target'] = df_source_target_all['target'].apply(lambda x: user_idx.get(x, np.nan))
df_source_target_all['source'] = df_source_target_all['source'].apply(lambda x: user_idx.get(x, np.nan))
df_data_twitter_all_features_bool['label'] = df_data_twitter_all_features_bool['label'].apply(lambda x: class_idx.get(x, np.nan))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_twitter_all_features_bool['id'] = df_data_twitter_all_features_bool['id'].apply(lambda x: user_idx.get(x, np.nan))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_twitter_all_features_bool['label'] = df_data_twitter_all_features_bool['label'].apply(lambda x: class_idx.get(x, np.nan))


In [37]:
df_source_target_all.dropna(inplace=True)
df_source_target_all = df_source_target_all.astype('int')
df_source_target_all.reset_index(drop=True, inplace=True)

In [38]:
df_data_twitter_all_features_bool['Label'] = df_data_twitter_all_features_bool['id']
df_data_twitter_all_features_bool['Id'] = df_data_twitter_all_features_bool['id']
df_data_twitter_all_features_bool[['Id', 'Label']].to_csv('./data/id_label.csv')
df_source_target_all['Type'] = 'Directed'
df_source_target_all['Source'] = df_source_target_all['source']
df_source_target_all['Target'] = df_source_target_all['target']
df_source_target_all[['Source', 'Target', 'Type']].to_csv('./data/source_target_ids_all.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_twitter_all_features_bool['Label'] = df_data_twitter_all_features_bool['id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_twitter_all_features_bool['Id'] = df_data_twitter_all_features_bool['id']


In [39]:
# Obtain random indices
np.random.seed(100)
random_indices = np.random.permutation(range(df_data_twitter_all_features_bool.shape[0]))
n = int(len(random_indices)*0.8)
# 50/50 split
train_data = df_data_twitter_all_features_bool.iloc[random_indices[:n]]
test_data = df_data_twitter_all_features_bool.iloc[random_indices[n:]]

In [40]:
# Obtain paper indices which will be used to gather node states
# from the graph later on when training the model
train_indices = train_data["id"].to_numpy()
test_indices = test_data["id"].to_numpy()

# Obtain ground truth labels corresponding to each paper_id
train_labels = train_data["label"].to_numpy()
test_labels = test_data["label"].to_numpy()

# Define graph, namely an edge tensor and a node feature tensor
edges = tf.convert_to_tensor(df_source_target_all[["source", "target"]])
node_features = tf.convert_to_tensor(df_data_twitter_all_features_bool.sort_values("id").iloc[:, 1:-1])

# Print shapes of the graph
print("Edges shape:\t\t", edges.shape)
print("Node features shape:", node_features.shape)


Edges shape:		 (11890, 2)
Node features shape: (11826, 18)


2021-09-20 09:25:07.303784: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [41]:
 class GraphAttention(layers.Layer):
    def __init__(
        self,
        units,
        kernel_initializer="glorot_uniform",
        kernel_regularizer=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.units = units
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)

    def build(self, input_shape):

        self.kernel = self.add_weight(
            shape=(input_shape[0][-1], self.units),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
        )
        self.kernel_attention = self.add_weight(
            shape=(self.units * 2, 1),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
        )
        self.built = True

    def call(self, inputs):
        node_features, edges = inputs

        # Linearly transform node features (node states)
        node_features_transformed = tf.matmul(node_features, self.kernel)

        # (1) Compute pair-wise attention scores
        node_features_expanded = tf.gather(node_features_transformed, edges)
        node_features_expanded = tf.reshape(
            node_features_expanded, (tf.shape(edges)[0], -1)
        )
        attention_scores = tf.nn.leaky_relu(
            tf.matmul(node_features_expanded, self.kernel_attention)
        )
        attention_scores = tf.squeeze(attention_scores, -1)

        # (2) Normalize attention scores
        attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2))
        attention_scores_sum = tf.math.unsorted_segment_sum(
            data=attention_scores,
            segment_ids=edges[:, 0],
            num_segments=tf.reduce_max(edges[:, 0]) + 1,
        )
        attention_scores_sum = tf.repeat(
            attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32"))
        )
        attention_scores_norm = attention_scores / attention_scores_sum

        # (3) Gather node states of neighbors, apply attention scores and aggregate
        node_features_neighbors = tf.gather(node_features_transformed, edges[:, 1])
        out = tf.math.unsorted_segment_sum(
            data=node_features_neighbors * attention_scores_norm[:, tf.newaxis],
            segment_ids=edges[:, 0],
            num_segments=tf.shape(node_features)[0],
        )
        return out


class MultiHeadGraphAttention(layers.Layer):
    def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.merge_type = merge_type
        self.attention_layers = [GraphAttention(units) for _ in range(num_heads)]

    def call(self, inputs):
        atom_features, pair_indices = inputs

        # Obtain outputs from each attention head
        outputs = [
            attention_layer([atom_features, pair_indices])
            for attention_layer in self.attention_layers
        ]
        # Concatenate or average the node states from each head
        if self.merge_type == "concat":
            outputs = tf.concat(outputs, axis=-1)
        else:
            outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1)
        # Activate and return node states
        return tf.nn.relu(outputs)

In [42]:
class GraphAttention(layers.Layer):
    def __init__(
        self,
        units,
        kernel_initializer="glorot_uniform",
        kernel_regularizer=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.units = units
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)

    def build(self, input_shape):

        self.kernel = self.add_weight(
            shape=(input_shape[0][-1], self.units),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
        )
        self.kernel_attention = self.add_weight(
            shape=(self.units * 2, 1),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
        )
        self.built = True

    def call(self, inputs):
        node_features, edges = inputs

        # Linearly transform node features (node states)
        node_features_transformed = tf.matmul(node_features, self.kernel)

        # (1) Compute pair-wise attention scores
        node_features_expanded = tf.gather(node_features_transformed, edges)
        node_features_expanded = tf.reshape(
            node_features_expanded, (tf.shape(edges)[0], -1)
        )
        attention_scores = tf.nn.leaky_relu(
            tf.matmul(node_features_expanded, self.kernel_attention)
        )
        attention_scores = tf.squeeze(attention_scores, -1)

        # (2) Normalize attention scores
        attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2))
        attention_scores_sum = tf.math.unsorted_segment_sum(
            data=attention_scores,
            segment_ids=edges[:, 0],
            num_segments=tf.reduce_max(edges[:, 0]) + 1,
        )
        attention_scores_sum = tf.repeat(
            attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32"))
        )
        attention_scores_norm = attention_scores / attention_scores_sum

        # (3) Gather node states of neighbors, apply attention scores and aggregate
        node_features_neighbors = tf.gather(node_features_transformed, edges[:, 1])
        out = tf.math.unsorted_segment_sum(
            data=node_features_neighbors * attention_scores_norm[:, tf.newaxis],
            segment_ids=edges[:, 0],
            num_segments=tf.shape(node_features)[0],
        )
        return out


class MultiHeadGraphAttention(layers.Layer):
    def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.merge_type = merge_type
        self.attention_layers = [GraphAttention(units) for _ in range(num_heads)]

    def call(self, inputs):
        atom_features, pair_indices = inputs

        # Obtain outputs from each attention head
        outputs = [
            attention_layer([atom_features, pair_indices])
            for attention_layer in self.attention_layers
        ]
        # Concatenate or average the node states from each head
        if self.merge_type == "concat":
            outputs = tf.concat(outputs, axis=-1)
        else:
            outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1)
        # Activate and return node states
        return tf.nn.relu(outputs)

In [43]:
class GraphAttentionNetwork(keras.Model):
    def __init__(
        self,
        node_features,
        edges,
        hidden_units,
        num_heads,
        num_layers,
        output_dim,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.node_features = node_features
        self.edges = edges
        self.preprocess = layers.Dense(hidden_units * num_heads, activation="relu")
        self.attention_layers = [
            MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers)
        ]
        self.output_layer = layers.Dense(output_dim)

    def call(self, inputs):
        node_features, edges = inputs
        x = self.preprocess(node_features)
        for attention_layer in self.attention_layers:
            x = attention_layer([x, edges]) + x
        outputs = self.output_layer(x)
        return outputs

    def train_step(self, data):
        indices, labels = data

        with tf.GradientTape() as tape:
            # Forward pass
            outputs = self([self.node_features, self.edges])
            # Compute loss
            loss = self.compiled_loss(labels, tf.gather(outputs, indices))
        # Compute gradients
        grads = tape.gradient(loss, self.trainable_weights)
        # Apply gradients (update weights)
        optimizer.apply_gradients(zip(grads, self.trainable_weights))
        # Update metric(s)
        self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))

        return {m.name: m.result() for m in self.metrics}

    def predict_step(self, data):
        indices = data
        # Forward pass
        outputs = self([self.node_features, self.edges])
        # Compute probabilities
        return tf.nn.softmax(tf.gather(outputs, indices))

    def test_step(self, data):
        indices, labels = data
        # Forward pass
        outputs = self([self.node_features, self.edges])
        # Compute loss
        loss = self.compiled_loss(labels, tf.gather(outputs, indices))
        # Update metric(s)
        self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))

        return {m.name: m.result() for m in self.metrics}

In [44]:
# Define hyper-parameters
HIDDEN_UNITS = 100
NUM_HEADS = 8
NUM_LAYERS = 3
OUTPUT_DIM = len(class_values)

NUM_EPOCHS = 100
BATCH_SIZE = 256
VALIDATION_SPLIT = 0.1
LEARNING_RATE = 3e-1
MOMENTUM = 0.9

loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(LEARNING_RATE)
accuracy_fn = keras.metrics.SparseCategoricalAccuracy(name="acc")
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_acc", min_delta=1e-5, patience=5, restore_best_weights=True
)

# Build model
gat_model = GraphAttentionNetwork(
    node_features, edges, HIDDEN_UNITS, NUM_HEADS, NUM_LAYERS, OUTPUT_DIM
)

# Compile model
gat_model.compile(loss=loss_fn, optimizer=optimizer, metrics=[accuracy_fn])

gat_model.fit(
    x=train_indices,
    y=train_labels,
    validation_split=VALIDATION_SPLIT,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    callbacks=[early_stopping],
    verbose=2,
)

_, test_accuracy = gat_model.evaluate(x=test_indices, y=test_labels, verbose=0)

print("--" * 38 + f"\nTest Accuracy {test_accuracy*100:.1f}%")

2021-09-20 09:25:07.605670: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
34/34 - 71s - loss: 2932511604736.0000 - acc: 0.4866 - val_loss: 213604597760.0000 - val_acc: 0.4715
Epoch 2/100
34/34 - 92s - loss: 260003872768.0000 - acc: 0.4725 - val_loss: 899242917888.0000 - val_acc: 0.4746
Epoch 3/100
34/34 - 72s - loss: 42957647872.0000 - acc: 0.4702 - val_loss: 89943695360.0000 - val_acc: 0.5106
Epoch 4/100
34/34 - 68s - loss: 237015728128.0000 - acc: 0.4663 - val_loss: 41093193728.0000 - val_acc: 0.5053
Epoch 5/100
34/34 - 73s - loss: 105348784128.0000 - acc: 0.4733 - val_loss: 104669241344.0000 - val_acc: 0.5233
Epoch 6/100
34/34 - 59s - loss: 100939177984.0000 - acc: 0.4785 - val_loss: 21179029504.0000 - val_acc: 0.5032
Epoch 7/100
34/34 - 60s - loss: 18365208576.0000 - acc: 0.4777 - val_loss: 3736955904.0000 - val_acc: 0.5063
Epoch 8/100
34/34 - 63s - loss: 8580612096.0000 - acc: 0.4792 - val_loss: 7638352384.0000 - val_acc: 0.4810
Epoch 9/100
34/34 - 72s - loss: 6369082880.0000 - acc: 0.4737 - val_loss: 20396111872.0000 - val_acc: 0.5053
Epoch

In [45]:
test_probs = gat_model.predict(x=test_indices)

mapping = {v: k for (k, v) in class_idx.items()}

for i, (probs, label) in enumerate(zip(test_probs[:2], test_labels[:2])):
    print(f"Example {i+1}: {mapping[label]}")
    for j, c in zip(probs, class_idx.keys()):
        print(f"\tProbability of {c: <24} = {j*100:7.3f}%")
    print("---" * 20)

Example 1: 0
	Probability of 0                        = 100.000%
	Probability of 1                        =   0.000%
------------------------------------------------------------
Example 2: 0
	Probability of 0                        =   0.000%
	Probability of 1                        = 100.000%
------------------------------------------------------------


In [None]:
# Ressalvas: Estamos utilizando apenas um conjunto pequeno de features de perfil para validar a 
# aplicação da GAT na detecção de BOTs. Além disso, não otimizamos hiperparâmetros, estamos usando apenas as relações diretas,o que torna
# o problema bastante esparso. 
# Para próximos passos deveremos incluir, além das features de perfil, os tweets como um embedding semântico
# E também a vizinhança nível 2 (amigos dos meus amigos) para tornar o grafo mais denso
# nesse primeiro momento, com a arquitetura disponível para utilização e com nossos dados, nosso classificador
# Está com um erro altíssimo. 