In [None]:
!pip install numpy pandas collections
!pip install pickle
!pip install nltk
!pip install tensorflow
!pip install scikit-learn
!pip install fasttext

[31mERROR: Could not find a version that satisfies the requirement collections (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for collections[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227139 sha256=ed189dee4d91ef5f84176494a1e75a52c34f8e81fd7afa7b71785d4cdb1e8275
  Stored in director

In [None]:
import string
import numpy as np
import pandas as pd
import math
from collections import Counter
import pickle

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.layers import Input, Embedding, Conv1D, LSTM, Dense, Flatten, Subtract, Bidirectional, GlobalMaxPooling1D, TimeDistributed, Lambda, Concatenate, Layer, Activation, Softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import load_model

from sklearn.metrics import matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import fasttext


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# base_dir_path = '/content/drive/My Drive/NLU/cw/'

base_dir_path = ''
training_data_path = base_dir_path + 'train.csv'
development_data_path = base_dir_path + 'dev.csv'

Mounted at /content/drive


##Helper functions


In [None]:
def load_fasttext_embeddings(embeddings_path, word_index, embedding_dim=300):
  """
  Loads FastText word embeddings. This function constructs an embedding matrix
  that is used to initialise the weights in the embedding layer of the neural
  network model.

  param embeddings_path: The path to the FastText embeddings file.
  param word_index: A dictionary mapping words to their indices in the embedding
                    matrix.
  param embedding_dim: The dimensionality of the word vectors.

  return: Embedding matrix
  """
  # Initialize the embedding matrix
  embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

  # Open the FastText embeddings file
  with open(embeddings_path, 'r', encoding='utf-8') as f:
      for line in f:
          values = line.split()
          word = values[0]
          if word in word_index:
              vector = np.asarray(values[1:], dtype='float32')
              embedding_matrix[word_index[word]] = vector

  return embedding_matrix


def data_generator(df, char_tokenizer, word_tokenizer, batch_size, max_sent_length, max_word_length, max_char_length):
  """
  Generator function for the data. It yields batches of data from the given dataframe to avoid running out of memory.

  param df: the dataframe containing the data.
  param char_tokenizer: the character tokenizer.
  param word_tokenizer: the word tokenizer.
  param batch_size: the batch size.
  param max_sent_length: the maximum number of sentences allowed per document.
  param max_word_length: the maximum number of words allowed per sentence.
  param max_char_length: the maximum number of characters allowed per word.

  yield batches of data.
  """
  num_samples = len(df)
  while True:
    for offset in range(0, num_samples, batch_size):
      # Get the batch of data
      batch_samples = df.iloc[offset:min(offset + batch_size, num_samples)]

      # Preprocess the text data for the current batch
      char_data_1, word_data_1 = preprocess_text(batch_samples['text_1'], char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length)
      char_data_2, word_data_2 = preprocess_text(batch_samples['text_2'], char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length)

      # Get the labels
      batch_labels = batch_samples['label'].values

      yield [np.array(char_data_1), np.array(word_data_1), np.array(char_data_2), np.array(word_data_2)], np.array(batch_labels)


def save_tokenizer(path, tokenizer):
  with open(path, 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)


def euclidean_distance(vectors):
  """
  Computes the euclidean distance between 2 tensors

  param vectors: a list containing two tensors of same length

  returns: the euclidean distance
  """
  vector1, vector2 = vectors
  sum_square = tf.reduce_sum(tf.square(vector1 - vector2), axis=1, keepdims=True)
  return tf.sqrt(tf.maximum(sum_square, tf.keras.backend.epsilon()))


def inverse_exponential(x):
  return tf.exp(-x)

####Define the preprocessing function

In [None]:
def preprocess_text(df_column, char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length):
  """
  Puts the text samples in the structure required by the model. The function tokenizes text at three
  levels: sentences, words, and characters, and applies padding to standardize lengths at each level.

  param df_column: the column containing the text samples.
  param char_tokenizer: the tokenizer for character-level tokenization.
  param word_tokenizer: the tokenizer for word-level tokenization.
  param max_sent_length: maximum number of sentences allowed per document.
  param max_word_length: maximum number of words allowed per sentence.
  param max_char_length: maximum number of characters allowed per word.

  Returns:
    - char_data_padded: 4D array where each entry contains the padded character-level representations of words,
      structured as [documents, sentences, words, chars].
    - word_data_padded: 3D array where each entry contains the padded word-level representations of sentences,
      structured as [documents, sentences, words].
  """
  # Initialize empty lists to hold the padded data
  char_data_padded = []
  word_data_padded = []


  for document in df_column:
    # Tokenize the document into sentences, then words, then characters
    sent_tokens = sent_tokenize(document)
    word_tokens = [word_tokenize(sent) for sent in sent_tokens]
    char_tokens = [[list(word) for word in sent] for sent in word_tokens]

    # Convert tokens to sequences using the respective tokenizers
    char_sequences = [[[char_tokenizer.word_index.get(char, 0) for char in word] for word in sent] for sent in char_tokens]
    word_sequences = [[word_tokenizer.word_index.get(word, 0) for word in sent] for sent in word_tokens]

    # Pad sequences to the same length
    char_sequences_padded = pad_sequences([pad_sequences(seq, maxlen=max_char_length, padding='post', truncating='post') for seq in char_sequences], maxlen=max_word_length, padding='post', truncating='post')
    word_sequences_padded = pad_sequences(word_sequences, maxlen=max_word_length, padding='post', truncating='post')

    # Append the padded data to the lists
    char_data_padded.append(char_sequences_padded)
    word_data_padded.append(word_sequences_padded)

  # Pad the lists to have uniform sentence length
  char_data_padded = pad_sequences(char_data_padded, maxlen=max_sent_length, padding='post', truncating='post')
  word_data_padded = pad_sequences(word_data_padded, maxlen=max_sent_length, padding='post', truncating='post')

  return char_data_padded, word_data_padded

##Data preparation

####Load and split the training dataset

In [None]:
df = pd.read_csv(training_data_path)
df['text_1'] = df['text_1'].astype(str)
df['text_2'] = df['text_2'].astype(str)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=37, shuffle=True)

####Fit and save the tokenizers

In [None]:
# Tokenizers
char_tokenizer = Tokenizer(char_level=True)
word_tokenizer = Tokenizer()

# Fit tokenizers
char_tokenizer.fit_on_texts(pd.concat([df['text_1'], df['text_2']], axis=0))
word_tokenizer.fit_on_texts(pd.concat([df['text_1'], df['text_2']], axis=0))

# Save tokenizers
save_tokenizer(base_dir_path + 'char_tokenizer_B.pkl', char_tokenizer)
save_tokenizer(base_dir_path + 'word_tokenizer_B.pkl', word_tokenizer)

####Load FastText word embeddings

In [None]:
word_embedding_matrix = load_fasttext_embeddings(base_dir_path + 'cc.en.300.vec', word_tokenizer.word_index)

##Model Building

####Define the custom attention layer

In [None]:
class Attention(layers.Layer):
  """
  This class implements a simple attention mechanism in a neural network layer.

  Attributes:
      W (tf.Tensor): A trainable weight matrix that transforms the input features before computing
                     attention scores. The shape of W is (feature_dim, feature_dim) where feature_dim
                     is the last dimension of the input.
      v (tf.Tensor): A trainable vector that computes the raw attention scores from the transformed
                     input. It's used to convert the tanh output into a score for each feature across
                     the input sequence.
      build(input_shape): Sets up the weights of the layer based on the shape of the input it will receive.
      call(x): Processes the input 'x' through the attention mechanism, computes attention scores, and
               returns a weighted sum of the input features based on these scores.

  Parameters:
      x (tf.Tensor): The input tensor to the attention layer. This is typically the output of an RNN,
      LSTM, or another layer that processes sequences.

  Returns:
      output (tf.Tensor): A tensor where the input sequences are aggregated (via a weighted sum) based
                          on the learned attention scores. This output tensor typically has shape
                          (batch_size, features) after reducing the sequence dimension.
  """
  def __init__(self, **kwargs):
    super(Attention, self).__init__(**kwargs)

  def build(self, input_shape):
    self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], input_shape[-1]),
                              initializer='random_normal', trainable=True)
    self.v = self.add_weight(name='attention_score_vector', shape=(input_shape[-1], 1),
                              initializer='random_normal', trainable=True)
    super(Attention, self).build(input_shape)

  def call(self, x):
    # u^(w) = tanh(W^(a) h^(w))
    u = tf.tanh(tf.tensordot(x, self.W, axes=[2, 0]))
    # Compute the raw attention scores v^(a)
    scores = tf.tensordot(u, self.v, axes=[2, 0])
    # Turn raw scores into probabilities using softmax (alpha)
    a = tf.nn.softmax(scores, axis=1)
    # Weighted sum of the input sequence
    output = tf.reduce_sum(x * a, axis=1)
    return output

####Define the functions to build the model

In [None]:
def build_document_feature_network(char_vocab_size, word_vocab_size, char_embedding_dim,
                                   word_embedding_dim, max_sent_length, max_word_length,
                                   max_char_length, lstm_units, mlp_units,
                                   word_embedding_matrix, h):
  """
  Builds a branch of the siamese model. Each branch works independently on to extract the document features
  from the input documents (text_1 and text_2).

  The network is based on the ADHOMINEM architecture proposed by Boenninghoff et al 2019:
    - Boenninghoff, B., Hessler, S., Kolossa, D. and Nickel, R.M., 2019, December. Explainable
      authorship verification in social media via attention-based similarity learning. In 2019
      IEEE International Conference on Big Data (Big Data) (pp. 36-45). IEEE.

  In summary:
  - the model uses a CNN + global-max-pooling to exatract information from char embeddeings (to gain insight
    into affixes and suffixes).
  - the output of global-max-pooling is concatenated with the corresponding word embedding to get the word
    representation of the document.
  - the word representation of the document is fed into a BiLSTM then Attention to get the sentence representation
    of the document.
  - the sentence representation of the document is fed into a BiLSTM then Attention to get the document representation.
  - the document representation is fed into a MLP to get the document features.

  TimeDistributed is used to process the inputs in hierarchical order.

  param char_vocab_size: the number of unique characters in the character vocabulary.
  param word_vocab_size: the number of unique words in the word vocabulary.
  param char_embedding_dim: the dimensionality of the character embeddings.
  param word_embedding_dim: the dimensionality of the word embeddings.
  param max_sent_length: the maximum number of sentences allowed per document.
  param max_word_length: the maximum number of words allowed per sentence.
  param max_char_length: the maximum number of characters allowed per word.
  param lstm_units: the number of units in the LSTM layers.
  param mlp_units: the number of units in the MLP layer.
  param word_embedding_matrix: the word embedding matrix.
  param h: the size of the kernel in the CNN.

  return: the document feature network.

  """
  # Input Layers
  char_input = Input(shape=(max_sent_length, max_word_length, max_char_length), dtype='int32', name='char_input')
  word_input = Input(shape=(max_sent_length, max_word_length), dtype='int32', name='word_input')

  # Character Embedding and Convolution
  char_embeddings = TimeDistributed(TimeDistributed(Embedding(input_dim=char_vocab_size, output_dim=char_embedding_dim)))(char_input)

  conv1d_out = TimeDistributed(TimeDistributed(Conv1D(filters=char_embedding_dim, kernel_size=h, activation='tanh')))(char_embeddings)
  char_representations = TimeDistributed(TimeDistributed(GlobalMaxPooling1D()))(conv1d_out)

  # Word Embeddings
  embedding_layer = Embedding(input_dim=word_vocab_size,
                            output_dim=300,
                            weights=[word_embedding_matrix],
                            trainable=False)
  word_embeddings = TimeDistributed(embedding_layer)(word_input)

  # Combine Char and Word Representations
  combined_representations = TimeDistributed(Concatenate())([char_representations, word_embeddings])

  # Word to Sentence Encoding with BiLSTM
  sentence_encoder = TimeDistributed(Bidirectional(LSTM(lstm_units, return_sequences=True)))(combined_representations)
  sentence_attention = TimeDistributed(Attention())(sentence_encoder)

  # Sentence to Document Encoding with BiLSTM
  document_encoder = Bidirectional(LSTM(lstm_units, return_sequences=True))(sentence_attention)
  document_representation = Attention()(document_encoder)

  # Document_representation to document features
  document_features = Dense(mlp_units, activation='tanh')(document_representation)

  model = Model(inputs=[char_input, word_input], outputs=document_features, name='document_feature_network')
  return model

def build_siamese_model(document_feature_network, max_sent_length, max_word_length, max_char_length):
  """
  Builds the siamese model. It defines 4 inputs (2 for each input document), creates 2 branches of the
  document feature network to extract the document features from the input documents, and computes the
  similarity score between the two document features.


  param document_feature_network: the document feature network.
  param max_sent_length: the maximum number of sentences allowed per document.
  param max_word_length: the maximum number of words allowed per sentence.
  param max_char_length: the maximum number of characters allowed per word.

  return: the siamese model.
  """
  # Inputs for two documents
  char_input_1 = Input(shape=(max_sent_length, max_word_length, max_char_length), dtype='int32', name='char_input_1')
  word_input_1 = Input(shape=(max_sent_length, max_word_length), dtype='int32', name='word_input_1')

  char_input_2 = Input(shape=(max_sent_length, max_word_length, max_char_length), dtype='int32', name='char_input_2')
  word_input_2 = Input(shape=(max_sent_length, max_word_length), dtype='int32', name='word_input_2')

  # Generate document features for both documents
  document_features_1 = document_feature_network([char_input_1, word_input_1])
  document_features_2 = document_feature_network([char_input_2, word_input_2])

  # Compute a similarity score between the two document feature vectors
  distance = Lambda(euclidean_distance)([document_features_1, document_features_2])
  similarity_score = Lambda(inverse_exponential)(distance)

  # Construct the Siamese model
  siamese_model = Model(inputs=[char_input_1, word_input_1, char_input_2, word_input_2], outputs=similarity_score, name='siamese_document_network')

  return siamese_model

####Build the model

In [None]:
# Parameters
char_vocab_size = len(char_tokenizer.word_index) + 1      # number of unique chars in the chars tokenizer
word_vocab_size = len(word_tokenizer.word_index) + 1      # number of unique words in the words tokenizer
char_embedding_dim = 100                                  # character embedding dimension
word_embedding_dim = 300                                  # word embedding dimension
lstm_units = 64                                           # number of units in the LSTM layer
mlp_units = 128                                           # number of units in the MLP layer
h = 5                                                     # kernel size for the CNN
max_sent_length = 30                                      # maximum number of sentences per document
max_word_length = 50                                      # maximum number of words per sentence
max_char_length = 20                                      # maximum number of characters per word
batch_size = 64                                           # batch size for training
epochs = 20                                               # number of epochs for training


# Build the document feature network
document_feature_network = build_document_feature_network(char_vocab_size, word_vocab_size, char_embedding_dim,
                                                          word_embedding_dim, max_sent_length, max_word_length,
                                                          max_char_length, lstm_units, mlp_units,
                                                          word_embedding_matrix, h)

# Build the Siamese model
siamese_model = build_siamese_model(document_feature_network, max_sent_length, max_word_length, max_char_length)

# Compile the model
siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

siamese_model.summary()

Model: "siamese_document_network"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input_1 (InputLayer)   [(None, 30, 50, 20)]         0         []                            
                                                                                                  
 word_input_1 (InputLayer)   [(None, 30, 50)]             0         []                            
                                                                                                  
 char_input_2 (InputLayer)   [(None, 30, 50, 20)]         0         []                            
                                                                                                  
 word_input_2 (InputLayer)   [(None, 30, 50)]             0         []                            
                                                                           

## Model training

In [None]:
# Create the training and validation generators
train_generator = data_generator(train_df, char_tokenizer, word_tokenizer, batch_size, max_sent_length, max_word_length, max_char_length)
val_generator = data_generator(val_df, char_tokenizer, word_tokenizer, batch_size, max_sent_length, max_word_length, max_char_length)


# Calculate the steps per epoch for training and validation
steps_per_epoch = math.ceil(len(train_df) / batch_size)
validation_steps = math.ceil(len(val_df) / batch_size)

# Define the callbacks
checkpoint = ModelCheckpoint(base_dir_path + 'best_model_B.keras', save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1, mode='min', min_lr=0.001)

# Train the model
history = siamese_model.fit(
    x=train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=validation_steps,
    callbacks=[checkpoint, early_stopping, reduce_lr]
)

# Save the model
siamese_model.save(base_dir_path + 'siamese_model_B.keras')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 14: early stopping


##Model Evaluation

####Load the development dataset and set up the data generator

In [None]:
# Load the development dataset
development_df = pd.read_csv(development_data_path)
development_df['text_1'] = development_df['text_1'].astype(str)
development_df['text_2'] = development_df['text_2'].astype(str)
dev_labels = development_df['label'].values

# Setup the data generator
development_generator = data_generator(development_df, char_tokenizer, word_tokenizer, batch_size, max_sent_length, max_word_length, max_char_length)

####Use the model to classify the pairs of text from the development dataset

In [None]:
# Predict using the model
predictions = siamese_model.predict(development_generator, steps=math.ceil(len(development_df) / batch_size))

# Set a threshold and use it to map the predictions to 0s and 1s.
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)



####Print the evaluation metrics

In [None]:
# Calculate Matthews Correlation Coefficient
mcc = matthews_corrcoef(dev_labels, binary_predictions)

# Calculate ROC-AUC Score
roc_auc = roc_auc_score(dev_labels, binary_predictions)

# Calculate confusion matrix to get specificity and false positive rate
tn, fp, fn, tp = confusion_matrix(dev_labels, binary_predictions).ravel()

# Specificity = TN / (TN + FP)
specificity = tn / (tn + fp)

# False Positive Rate = FP / (FP + TN)
false_positive_rate = fp / (fp + tn)

# Generate classification report
class_report = classification_report(dev_labels, binary_predictions, target_names=['Different Authors', 'Same Authors'])

print("Matthew's Correlation Coefficient:", mcc)
print("ROC-AUC Score:", roc_auc)
print("Specificity:", specificity)
print("False Positive Rate:", false_positive_rate)
print("Classification Report:")
print(class_report)

Matthew's Correlation Coefficient: 0.34375964015159816
ROC-AUC Score: 0.6714633607851838
Specificity: 0.7069253931080629
False Positive Rate: 0.2930746068919371
Classification Report:
                   precision    recall  f1-score   support

Different Authors       0.66      0.71      0.68      2989
     Same Authors       0.69      0.64      0.66      3011

         accuracy                           0.67      6000
        macro avg       0.67      0.67      0.67      6000
     weighted avg       0.67      0.67      0.67      6000

