In [None]:
!pip install numpy pandas collections
!pip install pickle
!pip install nltk
!pip install tensorflow
!pip install scikit-learn

[31mERROR: Could not find a version that satisfies the requirement collections (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for collections[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m


In [None]:
import pandas as pd
import numpy as np
import pickle
import math

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.metrics import matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package cannot be imported, so skip the mount and rely on a local
# base_dir_path instead of crashing.
try:
  from google.colab import drive
except ImportError:
  print('google.colab is not available - skipping Google Drive mount.')
else:
  drive.mount('/content/drive')

Mounted at /content/drive


## Define the paths to be used

In [None]:
# Every input and output file is expected to live in a single folder.
# On Google Drive, point base_dir_path at that folder (keep the trailing '/',
# because file names are appended directly); if the files sit next to the
# notebook, use base_dir_path = ''.

base_dir_path = '/content/drive/My Drive/NLU/cw/'

# Input data, trained model and fitted tokenizers
dataset_path = f'{base_dir_path}test.csv'
siamese_model_path = f'{base_dir_path}siamese_model_B.keras'
char_tokenizer_path = f'{base_dir_path}char_tokenizer_B.pkl'
word_tokenizer_path = f'{base_dir_path}word_tokenizer_B.pkl'

# Where the predictions CSV is written
predictions_output_path = f'{base_dir_path}Group_62_B.csv'

The model can be found at https://livemanchesterac-my.sharepoint.com/:u:/g/personal/ismail_albrashdi_student_manchester_ac_uk/EbwaeE2pd1pMn69tY75q7OMBxfkiKGlRmrsoJSXy9urd0Q?e=V9UxbZ

The tokenizers were uploaded to Blackboard together with the notebook.

## Define the helper functions needed

In [None]:
def load_tokenizer(path):
  """Deserialize and return the pickled object stored at `path`.

  Parameters:
      path: Location of the pickle file; it is opened in binary read mode.

  Returns:
      Whatever object `pickle.load` reconstructs from the file.

  Note:
      Unpickling can execute code embedded in the file, so only load
      tokenizer files that come from a trusted source.
  """
  with open(path, 'rb') as pickle_file:
    return pickle.load(pickle_file)


def preprocess_text(df_column, char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length):
  """Turn raw documents into fixed-shape character- and word-index arrays.

  Each document is split into sentences (`sent_tokenize`), each sentence into
  words (`word_tokenize`) and each word into characters. Tokens are mapped to
  integer ids through the tokenizers' `word_index` dictionaries; tokens that
  are missing from the dictionary map to 0, which is also the padding value.
  Sentences, words and characters beyond the given maxima are dropped
  (post-truncation) and shorter sequences are zero-padded at the end.

  The output arrays are allocated up front with their full shape, so the
  result has the same rank and shape even when the column is empty or a
  document contains no sentences/words.

  Parameters:
      df_column: Iterable of document strings (e.g. a DataFrame column).
      char_tokenizer: Object exposing a `word_index` dict of char -> id.
      word_tokenizer: Object exposing a `word_index` dict of word -> id.
      max_sent_length: Number of sentences kept per document.
      max_word_length: Number of words kept per sentence.
      max_char_length: Number of characters kept per word.

  Returns:
      (char_data_padded, word_data_padded): int32 arrays of shape
      (n_docs, max_sent_length, max_word_length, max_char_length) and
      (n_docs, max_sent_length, max_word_length) respectively.
  """
  documents = list(df_column)
  char_index = char_tokenizer.word_index
  word_index = word_tokenizer.word_index

  # Zero-initialised buffers: 0 doubles as padding and as the unknown-token id
  char_data_padded = np.zeros(
      (len(documents), max_sent_length, max_word_length, max_char_length), dtype='int32')
  word_data_padded = np.zeros(
      (len(documents), max_sent_length, max_word_length), dtype='int32')

  for doc_idx, document in enumerate(documents):
    # Truncate early: anything past the maxima would be discarded anyway
    sentences = sent_tokenize(document)[:max_sent_length]
    for sent_idx, sentence in enumerate(sentences):
      words = word_tokenize(sentence)[:max_word_length]
      if not words:
        continue

      word_ids = [word_index.get(word, 0) for word in words]
      word_data_padded[doc_idx, sent_idx, :len(word_ids)] = word_ids

      for word_idx, word in enumerate(words):
        char_ids = [char_index.get(char, 0) for char in word[:max_char_length]]
        if char_ids:
          char_data_padded[doc_idx, sent_idx, word_idx, :len(char_ids)] = char_ids

  return char_data_padded, word_data_padded


# Same custom attention layer used for training
class Attention(layers.Layer):
  """
  Attention pooling layer: collapses axis 1 of the input into a weighted sum.

  Weights (created in `build`, both trainable, 'random_normal' initialised):
      W: matrix of shape (feature_dim, feature_dim), where feature_dim is the
         last dimension of the input shape. Projects the input before the
         tanh non-linearity.
      v: matrix of shape (feature_dim, 1). Reduces each projected vector to a
         single raw attention score.

  Call:
      x: input tensor. The layer contracts axis 2 of `x` with W and
         normalises/sums over axis 1, so it expects a rank-3 input
         (presumably (batch, steps, features), e.g. the output of a
         sequence-returning recurrent layer - confirm against the model).

  Returns:
      The sum over axis 1 of `x` weighted by the softmax-normalised attention
      scores, i.e. a tensor with axis 1 removed.
  """
  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def build(self, input_shape):
    feature_dim = input_shape[-1]
    # Projection matrix W^(a)
    self.W = self.add_weight(
        name='attention_weight',
        shape=(feature_dim, feature_dim),
        initializer='random_normal',
        trainable=True,
    )
    # Scoring vector v^(a)
    self.v = self.add_weight(
        name='attention_score_vector',
        shape=(feature_dim, 1),
        initializer='random_normal',
        trainable=True,
    )
    super().build(input_shape)

  def call(self, x):
    # u = tanh(x . W): project every step's feature vector
    projected = tf.tanh(tf.tensordot(x, self.W, axes=[2, 0]))
    # One raw score per step: u . v
    raw_scores = tf.tensordot(projected, self.v, axes=[2, 0])
    # Normalise the scores across axis 1 so they sum to one (alpha)
    attention_weights = tf.nn.softmax(raw_scores, axis=1)
    # Attention-weighted sum of the inputs along axis 1
    return tf.reduce_sum(x * attention_weights, axis=1)

## Define the prediction function

In [None]:
def predict_model_B(dataset_path, model_path, char_tokenizer_path, word_tokenizer_path, output_predictions_path,
                    max_sent_length=30, max_word_length=50, max_char_length=20, batch_size=64, threshold=0.5):
  """Run the saved siamese model on a CSV of text pairs and write 0/1 predictions.

  Parameters:
      dataset_path: CSV file containing the columns 'text_1' and 'text_2'.
      model_path: Saved Keras model; loaded with the custom `Attention` layer.
      char_tokenizer_path: Pickled character-level tokenizer.
      word_tokenizer_path: Pickled word-level tokenizer.
      output_predictions_path: Destination CSV (single 'prediction' column,
          no index column).
      max_sent_length: Maximum number of sentences per document.
      max_word_length: Maximum number of words per sentence.
      max_char_length: Maximum number of characters per word.
      batch_size: Batch size passed to `model.predict`.
      threshold: Model outputs strictly greater than this become 1, else 0.

  The defaults for the length limits are the values that were used for
  training; change them only together with a model trained accordingly.

  Returns:
      None. The predictions are written to `output_predictions_path`.
  """
  # Load the model and the tokenizers.
  # NOTE(review): safe_mode=False permits unsafe deserialization and the
  # tokenizers are unpickled - only use model/tokenizer files you trust.
  siamese_model = load_model(model_path, custom_objects={'Attention': Attention}, safe_mode=False)
  char_tokenizer = load_tokenizer(char_tokenizer_path)
  word_tokenizer = load_tokenizer(word_tokenizer_path)

  # Load the dataset; astype(str) also turns missing values into the
  # literal string 'nan' so the tokenizers always receive text
  df = pd.read_csv(dataset_path)
  text_1 = df['text_1'].astype(str)
  text_2 = df['text_2'].astype(str)

  # Prepare the input data (both texts go through the same preprocessing)
  char_data_1, word_data_1 = preprocess_text(text_1, char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length)
  char_data_2, word_data_2 = preprocess_text(text_2, char_tokenizer, word_tokenizer, max_sent_length, max_word_length, max_char_length)

  # Use the model to predict; input order is char/word of text 1, then char/word of text 2
  predictions = siamese_model.predict(
      [char_data_1, word_data_1, char_data_2, word_data_2],
      batch_size=batch_size,
  )

  # Convert the predictions to 0s and 1s
  binary_predictions = (predictions > threshold).astype(int)

  # Save the predictions to a CSV file
  predictions_df = pd.DataFrame(binary_predictions, columns=['prediction'])
  predictions_df.to_csv(output_predictions_path, index=False)

## Call the prediction function with all the required parameters

In [None]:
# Run inference with the paths configured above and write the predictions CSV
predict_model_B(
    dataset_path=dataset_path,
    model_path=siamese_model_path,
    char_tokenizer_path=char_tokenizer_path,
    word_tokenizer_path=word_tokenizer_path,
    output_predictions_path=predictions_output_path,
)

