In [None]:
!pip install torchtext==0.10.0
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import pandas as pd
import spacy
import random
from torchtext.data.metrics import bleu_score
from pprint import pprint
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
# Seeding for reproducible results on every run.
# The Python, NumPy and PyTorch (CPU + CUDA) generators are all seeded because
# the notebook draws from each of them (e.g. `random.random()` decides teacher
# forcing inside Seq2Seq.forward).
SEED = 777

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Class id -> clause category name; the position in the list is the id.
id2label = dict(enumerate([
    "Limitation of liability",
    "Unilateral termination",
    "Unilateral change",
    "Content removal",
    "Contract by using",
    "Choice of law",
    "Jurisdiction",
    "Arbitration",
]))

# Reverse lookup: category name -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
# Load the small English spaCy pipeline once; tokenize_english uses its tokenizer.
spacy_english = spacy.load('en_core_web_sm')

In [None]:
def tokenize_english(text):
  """Split `text` with the spaCy English tokenizer and return the token strings as a list."""
  spacy_tokens = spacy_english.tokenizer(text)
  return list(map(lambda tok: tok.text, spacy_tokens))

### Sample Run ###
# Sanity check of the tokenizer on a short sentence; the token list is printed.

sample_text = "I love machine learning"
print(tokenize_english(sample_text))

['I', 'love', 'machine', 'learning']


In [None]:
from torchtext.legacy import data
# SRC = unfair sentence
# Tokenized with tokenize_english, lower-cased, and wrapped in <sos>/<eos>.
SRC = Field(tokenize = tokenize_english, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# TRG = labels as a single string with comma as delimiter
# Configured exactly like SRC, so the label string is tokenized the same way.
TRG = Field(tokenize = tokenize_english, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# Attribute names ('src', 'trg') under which each example exposes its two fields.
fields = [('src', SRC), ('trg', TRG)]

# Read the three CSV splits from /content/, skipping the header row of each file.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '/content/',
                                        train = 'train_alphabetical.csv',
                                        validation = 'val.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

# build the vocabulary for the source (TEXT) and target (LABELS) languages
# Both vocabularies come from the training split only; source tokens must
# occur at least twice (min_freq = 2) to be kept.
SRC.build_vocab(train_data, max_size=10000, min_freq = 2)
TRG.build_vocab(train_data, max_size=10000)

print(f"Unique tokens in source (text) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (label) vocabulary: {len(TRG.vocab)}")

In [None]:
# dir(english.vocab)

# Dump the attributes stored on the target vocabulary: first the attribute
# names, then all values at once, then each value on its own line.
print(vars(TRG.vocab).keys())
e = list(vars(TRG.vocab).values())
print(e)
for i in e:
  print(i)

In [None]:
# Token <-> index lookup tables for the target (label) vocabulary.
# `stoi` is read by name rather than as the 4th entry of the vocabulary's
# __dict__ values: the positional form silently breaks if the attribute order
# changes and only works after the inspection cell has defined `e`.
word_2_idx = dict(TRG.vocab.stoi)
idx_2_word = {index: token for token, index in word_2_idx.items()}

In [None]:
# Size of each split.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# Show the attribute names and tokenized contents of one training example (index 5).
print(train_data[5].__dict__.keys())
pprint(train_data[5].__dict__.values())

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

# One iterator per split; examples are sorted by source length inside each
# batch and the batch tensors are placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      sort_within_batch=True,
                                                                      sort_key=lambda x: len(x.src),
                                                                      device = device)

In [None]:
# Record the token count of every training example (text and label side) and
# echo the first ten examples.
# The loop variable is intentionally not named `data`: that name is the
# `torchtext.legacy.data` module used to load the dataset, and rebinding it to
# an example would break a later re-run of any code that needs the module.
max_len_lbl = []
max_len_text = []
for count, example in enumerate(train_data):
  max_len_text.append(len(example.src))
  max_len_lbl.append(len(example.trg))
  if count < 10 :
    print("Text - ",*example.src, " Length - ", len(example.src))
    print("Labels - ",*example.trg, " Length - ", len(example.trg))
    print()

print("Maximum Length of Lbl sentence {} and Text sentence {} in the dataset".format(max(max_len_lbl),max(max_len_text)))
print("Minimum Length of Lbl sentence {} and Text sentence {} in the dataset".format(min(max_len_lbl),min(max_len_text)))

In [None]:
# Inspect a single training batch.
# Only the first batch is needed, so it is fetched directly instead of
# iterating over the whole training set and ignoring everything after the
# first step. The batch is not bound to the name `data`, which is the
# `torchtext.legacy.data` module.
first_batch = next(iter(train_iterator))

print("Shapes", first_batch.src.shape, first_batch.trg.shape)
print()
print("Text - ",*first_batch.src, " Length - ", len(first_batch.src))
print()
print("Label - ",*first_batch.trg, " Length - ", len(first_batch.trg))

# Kept for the inspection tables built in the following cells.
temp_text = first_batch.src
temp_label= first_batch.trg

In [None]:
# Copy the sampled batch tensors to host memory as NumPy arrays for display with pandas.
temp_label_idx = (temp_label).cpu().detach().numpy()
temp_text_idx = (temp_text).cpu().detach().numpy()

In [None]:
# Table of target token indices for the sampled batch: one row per time step,
# one column (S_1, S_2, ...) per sentence.
# The column count is taken from the array itself; a hard-coded 32 raises a
# shape error whenever the sampled batch holds fewer sentences than BATCH_SIZE
# (for example a remainder batch).
df_label_idx = pd.DataFrame(
    data = temp_label_idx,
    columns = ["S_" + str(n) for n in range(1, temp_label_idx.shape[1] + 1)])
df_label_idx.index.name = 'Time Steps'
df_label_idx.index = df_label_idx.index + 1
df_label_idx

In [None]:
# Same table as df_label_idx with every vocabulary index replaced by its token.
# (The empty 32-column frame that used to be created first was overwritten
# immediately and has been dropped.)
df_label_word = df_label_idx.replace(idx_2_word)
df_label_word

In [None]:
class EncoderLSTM(nn.Module):
  """Encoder of the seq2seq model: embeds source token ids and runs a stacked LSTM over them.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used both on the embeddings and between LSTM layers.
  """
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()

    # Plain attributes kept for inspection.
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Dropout applied to the embedded tokens.
    self.dropout = nn.Dropout(p)
    self.tag = True

    # Lookup table of shape [input_size, embedding_size].
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Stacked LSTM: embedding_size inputs -> hidden_size features, num_layers deep.
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

  def forward(self, x):
    """Encode a batch of token ids.

    Args:
      x: integer tensor of shape [sequence_length, batch_size].

    Returns:
      (hidden_state, cell_state), each of shape [num_layers, batch_size, hidden_size].
      The per-step LSTM outputs are discarded.
    """
    # [sequence_length, batch_size, embedding_size]
    embedded = self.dropout(self.embedding(x))

    # Only the final states are handed on to the decoder.
    _, final_states = self.LSTM(embedded)
    hidden_state, cell_state = final_states

    return hidden_state, cell_state

# Encoder hyperparameters; the embedding table gets one row per source-vocabulary token.
input_size_encoder = len(SRC.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

# Build the encoder, move it to the selected device and print its layer summary.
encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

In [None]:
class DecoderLSTM(nn.Module):
  """Decoder of the seq2seq model: predicts one target token per call.

  Args:
    input_size: number of rows in the embedding table (target vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used both on the embeddings and between LSTM layers.
    output_size: number of scores produced per step (one per target token).
  """
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super().__init__()

    # Plain attributes kept for inspection.
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size

    # Dropout applied to the embedded input token.
    self.dropout = nn.Dropout(p)

    # Lookup table of shape [input_size, embedding_size].
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Stacked LSTM: embedding_size inputs -> hidden_size features, num_layers deep.
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

    # Projects each LSTM output (hidden_size) onto the output_size token scores.
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden_state, cell_state):
    """Run a single decoding step.

    Args:
      x: integer tensor of shape [batch_size] holding the previous token ids.
      hidden_state: tensor of shape [num_layers, batch_size, hidden_size].
      cell_state: tensor of shape [num_layers, batch_size, hidden_size].

    Returns:
      (predictions, hidden_state, cell_state) where predictions has shape
      [batch_size, output_size] and the states are the updated LSTM states.
    """
    # Add a sequence axis of length 1: [1, batch_size]
    step_input = x.unsqueeze(0)

    # [1, batch_size, embedding_size]
    embedded = self.dropout(self.embedding(step_input))

    # outputs: [1, batch_size, hidden_size]; the states keep their shape.
    outputs, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

    # Score every target token, then drop the length-1 sequence axis.
    predictions = self.fc(outputs).squeeze(0)

    return predictions, hidden_state, cell_state

# Decoder hyperparameters; both the embedding table and the output layer are
# sized by the target (label) vocabulary.
input_size_decoder = len(TRG.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(TRG.vocab)

# Build the decoder, move it to the selected device and print its layer summary.
decoder_lstm = DecoderLSTM(input_size_decoder, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)
print(decoder_lstm)

In [None]:
# Print the tensor shapes of the first training batch, then stop iterating.
for batch in train_iterator:
  print(batch.src.shape)
  print(batch.trg.shape)
  break

# Row 1 of the target tensor, i.e. the token ids at the second time step for every sentence.
x = batch.trg[1]
print(x)

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence one step at a time.

  Args:
    Encoder_LSTM: module mapping a source tensor to (hidden_state, cell_state).
    Decoder_LSTM: module mapping (token ids, hidden_state, cell_state) to
      (scores, hidden_state, cell_state). It must expose `output_size`, the
      number of scores it produces per step.
  """
  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    """Decode `target` from `source`, optionally with teacher forcing.

    Args:
      source: integer tensor of shape [source_length, batch_size].
      target: integer tensor of shape [target_length, batch_size]; row 0 is
        used as the first decoder input.
      tfr: teacher-forcing ratio, the probability of feeding the ground-truth
        token (rather than the model's own prediction) into the next step.

    Returns:
      Tensor of shape [target_length, batch_size, output_size]. Row 0 is left
      at zero because decoding starts at step 1.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]

    # Take the vocabulary size and the device from the model's own inputs and
    # sub-modules instead of the notebook-level `TRG` / `device` variables, so
    # the module does not depend on hidden global state.
    target_vocab_size = self.Decoder_LSTM.output_size
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    # Final encoder states initialise the decoder.
    hidden_state, cell_state = self.Encoder_LSTM(source)

    # First decoder input: row 0 of the target ([batch_size] token ids).
    x = target[0]

    for i in range(1, target_len):
      # output: [batch_size, output_size]
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      # Highest-scoring token for every sentence in the batch.
      best_guess = output.argmax(1)
      # Teacher forcing: feed the ground-truth token with probability tfr,
      # otherwise the model's own prediction.
      x = target[i] if random.random() < tfr else best_guess

    return outputs


In [None]:
# Hyperparameters

learning_rate = 0.001
# TensorBoard writer for the per-step training loss; `step` counts optimizer updates.
writer = SummaryWriter(f"runs/loss_plot")
step = 0

# Wrap the encoder and decoder into one model and optimise all of its parameters with Adam.
model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions in the target are excluded from the cross-entropy loss.
pad_idx = TRG.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Display the assembled model (encoder + decoder layers).
model

In [None]:
def translate_sentence(model, sentence, SRC, TRG, device, max_length=50):
    """Greedily decode the label sequence for one source sentence.

    Args:
        model: object exposing `Encoder_LSTM` and `Decoder_LSTM` callables.
        sentence: raw string, or an already tokenized list of strings.
        SRC: source field; provides `init_token`, `eos_token` and `vocab.stoi`.
        TRG: target field; provides `vocab.stoi` and `vocab.itos`.
        device: device on which the input tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the leading "<sos>". The list
        ends with "<eos>" only if the model produced it within `max_length`
        steps.
    """
    # Tokenize strings with the notebook's shared tokenizer (the one the SRC
    # field was built with) instead of loading a fresh spaCy pipeline on every
    # call, which made each translation needlessly slow.
    if isinstance(sentence, str):
        tokens = [token.lower() for token in tokenize_english(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens = [SRC.init_token, *tokens, SRC.eos_token]

    # Column vector of token ids: [sequence_length, 1] (batch of one).
    text_to_indices = [SRC.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = TRG.vocab.stoi["<eos>"]
    outputs = [TRG.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    # Drop the leading <sos> and map ids back to tokens.
    return [TRG.vocab.itos[idx] for idx in outputs[1:]]

def bleu(data, model, SRC, TRG, device):
    """Compute the corpus BLEU score of the model's predictions over `data`.

    Args:
        data: iterable of examples exposing tokenized `src` and `trg` attributes.
        model, SRC, TRG, device: forwarded unchanged to `translate_sentence`.

    Returns:
        The value returned by `bleu_score` for all predictions against their
        single reference each.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, SRC, TRG, device)
        # Strip the end-of-sequence marker only when it is actually there:
        # a prediction cut off at max_length has no <eos>, and slicing it
        # unconditionally would throw away a real token.
        if prediction and prediction[-1] == TRG.eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    """Write a training checkpoint to /content/.

    Two files are produced: `checkpoint-NMT` holds the model object together
    with the best loss, epoch, torch RNG state and optimizer state;
    `checkpoint-NMT-SD` holds only the model's state dict.
    `epoch_loss` is accepted but not used.
    """
    print('saving')
    print()

    checkpoint = {
        'model': model,
        'best_loss': best_loss,
        'epoch': epoch,
        'rng_state': torch.get_rng_state(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(checkpoint, '/content/checkpoint-NMT')

    # Weights-only copy alongside the full checkpoint.
    weights = model.state_dict()
    torch.save(weights, '/content/checkpoint-NMT-SD')

In [None]:
# Training loop with checkpointing on the best epoch and early stopping.
epoch_loss = 0.0
num_epochs = 100
best_loss = 999999
best_epoch = -1
# Fixed probe sentence, decoded before every epoch to watch the model's progress.
sentence1 = '''you agree that this agreement and the relationship between you and 
linden lab shall be governed by the laws of the state of california without regard 
to conflict of law principles or the united nations convention on the 
international sale of goods .   '''

# Prediction for the probe sentence, one entry per epoch.
ts1  = []

for epoch in range(num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, SRC, TRG, device, max_length=50)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  # Restart the running total every epoch. Without the reset the total keeps
  # growing across epochs, so it could never drop below the first epoch's
  # value and no later improvement was ever checkpointed.
  epoch_loss = 0.0
  for batch_idx, batch in enumerate(train_iterator):
    # Named `source` rather than `input` to avoid shadowing the builtin.
    source = batch.src.to(device)
    target = batch.trg.to(device)

    # Pass the input and target for model's forward method
    output = model(source, target)
    # Drop step 0 (never predicted) and flatten to [steps * batch, vocab] / [steps * batch].
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()

    # Calculate the loss value for this batch
    loss = criterion(output, target)

    # Calculate the gradients for weights & biases using back-propagation
    loss.backward()

    # Clip the gradient norm if it exceeds 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights values using the gradients we calculated using bp
    optimizer.step()
    step += 1
    epoch_loss += loss.item()
    writer.add_scalar("Training loss", loss.item(), global_step=step)

  # Report the mean batch loss of the epoch (previously the last batch's loss).
  print("Epoch_Loss - {}".format(epoch_loss / len(train_iterator)))
  print()

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)
  elif (epoch - best_epoch) >= 10:
    # This check used to sit inside the improvement branch, right after
    # best_epoch was set to the current epoch, so it could never trigger.
    print("no improvement in 10 epochs, break")
    break

print(epoch_loss / len(train_iterator))

# Evaluate with dropout disabled; the loop above leaves the model in training mode.
model.eval()
# NOTE(review): the slice [1:100] skips the first test example — confirm this is intended.
# NOTE(review): this scores the final weights, not the best checkpoint saved above.
score = bleu(test_data[1:100], model, SRC, TRG, device)
print(f"Bleu score {score*100:.2f}")

# 11. Seq2Seq Model Inference

In [None]:
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Join each per-epoch token list from the training loop back into one string.
progress = [TreebankWordDetokenizer().detokenize(sen) for sen in ts1]
print(progress)

In [None]:
# One row per entry of `progress`; saved to CSV and previewed.
progress_df = pd.DataFrame(data = progress, columns=['Predicted_Sentence'])
progress_df.index.name = "Epochs"
progress_df.to_csv('/content/predicted_sentence.csv')
progress_df.head()

In [None]:
# Generate a label string for every validation sentence.
model.eval()
val_df = pd.read_csv('/content/val.csv')
test_sentences = val_df['text'].to_numpy()
actual_sentences = val_df['labels'].to_numpy()

# Predictions are collected in their own list rather than appended to
# `progress`, which holds the per-epoch probe outputs; mixing the two would
# corrupt progress_df if that cell were re-run.
detokenizer = TreebankWordDetokenizer()
pred_sentences = []

for idx, sentence in enumerate(test_sentences):
  translated_sentence = translate_sentence(model, sentence, SRC, TRG, device, max_length=50)
  pred_sentences.append(detokenizer.detokenize(translated_sentence))
  # Light progress report instead of one printed line per sentence.
  if (idx + 1) % 100 == 0:
    print(idx + 1)

# Build the frame once from the collected columns; adding rows one by one
# with .loc re-allocates the frame on every insertion.
final_df = pd.DataFrame({'Generated_Text': pred_sentences,
                         'Actual_Text': actual_sentences},
                        columns=['Generated_Text', 'Actual_Text'])


In [None]:
# Preview up to the first 300 generated/actual label pairs.
final_df.head(300)

In [None]:
# Class id -> clause category name, now including the "No violation" class
# (id 8). This replaces the eight-entry mapping defined near the top.
id2label = dict(enumerate([
    "Limitation of liability",
    "Unilateral termination",
    "Unilateral change",
    "Content removal",
    "Contract by using",
    "Choice of law",
    "Jurisdiction",
    "Arbitration",
    "No violation",
]))

# Reverse lookup: category name -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
# Empty frame that will receive the cleaned predictions and lower-cased targets.
modif_final_df = pd.DataFrame(columns = ['Generated_Text', 'Actual_Text'])
def clean_pred(row_pred):
  """Reduce a generated string to the known labels it contains.

  Every lower-cased label name from `id2label` that occurs as a substring of
  `str(row_pred)` is kept; everything else (e.g. "<eos>") is dropped.

  The kept labels are joined with ", " in alphabetical order. Emitting them in
  `id2label` order instead made a correct multi-label prediction such as
  "unilateral change, unilateral termination" come out reversed, so it failed
  the exact string comparison used for scoring.
  Assumes the reference label strings list their labels alphabetically, as the
  training file name `train_alphabetical.csv` suggests — TODO confirm for val.csv.

  Returns:
    The joined label string, or '' when no known label is found.
  """
  prediction = str(row_pred)
  known_labels = [label.lower() for label in id2label.values()]
  matched = sorted(label for label in known_labels if label in prediction)
  return ', '.join(matched)

# Normalise both columns so they can be compared as strings: predictions are
# reduced to known label names, targets are lower-cased.
modif_final_df['Generated_Text'] = final_df['Generated_Text'].apply(clean_pred)
modif_final_df['Actual_Text'] = final_df['Actual_Text'].apply(lambda x: x.lower())

In [None]:
# Preview up to the first 100 cleaned prediction/target pairs.
modif_final_df.head(100)

In [None]:
# Evaluate performance
# The cleaned strings are passed to f1_score as-is, so each distinct label
# string is its own class and a row counts as correct only when the predicted
# string equals the target string exactly.
from sklearn import metrics
val_preds = modif_final_df['Generated_Text'].to_numpy()
val_targets = modif_final_df['Actual_Text'].to_numpy()

f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Per-label breakdown: select the rows whose original (not lower-cased) target
# string equals `label` exactly, then score the cleaned predictions on that subset.
# NOTE(review): rows whose target lists several labels never equal a single
# label name, so they are left out of every subset — confirm this is intended.
for label in list(id2label.values()):
  copy_df = modif_final_df[final_df.Actual_Text == label]

  test_preds = copy_df['Generated_Text'].to_numpy()
  test_targets = copy_df['Actual_Text'].to_numpy()

  f1_score_micro = metrics.f1_score(test_targets, test_preds, average='micro')
  f1_score_macro = metrics.f1_score(test_targets, test_preds, average='macro')
  print(f"F1 Score (Micro) {label} = {f1_score_micro}")
  print(f"F1 Score (Macro) {label} = {f1_score_macro}")
  print()

In [None]:
# Hand-picked clauses and their expected labels for a quick qualitative check.
test_sentences = [
    "you acknowledge and agree that , by accessing or using the site or services or by downloading or posting any content from or on the site or through the services , you are indicating that you have read , and that you understand and agree to be bound by , these terms , whether or not you have registered on or through the site . ",
    "if we believe , in our sole discretion , that any member of academia.edu or academia premium or other academia.edu paid services is in breach of our terms , or act outside of the letter or spirit of our terms , we reserve the right to add limitations to your access to www.academia.edu , up to and including terminating all access to www.academia.edu . ",
    "by accessing or using the site or services you represent and warrant that you are 13 years of age or older . ",
    'in this case , the member in question is not eligible for any refunds on any portion of their subscription payment .',
]
actual_sentences = [
    "Contract by using",
    "Unilateral change, Unilateral termination",
    "Contract by using",
    'No violation',
]

# Decode each clause and print the expected label next to the generated one.
for idx, i in enumerate(test_sentences):
  translated_sentence = translate_sentence(model, i, SRC, TRG, device, max_length=50)
  detokenized = TreebankWordDetokenizer().detokenize(translated_sentence)
  progress.append(detokenized)
  print("Actual Label: {}".format(actual_sentences[idx]))
  print("Predicted Label : {}".format(progress[-1]))
  print()