In [None]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# (e.g. the dataset CSV read further down) are reachable from this runtime.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Installing Dependencies

!pip install xformers
!pip install transformers[torch]
!pip install datasets
!pip install  torchtext

Collecting xformers
  Downloading xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl (218.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.2/218.2 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.2.0 (from xformers)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->xformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->xformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m5.0 MB/s

In [None]:
# Loading Required Libraries

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import time
import copy

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import math
import random
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import gc
import torchtext
from tqdm import tqdm

# Misc.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Hyperparameter Values for Training

batch_size = 16
MAX_LEN = 160
RANDOM_SEED = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 20

# Seed every random number generator used by the notebook. Without this,
# RANDOM_SEED only affects train_test_split, while weight initialisation,
# dropout and DataLoader shuffling change from run to run.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


In [None]:
# Checking device type

device

device(type='cuda')

# Dataset

In [None]:
# Loading the dataset from drive into a Pandas dataframe

df = pd.read_csv("/content/drive/MyDrive/NLP4RE-data/Expanded_PROMISE.csv")

In [None]:
# Checking first 10 rows of the dataframe

df.head(10)

Unnamed: 0,Id,RequirementText,Class,Binary_Label
0,1,The product shall be available during normal b...,A,1
1,2,The product shall be available for use 24 hour...,A,1
2,2,Out of 1000 accesses to the system the system ...,A,1
3,3,The system shall be available for use between ...,A,1
4,3,The system shall achieve 95% up time.,A,1
5,5,The product shall adhere to the corporate onli...,A,1
6,5,The product shall achieve a 98% uptime. The pr...,A,1
7,6,Aside from server failure the software produc...,A,1
8,8,The website shall be available for use 24 hour...,A,1
9,8,The website shall achieve 99.5% up time.,A,1


In [None]:
# Dataframe metadata information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969 entries, 0 to 968
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               969 non-null    int64 
 1   RequirementText  969 non-null    object
 2   Class            969 non-null    object
 3   Binary_Label     969 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 30.4+ KB


In [None]:
# Distribution of the dataframe

df['Binary_Label'].value_counts()

1    525
0    444
Name: Binary_Label, dtype: int64

In [None]:
# Creating a train and validation set from data

df_train, df_val= train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# Distribution of train and validation set

df_train['Binary_Label'].value_counts() , df_val['Binary_Label'].value_counts()

(1    414
 0    361
 Name: Binary_Label, dtype: int64,
 1    111
 0     83
 Name: Binary_Label, dtype: int64)

In [None]:
df2 = df.copy()

def map_label(label):
    """Translate a binary label into its category name.

    Returns 'function' for 0, 'non-function' for 1 and 'unknown' for
    anything else.
    """
    for code, name in ((0, 'function'), (1, 'non-function')):
        if label == code:
            return name
    return 'unknown'

# Apply the mapping function to create a new column
df2['category'] = df2["Binary_Label"].apply(map_label)

In [None]:
# Split df2 with the same test_size and random_state that were used for the
# df_train/df_val split of df.
# NOTE(review): the df.head() preview shows several rows sharing one `Id`; a
# purely random split may place such rows in both train and validation —
# confirm whether a group-aware split is needed.
df_train2, df_val2= train_test_split(df2, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
trg_langs = df_train2["category"].unique().tolist()
trg_langs

['function', 'non-function']

In [None]:
class LangDataset(Dataset):
    """Dataset that turns requirement rows into token-index / label-index pairs.

    Args:
        ds: DataFrame with a ``RequirementText`` column (the sentence) and a
            ``category`` column (the label string).
        trg_langs: list of all label strings; used to build the target vocabulary.
        train_vocab: optional ``(src_vocab, trg_vocab)`` pair. When given it is
            reused as-is (e.g. to encode validation data with the training
            vocabulary); otherwise both vocabularies are built from ``ds`` and
            ``trg_langs``.
    """

    def __init__(self, ds, trg_langs, train_vocab=None):
        self.corpus = ds
        # Keep the labels on the instance so that building the vocabulary does
        # not depend on a module-level variable that happens to share the name.
        self.trg_langs = list(trg_langs)

        if train_vocab:
            self.src_vocab, self.trg_vocab = train_vocab
        else:
            self.src_vocab, self.trg_vocab = self._build_vocab()

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, item):
        """Return ``{'src': [token ids], 'trg': [label id]}`` for row ``item``."""
        row = self.corpus.iloc[item]

        return {
            'src': self.src_vocab.lookup_indices(row.RequirementText.lower().split()),
            'trg': self.trg_vocab.lookup_indices([row.category])
        }

    def _build_vocab(self):
        """Build the source (token) and target (label) vocabularies."""
        # Here one could remove stopwords and use word lemmatisation.
        # Both techniques will reduce the vocab size and hence model size
        # and could also enhance the model's performance.
        #
        # Tokenise every requirement on its own. Concatenating all texts without
        # a separator would fuse the last word of one requirement with the first
        # word of the next one and pollute the vocabulary with tokens that never
        # occur at lookup time.
        token_lists = (
            text.lower().split() for text in self.corpus.RequirementText.dropna()
        )

        src_vocab = build_vocab_from_iterator(token_lists, specials=["<unk>", "<pad>"])
        # Tokens unseen during vocabulary construction map to <unk>.
        src_vocab.set_default_index(src_vocab['<unk>'])

        trg_vocab = build_vocab_from_iterator([self.trg_langs])

        return src_vocab, trg_vocab

In [None]:
def collate_fn(batch, pad_value, device):
    """Collate dataset rows into one padded batch on ``device``.

    Args:
        batch: list of dicts with keys ``"src"`` (list of token ids) and
            ``"trg"`` (single-element list holding the label id).
        pad_value: index used to pad shorter source sequences.
        device: device the resulting tensors are moved to.

    Returns:
        dict with ``"src"`` of shape [max seq len, batch size] and ``"trg"``
        of shape [1, batch size], both of dtype ``torch.long``.
    """
    srcs = [torch.tensor(row["src"], dtype=torch.long) for row in batch]
    trgs = [torch.tensor(row["trg"], dtype=torch.long) for row in batch]

    # Assemble the batch on the CPU and transfer it once, rather than moving
    # every sample separately and rebuilding the targets element by element.
    padded_srcs = pad_sequence(srcs, padding_value=pad_value).to(device)
    # Each target has shape [1]; stacking along dim 1 gives [1, batch size].
    stacked_trgs = torch.stack(trgs, dim=1).to(device)
    return {"src": padded_srcs, "trg": stacked_trgs}

# The validation dataset reuses the training vocabularies so that token and
# label ids mean the same thing in both splits.
train_langds = LangDataset(df_train2, trg_langs)
test_langds = LangDataset(df_val2, trg_langs, (train_langds.src_vocab, train_langds.trg_vocab))

SRC_PAD_IDX = train_langds.src_vocab["<pad>"]


def collate_batch(batch):
    """Bind the padding index and target device for use as a DataLoader collate_fn."""
    return collate_fn(batch, SRC_PAD_IDX, device)


train_dt = DataLoader(train_langds, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch)

# Evaluation data is not shuffled: the order is irrelevant for the metrics and a
# fixed batching keeps the reported validation loss comparable between epochs.
test_dt = DataLoader(test_langds, batch_size=batch_size, shuffle=False,
                     collate_fn=collate_batch)


# All model/training hyperparameters in one place; the two vocabulary sizes are
# derived from the training dataset and the label list.
hyp_params = {
    "batch_size": batch_size,
    "lr": 1e-5,
    "num_epochs": EPOCHS,
    "d_model": 512,  # input embedding dimension
    "n_head": 8,  # number of attention heads (parallel self-attention layers)
    "n_layers": 3,
    "feedforward_dim": 128,
    "dropout": 0.1,
    "src_vocab_size": len(train_langds.src_vocab),
    "trg_vocab_size": len(trg_langs),
}

In [None]:
next(iter(train_dt))

{'src': tensor([[ 163,  103,   81,    2,    2,    2,   53,    2,   83,   23,    2,    2,
             2,    7,   12,    2],
         [   2,   26,   98,   10,   46,    7,  112,   10,    2,    3,   10,    7,
             7,   15, 1315,    7],
         [ 392,   45,    9,    3,    3,   15,   12,    3,  188,  252,    3,    3,
             3,   21,    9,    3],
         [   4,  109,   80,  396,  222,  105,    3,    6,  155, 1022,  171,  163,
            29,   78,  970,   29],
         [ 253,   87,   62,    2,  126,  326,   39,  165,   13,   76,    8,   12,
           211,    4,   44,    2],
         [ 637,   63,   75, 2052,   12,  357,    2,  758,  659,    2,   56,    4,
            10, 2081,  545, 1662],
         [  84, 1146,    3,  123, 1326,   83,  228,  879,    8,  972,   31,  920,
          1391,   11,  937,  178],
         [ 787,   94,    6,   25,    5,  113,  235,   10,  311, 1184,  317,   99,
             4,    8,    2,  367],
         [  24,   11,   16,    4,  126,  193,   53,   14,

# Transformers

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    For position ``pos`` and pair index ``i``::

        PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: embedding dimension.
        dropout: dropout probability applied after adding the encoding.
        maxlen: number of positions to precompute; inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, dropout, maxlen = 5000):
        super(PositionalEncoding, self).__init__()

        # All possible positions 0, 1, ..., maxlen - 1.
        # Shape (pos) --> [max len, 1]
        pos = torch.arange(0, maxlen).unsqueeze(1)
        pos_encoding = torch.zeros((maxlen, d_model))

        # One wavelength per (sin, cos) pair. Both members of a pair must share
        # the same denominator; giving the cosine its own exponent
        # ((2i + 1) / d_model) departs from the sinusoidal formula above.
        # Shape (den) --> [ceil(d_model / 2)]
        den = 10000 ** (torch.arange(0, d_model, 2) / d_model)

        pos_encoding[:, 0::2] = torch.sin(pos / den)
        # For odd d_model there is one cosine column less than sine columns.
        pos_encoding[:, 1::2] = torch.cos(pos / den[: d_model // 2])

        # Shape (pos_encoding) --> [max len, d_model]
        # Adding one more dimension in-between so it broadcasts over the batch
        pos_encoding = pos_encoding.unsqueeze(-2)
        # Shape (pos_encoding) --> [max len, 1, d_model]

        self.dropout = nn.Dropout(dropout)

        # We want pos_encoding be saved and restored in the `state_dict`, but not trained by the optimizer
        # hence registering it!
        # Source & credits: https://discuss.pytorch.org/t/what-is-the-difference-between-register-buffer-and-register-parameter-of-nn-module/32723/2
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, token_embedding):
        """Add the positional encoding to the embeddings and apply dropout.

        Args:
            token_embedding: tensor of shape [sentence len, batch size, d_model].

        Returns:
            Tensor of the same shape as ``token_embedding``.
        """
        # The encoding is precomputed for `maxlen` positions; only the first
        # `sentence len` rows are needed for this batch. The Transformer paper
        # uses this FIXED encoding, learned encodings are an alternative.
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])


class InputEmbedding(nn.Module):
    """Token embedding lookup scaled by sqrt(d_model)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, tokens):
        """Embed ``tokens`` ([sentence len, batch size]) into
        [sentence len, batch size, d_model]."""
        # The Transformer paper multiplies the embeddings by sqrt(d_model).
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(tokens.long())
        return embedded * scale


class TransformerClassifier(nn.Module):
    """Sequence classifier: embedding + positional encoding + Transformer
    encoder, mean-pooled over the sequence and projected to class logits.

    Args:
        src_vocab_size: size of the source (token) vocabulary.
        trg_vocab_size: number of target classes.
        d_model: embedding / model dimension.
        dropout: dropout probability.
        n_head: number of attention heads.
        dim_feedforward: hidden size of the encoder feed-forward sub-layer.
        n_layers: number of encoder layers.
        pad_idx: optional index of the padding token. When given, padded
            positions are hidden from self-attention and excluded from the
            mean pooling. With the default ``None`` every position is treated
            as a real token (the original behaviour).
            NOTE: a sequence consisting only of padding cannot be attended
            over and will presumably yield NaN logits when ``pad_idx`` is set.
    """

    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 d_model,
                 dropout,
                 n_head,
                 dim_feedforward,
                 n_layers,
                 pad_idx=None,
                 ):
        super().__init__()

        self.src_inp_emb = InputEmbedding(src_vocab_size, d_model)
        # Not used in forward(); kept so that the module's parameters and
        # state_dict keys stay the same as before.
        self.trg_inp_emb = InputEmbedding(trg_vocab_size, d_model)

        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        # Only using Encoder of Transformer model
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_head, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)

        self.d_model = d_model
        self.pad_idx = pad_idx
        self.decoder = nn.Linear(d_model, trg_vocab_size)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: token ids of shape [sequence length, batch size].

        Returns:
            Logits of shape [batch size, trg_vocab_size], suitable for
            ``CrossEntropyLoss``.
        """
        x_emb = self.positional_encoding(self.src_inp_emb(x))

        if self.pad_idx is None:
            # Shape (output) -> [sequence length, batch size, d_model]
            output = self.transformer_encoder(x_emb)
            # Average over the sequence dimension -> [batch size, d_model]
            return self.decoder(output.mean(0))

        # True where the token is padding. Shape -> [sequence length, batch size]
        pad_mask = x == self.pad_idx
        # The encoder expects the key padding mask as [batch size, sequence length].
        output = self.transformer_encoder(
            x_emb, src_key_padding_mask=pad_mask.transpose(0, 1)
        )
        # Mean over real tokens only, so that the amount of padding in a batch
        # does not change a sentence's representation.
        keep = (~pad_mask).unsqueeze(-1).to(output.dtype)
        pooled = (output * keep).sum(0) / keep.sum(0).clamp(min=1.0)
        return self.decoder(pooled)

In [None]:
def train_model(model, train_dataloader, criterion, optimizer , n_examples ):
    """Run one training epoch.

    Each batch is a dict with "src" ([seq len, batch size]) and "trg"
    ([1, batch size]). Returns a tuple of (number of correct predictions
    divided by ``n_examples``, mean of the per-batch losses).
    """
    model.train()
    running_loss = 0
    n_correct = 0
    for batch in tqdm(train_dataloader):
        # Gradients accumulate by default, so reset them for every batch
        optimizer.zero_grad()

        tokens, labels = batch["src"], batch["trg"]

        # shape (logits) --> [batch size, trg size]
        logits = model(tokens)
        predicted = torch.max(logits, dim=1).indices
        n_correct += (predicted == labels).sum()

        # The loss expects targets of shape [batch size]
        batch_loss = criterion(logits, labels.squeeze(0))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.detach().cpu()

    accuracy = n_correct.double() / n_examples
    mean_loss = running_loss / len(train_dataloader)
    return accuracy, mean_loss


def evaluate_model(model, valid_dataloader, criterion , n_examples):
    """Evaluate the model on a dataloader without computing gradients.

    Each batch is a dict with "src" ([seq len, batch size]) and "trg"
    ([1, batch size]). Returns a tuple of (number of correct predictions
    divided by ``n_examples``, mean of the per-batch losses).
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            tokens, labels = batch["src"], batch["trg"]

            # shape (logits) --> [batch size, trg size]
            logits = model(tokens)
            predicted = torch.max(logits, dim=1).indices
            n_correct += (predicted == labels).sum()

            # The loss expects targets of shape [batch size]
            batch_loss = criterion(logits, labels.squeeze(0))
            running_loss += batch_loss.detach().cpu()

    accuracy = n_correct.double() / n_examples
    mean_loss = running_loss / len(valid_dataloader)
    return accuracy, mean_loss

In [None]:
# Build the classifier from hyp_params; keyword arguments make the mapping
# between config keys and constructor parameters explicit.
model2 = TransformerClassifier(
    src_vocab_size=hyp_params["src_vocab_size"],
    trg_vocab_size=hyp_params["trg_vocab_size"],
    d_model=hyp_params["d_model"],
    dropout=hyp_params["dropout"],
    n_head=hyp_params["n_head"],
    dim_feedforward=hyp_params["feedforward_dim"],
    n_layers=hyp_params["n_layers"],
).to(device)

criterion = nn.CrossEntropyLoss().to(device)

# The optimizer is bound to model2's parameters.
optimizer = optim.Adam(model2.parameters(), lr=hyp_params["lr"])

print(model2)

TransformerClassifier(
  (src_inp_emb): InputEmbedding(
    (embedding): Embedding(2602, 512)
  )
  (trg_inp_emb): InputEmbedding(
    (embedding): Embedding(2, 512)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Linear

In [None]:
hyp_params

{'batch_size': 16,
 'lr': 1e-05,
 'num_epochs': 20,
 'd_model': 512,
 'n_head': 8,
 'n_layers': 3,
 'feedforward_dim': 128,
 'dropout': 0.1,
 'src_vocab_size': 2602,
 'trg_vocab_size': 2}

In [None]:
def train(model,EPOCHS = EPOCHS):
    """Train ``model`` for ``EPOCHS`` epochs and print the metrics of each epoch.

    Uses the module-level ``train_dt``, ``test_dt``, ``criterion`` and
    ``optimizer`` objects, and the lengths of ``df_train2`` / ``df_val2`` as
    the accuracy denominators. Note that ``optimizer`` is the module-level
    object, whatever ``model`` is passed in.
    """
    n_train, n_val = len(df_train2), len(df_val2)

    for epoch in range(EPOCHS):
        # Release unreferenced objects and cached GPU memory before each epoch
        gc.collect()
        torch.cuda.empty_cache()

        train_acc, train_loss = train_model(model, train_dt, criterion, optimizer, n_train)
        val_acc, val_loss = evaluate_model(model, test_dt, criterion, n_val)

        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        print(f' train_loss {train_loss:.5f}  train_acc {train_acc:.5f} <-> Val_loss {val_loss:.5f} val_accuracy {val_acc:.5f}')

In [None]:
# Take the epoch count from the shared configuration instead of repeating the
# literal, so the run always matches what hyp_params reports.
train(model2, hyp_params["num_epochs"])


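# Alternative configuration, kept for reference only. It differs from the
# `hyp_params` defined above (lr, num_epochs, d_model, n_head, n_layers):
# {'batch_size': 16,
#  'lr': 0.0005,
#  'num_epochs': 40,
#  'd_model': 768,
#  'n_head': 12,
#  'n_layers': 12,
#  'feedforward_dim': 128,
#  'dropout': 0.1,
#  'src_vocab_size': 2602,
#  'trg_vocab_size': 2}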

100%|██████████| 49/49 [00:00<00:00, 61.29it/s]


Epoch 1/20
----------
 train_loss 0.69208  train_acc 0.54710 <-> Val_loss 0.68288 val_accuracy 0.52577


100%|██████████| 49/49 [00:00<00:00, 66.18it/s]


Epoch 2/20
----------
 train_loss 0.65471  train_acc 0.62710 <-> Val_loss 0.63298 val_accuracy 0.61856


100%|██████████| 49/49 [00:00<00:00, 70.29it/s]


Epoch 3/20
----------
 train_loss 0.62922  train_acc 0.65677 <-> Val_loss 0.65093 val_accuracy 0.63402


100%|██████████| 49/49 [00:00<00:00, 69.92it/s]


Epoch 4/20
----------
 train_loss 0.59066  train_acc 0.71871 <-> Val_loss 0.56916 val_accuracy 0.72680


100%|██████████| 49/49 [00:00<00:00, 69.52it/s]


Epoch 5/20
----------
 train_loss 0.54566  train_acc 0.72645 <-> Val_loss 0.56578 val_accuracy 0.69588


100%|██████████| 49/49 [00:00<00:00, 69.76it/s]


Epoch 6/20
----------
 train_loss 0.52113  train_acc 0.75484 <-> Val_loss 0.57739 val_accuracy 0.68041


100%|██████████| 49/49 [00:00<00:00, 62.70it/s]


Epoch 7/20
----------
 train_loss 0.49639  train_acc 0.76903 <-> Val_loss 0.53431 val_accuracy 0.75258


100%|██████████| 49/49 [00:00<00:00, 53.54it/s]


Epoch 8/20
----------
 train_loss 0.46193  train_acc 0.80645 <-> Val_loss 0.51528 val_accuracy 0.76804


100%|██████████| 49/49 [00:00<00:00, 53.59it/s]


Epoch 9/20
----------
 train_loss 0.42218  train_acc 0.83613 <-> Val_loss 0.48075 val_accuracy 0.76804


100%|██████████| 49/49 [00:00<00:00, 67.65it/s]


Epoch 10/20
----------
 train_loss 0.39280  train_acc 0.83613 <-> Val_loss 0.51761 val_accuracy 0.79381


100%|██████████| 49/49 [00:00<00:00, 70.13it/s]


Epoch 11/20
----------
 train_loss 0.36355  train_acc 0.85419 <-> Val_loss 0.47922 val_accuracy 0.78351


100%|██████████| 49/49 [00:00<00:00, 70.47it/s]


Epoch 12/20
----------
 train_loss 0.32534  train_acc 0.88129 <-> Val_loss 0.42090 val_accuracy 0.81443


100%|██████████| 49/49 [00:00<00:00, 69.04it/s]


Epoch 13/20
----------
 train_loss 0.29080  train_acc 0.88645 <-> Val_loss 0.52546 val_accuracy 0.77320


100%|██████████| 49/49 [00:00<00:00, 68.49it/s]


Epoch 14/20
----------
 train_loss 0.27092  train_acc 0.89935 <-> Val_loss 0.49068 val_accuracy 0.77835


100%|██████████| 49/49 [00:00<00:00, 69.18it/s]


Epoch 15/20
----------
 train_loss 0.24530  train_acc 0.91097 <-> Val_loss 0.50253 val_accuracy 0.80412


100%|██████████| 49/49 [00:00<00:00, 70.59it/s]


Epoch 16/20
----------
 train_loss 0.21724  train_acc 0.91097 <-> Val_loss 0.56126 val_accuracy 0.79381


100%|██████████| 49/49 [00:00<00:00, 69.17it/s]


Epoch 17/20
----------
 train_loss 0.20759  train_acc 0.92258 <-> Val_loss 0.44534 val_accuracy 0.81959


100%|██████████| 49/49 [00:00<00:00, 70.39it/s]


Epoch 18/20
----------
 train_loss 0.18337  train_acc 0.93677 <-> Val_loss 0.47698 val_accuracy 0.82990


100%|██████████| 49/49 [00:00<00:00, 69.86it/s]


Epoch 19/20
----------
 train_loss 0.18520  train_acc 0.92903 <-> Val_loss 0.44381 val_accuracy 0.83505


100%|██████████| 49/49 [00:00<00:00, 58.60it/s]


Epoch 20/20
----------
 train_loss 0.15384  train_acc 0.95097 <-> Val_loss 0.54295 val_accuracy 0.83505


# Unlabeled Data

In [None]:
import zipfile
import os

# Path to the zip file
zip_file_path = "/content/drive/MyDrive/NLP4RE-data/pure-requirements-xml.zip"

# Directory to extract the files
extract_dir = "/content/drive/MyDrive/NLP4RE-data"

extracted_path = os.path.join(extract_dir, "pure-requirements-xml")

# Extract the archive only when the target folder does not exist yet, so the
# cell works on a fresh setup and is a no-op afterwards.
# NOTE(review): assumes the archive unpacks into a top-level
# "pure-requirements-xml" folder — confirm against the zip's layout.
if not os.path.isdir(extracted_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Show the first file as a sample. os.listdir() returns entries in arbitrary
# order, so sort to make the choice deterministic.
for f in sorted(os.listdir(extracted_path)):
    extracted_file_path = os.path.join(extracted_path, f)
    # The XML prolog in the recorded output declares UTF-8; read it as such
    # rather than with the platform default encoding.
    with open(extracted_file_path, 'r', encoding='utf-8') as f1:
        contents = f1.read()
    print(extracted_file_path)
    print(contents)
    break

/content/drive/MyDrive/NLP4RE-data/pure-requirements-xml/2007-ertms.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- New document created with EditiX at Tue Jun 07 14:21:44 CEST 2011 -->
<req_document xsi:schemaLocation="req_document.xsd req_document.xsd" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xmlns="req_document.xsd">

<title>ERTMS/ETCS Functional Requirements Specification FRS</title>
<version>5.00</version>
<issue_date>2007-06-21</issue_date>
<file_number>ERA/ERTMS/003204</file_number>
<change_log>
	<change_log_item>
	<version></version>
	<change></change>
	<change_date>2007-06-21</change_date>
	</change_log_item>
</change_log>
<p id="1">
<title>Introduction</title>
<p id="1.1">
<text_body>
This document defines the functional requirements for ERTMS/ETCS (EUROPEAN RAIL TRAFFIC MANAGEMENT SYSTEM / EUROPEAN TRAIN CONTROL SYSTEM). The document primarily defines the operational requirements and therefore contains only a few technical terms. For consistency reasons, all f