### Antoine EDY
# Natural Language Processing (COMM061) - Coursework

Introduction:


Preparation of the notebook:

In [1]:
import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch

Definition of the pipeline:

In [2]:
pipeline = {
    "pre-processing": None,
    "text-encoding": None,
    "algorithm": None,
}

def print_pipeline():
    for key, value in pipeline.items():
        print(f"{key}: {value}")

print_pipeline()

def train(train_dataset, validation_dataset, pipeline):
    pass

def evaluate(test_dataset, pipeline):
    pass

pre-processing: None
text-encoding: None
algorithm: None


## 1. Data Visualization

In [3]:
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [4]:
TEXT2ID = {
    "<PAD>": 0,
    "B-O": 1,
    "B-AC": 2,
    "B-LF": 3,
    "I-LF": 4,
}
ID2TEXT = {v: k for k, v in TEXT2ID.items()}

print(f"TEXT2ID: {TEXT2ID}\nID2TEXT: {ID2TEXT}")

def preprocess(df):
    df = df.drop(columns=['pos_tags'])
    df = df.rename(columns={"ner_tags": "labels"})
    df["ids"] = df["labels"].apply(lambda x: [TEXT2ID[i] for i in x])

    return df


train_dataset = preprocess(pd.DataFrame(dataset['train']))
test_dataset = preprocess(pd.DataFrame(dataset['test']))
val_dataset = preprocess(pd.DataFrame(dataset['validation']))

train_dataset.info()


# Here the exploration to add at the end of the work.

TEXT2ID: {'<PAD>': 0, 'B-O': 1, 'B-AC': 2, 'B-LF': 3, 'I-LF': 4}
ID2TEXT: {0: '<PAD>', 1: 'B-O', 2: 'B-AC', 3: 'B-LF', 4: 'I-LF'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072 entries, 0 to 1071
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  1072 non-null   object
 1   labels  1072 non-null   object
 2   ids     1072 non-null   object
dtypes: object(3)
memory usage: 25.2+ KB


In [5]:
train_dataset.head()

Unnamed: 0,tokens,labels,ids
0,"[For, this, purpose, the, Gothenburg, Young, P...","[B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, I-LF, I...","[1, 1, 1, 1, 3, 4, 4, 4, 4, 1, 2, 1, 1, 1, 1]"
1,"[The, following, physiological, traits, were, ...","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF...","[1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 2, 1, 1, 1, 1, ..."
2,"[Minor, H, antigen, alloimmune, responses, rea...","[B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O,...","[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 4, 1, ..."
3,"[EPI, =, Echo, planar, imaging, .]","[B-AC, B-O, B-LF, I-LF, I-LF, B-O]","[2, 1, 3, 4, 4, 1]"
4,"[Furthermore, ,, eNOS, -, derived, NO, S, -, n...","[B-O, B-O, B-AC, B-O, B-O, B-AC, B-O, B-O, B-O...","[1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# 2. Experimentations

### 1. Pre-processing techniques
1-gram, n-gram

In [6]:
from preprocessing import Preprocessing

pipeline["pre-processing"] = "Nothing" # Nothing, No-stop-words
print_pipeline()

preprocessing = Preprocessing(pipeline["pre-processing"])
preprocessing.fit(train_dataset)
train_dataset = preprocessing.transform(train_dataset)
val_dataset = preprocessing.transform(val_dataset)
test_dataset = preprocessing.transform(test_dataset)

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

pre-processing: Nothing
text-encoding: None
algorithm: None
1072
126
153


### 2. Text encoding

In [7]:
from vectorization import Vectorization

pipeline["text-encoding"] = "Word2Vec" # Word2Vec
print_pipeline()

vectorization = Vectorization(pipeline["text-encoding"])
vectorization.fit(train_dataset['tokens'])

train_X, train_y = vectorization.transform(train_dataset['tokens'], train_dataset['ids'].values)
print(train_X.shape, train_y.shape)

val_X, val_y = vectorization.transform(val_dataset['tokens'], val_dataset['ids'].values)
print(val_X.shape, val_y.shape)

test_X, test_y = vectorization.transform(test_dataset['tokens'], test_dataset['ids'].values)
print(test_X.shape, test_y.shape)

pre-processing: Nothing
text-encoding: Word2Vec
algorithm: None
torch.Size([1072, 50, 100]) torch.Size([1072, 50, 5])
torch.Size([126, 50, 100]) torch.Size([126, 50, 5])
torch.Size([153, 50, 100]) torch.Size([153, 50, 5])


### 3. NLP algorithms

Conditional Random Fields, RNNs, Transformers

In [8]:
from torch.utils.data import DataLoader, TensorDataset

train = TensorDataset(train_X, train_y)
val = TensorDataset(val_X, val_y)
test = TensorDataset(test_X, test_y)

train_loader = DataLoader(train, batch_size=32, shuffle=True)
val_loader = DataLoader(val, batch_size=32, shuffle=True)
test_loader = DataLoader(test, batch_size=32, shuffle=True)

In [9]:
pipeline["algorithm"] = "pretrained" # rnn, pretrained
print_pipeline()

batch_size = 100
n_iters = 10000
num_epochs = n_iters / (len(train) / batch_size)
num_epochs = int(num_epochs)
print(num_epochs)

from models import Models

model = Models(pipeline["algorithm"])

model.fit(train_loader, val_loader, num_epochs, input_dim=100)

model.



pre-processing: Nothing
text-encoding: Word2Vec
algorithm: pretrained
932


### 4. Loss functions and Optimisers

# 3. Testing

# 4. Best model

# 5. Final evaluation