### Antoine EDY
# Natural Language Processing (COMM061) - Coursework

In [1]:
import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch

In [2]:
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [3]:
TEXT2ID = {
    "B-O": 0,
    "B-AC": 1,
    "PAD": 2,
    "B-LF": 3,
    "I-LF": 4,
}
ID2TEXT = {v: k for k, v in TEXT2ID.items()}

print(f"TEXT2ID: {TEXT2ID}\nID2TEXT: {ID2TEXT}\n")

def preprocess(df):
    df = df.drop(columns=['pos_tags'])
    df = df.rename(columns={"ner_tags": "labels"})
    df["ids"] = df["labels"].apply(lambda x: [TEXT2ID[i] for i in x])
    df["sentences"] = df["tokens"].apply(lambda x: " ".join(x))

    return df


train_dataset = preprocess(pd.DataFrame(dataset['train']))
test_dataset = preprocess(pd.DataFrame(dataset['test']))
val_dataset = preprocess(pd.DataFrame(dataset['validation']))

train_dataset.info()


# Here the exploration to add at the end of the work.

TEXT2ID: {'B-O': 0, 'B-AC': 1, 'PAD': 2, 'B-LF': 3, 'I-LF': 4}
ID2TEXT: {0: 'B-O', 1: 'B-AC', 2: 'PAD', 3: 'B-LF', 4: 'I-LF'}

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072 entries, 0 to 1071
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tokens     1072 non-null   object
 1   labels     1072 non-null   object
 2   ids        1072 non-null   object
 3   sentences  1072 non-null   object
dtypes: object(4)
memory usage: 33.6+ KB


In [4]:
train_dataset.head()

Unnamed: 0,tokens,labels,ids,sentences
0,"[For, this, purpose, the, Gothenburg, Young, P...","[B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, I-LF, I...","[0, 0, 0, 0, 3, 4, 4, 4, 4, 0, 1, 0, 0, 0, 0]",For this purpose the Gothenburg Young Persons ...
1,"[The, following, physiological, traits, were, ...","[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF...","[0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 1, 0, 0, 0, 0, ...",The following physiological traits were measur...
2,"[Minor, H, antigen, alloimmune, responses, rea...","[B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, ...",Minor H antigen alloimmune responses readily o...
3,"[EPI, =, Echo, planar, imaging, .]","[B-AC, B-O, B-LF, I-LF, I-LF, B-O]","[1, 0, 3, 4, 4, 0]",EPI = Echo planar imaging .
4,"[Furthermore, ,, eNOS, -, derived, NO, S, -, n...","[B-O, B-O, B-AC, B-O, B-O, B-AC, B-O, B-O, B-O...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Furthermore , eNOS - derived NO S - nitrosylat..."


In [5]:
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

1072
126
153


In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("surrey-nlp/roberta-base-finetuned-abbr")
model = AutoModelForTokenClassification.from_pretrained("surrey-nlp/roberta-base-finetuned-abbr")

In [7]:
#tokenize

X_train = tokenizer(list(train_dataset['sentences']), return_tensors="pt", padding=True, truncation=True, max_length=128)
X_val = tokenizer(list(val_dataset['sentences']), return_tensors="pt", padding=True, truncation=True, max_length=128)
X_test = tokenizer(list(test_dataset['sentences']), return_tensors="pt", padding=True, truncation=True, max_length=128)

outputs = model(**X_test)

In [10]:
from vizualization import vizu
words =[tokenizer.decode(token) for token in X_test['input_ids'][0]]
#words = [word for word in words if word != '<pad>']
preds = outputs['logits'][0].max(1).indices
print(preds)

vizu(words, preds)

tensor([0, 0, 0, 0, 0, 0, 1, 1, 0, 3, 4, 4, 4, 4, 4, 4, 0, 1, 0, 3, 3, 4, 4, 0,
        0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3])
[40m Ab [40mbre [40mvi [40mations [40m : [41m G [41mEMS [40m, [44m Global [45m Enter [45mic [45m Mult [45mic [45menter [45m Study [40m ; [41m VIP [40m, [44m vent [44milated [45m improved [45m pit [40m. [40m</s> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pad> [44m<pa

In [40]:
preds = outputs['logits'].max(2).indices
print(preds, preds.shape)

sent = " ".join(test_dataset['tokens'][0])
print(sent)
tokenizer.tokenize(sent)

tensor([[0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 4, 4, 4],
        ...,
        [0, 0, 3,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 1, 1, 1]]) torch.Size([153, 126])
Abbreviations : GEMS , Global Enteric Multicenter Study ; VIP , ventilated improved pit .


['ĠAb',
 'bre',
 'vi',
 'ations',
 'Ġ:',
 'ĠG',
 'EMS',
 'Ġ,',
 'ĠGlobal',
 'ĠEnter',
 'ic',
 'ĠMult',
 'ic',
 'enter',
 'ĠStudy',
 'Ġ;',
 'ĠVIP',
 'Ġ,',
 'Ġvent',
 'ilated',
 'Ġimproved',
 'Ġpit',
 'Ġ.']

### 3. NLP algorithms

Conditional Random Fields, RNNs, Transformers

In [9]:
from torch.utils.data import DataLoader, TensorDataset

train = TensorDataset(train_X, train_y)
val = TensorDataset(val_X, val_y)
test = TensorDataset(test_X, test_y)

train_loader = DataLoader(train, batch_size=32, shuffle=True)
val_loader = DataLoader(val, batch_size=32, shuffle=True)
test_loader = DataLoader(test, batch_size=32, shuffle=True)

NameError: name 'train_X' is not defined

In [None]:
pipeline["algorithm"] = "pretrained" # rnn, pretrained
print_pipeline()

batch_size = 100
n_iters = 10000
num_epochs = n_iters / (len(train) / batch_size)
num_epochs = int(num_epochs)
print(num_epochs)

from models import Models

model = Models(pipeline["algorithm"])

model.fit(train_loader, val_loader, num_epochs, input_dim=100)

model.



SyntaxError: invalid syntax (3891515695.py, line 16)

### 4. Loss functions and Optimisers

# 3. Testing

# 4. Best model

# 5. Final evaluation