In [1]:
!pip install  datasets
!pip install transformers




In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn.utils import clip_grad_norm_
from datasets import load_dataset
#from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm


# compute metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
from datasets import load_dataset
data = load_dataset("surrey-nlp/PLOD-CW")
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 126
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 153
    })
})

In [4]:
# Get the dataset
train_dataset = data["train"]

# Extract the 'ner_tags' feature values
ner_tags_values = train_dataset["ner_tags"]

# Collect unique label names
label_names = list(set(label for tags in ner_tags_values for label in tags))

print(label_names)


['I-LF', 'B-O', 'B-AC', 'B-LF']


In [5]:
data.set_format(type="pandas")
train_df = data['train'][:]
valid_df = data['validation'][:]
test_df = data['test'][:]

In [6]:
# this is for training data
# Check if 'text' column already exists
if 'text' not in train_df.columns:
    # Merge 'tokens' and 'pos_tags' into a new column 'text'
    train_df['text'] = train_df.apply(lambda row: ", ".join([f"{token} {pos}" for token, pos in zip(row['tokens'], row['pos_tags'])]), axis=1)

    # Drop the original 'tokens' and 'pos_tags' columns if needed
    train_df = train_df.drop(columns=['tokens', 'pos_tags'])
    print("New 'text' column created.")
else:
    print("The 'text' column already exists in the DataFrame.")

# this is for validation data
if 'text' not in valid_df.columns:
    # Merge 'tokens' and 'pos_tags' into a new column 'text'
    valid_df['text'] = valid_df.apply(lambda row: ", ".join([f"{token} {pos}" for token, pos in zip(row['tokens'], row['pos_tags'])]), axis=1)

    # Drop the original 'tokens' and 'pos_tags' columns if needed
    valid_df = valid_df.drop(columns=['tokens', 'pos_tags'])
    print("New 'text' column created.")
else:
    print("The 'text' column already exists in the DataFrame.")

# this is for test data
if 'text' not in test_df.columns:
    # Merge 'tokens' and 'pos_tags' into a new column 'text'
    test_df['text'] = test_df.apply(lambda row: ", ".join([f"{token} {pos}" for token, pos in zip(row['tokens'], row['pos_tags'])]), axis=1)

    # Drop the original 'tokens' and 'pos_tags' columns if needed
    test_df = test_df.drop(columns=['tokens', 'pos_tags'])
    print("New 'text' column created.")
else:
    print("The 'text' column already exists in the DataFrame.")
train_df[:3]


New 'text' column created.
New 'text' column created.
New 'text' column created.


Unnamed: 0,ner_tags,text
0,"[B-O, B-O, B-O, B-O, B-LF, I-LF, I-LF, I-LF, I...","For ADP, this DET, purpose NOUN, the DET, Goth..."
1,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-LF, I-LF...","The DET, following ADJ, physiological ADJ, tra..."
2,"[B-O, B-AC, B-O, B-O, B-O, B-O, B-O, B-O, B-O,...","Minor ADJ, H PROPN, antigen NOUN, alloimmune A..."


In [7]:
# Check if the DataFrame has already been processed

# This is for training dataset:
if 'labels' not in train_df.columns or 'text' not in train_df.columns:
    # Split 'ner_tags' and 'text' into separate lists based on commas
    train_df['labels'] = train_df['ner_tags'].apply(lambda x: [tag.split(',')[0].strip() for tag in x])
    train_df['text'] = train_df['text'].apply(lambda x: [tag.strip() for tag in x.split(',')])

    # Explode the lists into separate rows
    train_df = train_df.explode('labels').explode('text')

# This is for Validation dataset:
if 'labels' not in valid_df.columns or 'text' not in valid_df.columns:
    # Split 'ner_tags' and 'text' into separate lists based on commas
    valid_df['labels'] = valid_df['ner_tags'].apply(lambda x: [tag.split(',')[0].strip() for tag in x])
    valid_df['text'] = valid_df['text'].apply(lambda x: [tag.strip() for tag in x.split(',')])

    # Explode the lists into separate rows
    valid_df = valid_df.explode('labels').explode('text')

# This is for Test dataset:
if 'labels' not in test_df.columns or 'text' not in test_df.columns:
    # Split 'ner_tags' and 'text' into separate lists based on commas
    test_df['labels'] = test_df['ner_tags'].apply(lambda x: [tag.split(',')[0].strip() for tag in x])
    test_df['text'] = test_df['text'].apply(lambda x: [tag.strip() for tag in x.split(',')])

    # Explode the lists into separate rows
    test_df = test_df.explode('labels').explode('text')

train_df.tail()

Unnamed: 0,ner_tags,text,labels
1071,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, ...",PCR NOUN,B-O
1071,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, ...",experiments NOUN,B-O
1071,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, ...",demonstrated VERB,B-O
1071,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, ...",that SCONJ,B-O
1071,"[B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, B-O, ...",β PUNCT,B-O


In [8]:
from sklearn.utils import shuffle

# Define label mapping
label_mapping = {'B-O': 0, 'I-LF': 1, 'B-LF': 2, 'B-AC': 3}

# For training dataset
# Shuffle the DataFrame
train_df = shuffle(train_df, random_state=42)
# Reset the index after shuffling
train_df.reset_index(drop=True, inplace=True)
# Remove the 'ner_tags' column if it exists
if 'ner_tags' in train_df.columns:
    train_df.drop(columns=['ner_tags'], inplace=True)
# Add the 'label' column based on label mapping
train_df['label'] = train_df['labels'].map(label_mapping)

# For Validation dataset
# Shuffle the DataFrame
valid_df = shuffle(valid_df, random_state=42)
# Reset the index after shuffling
valid_df.reset_index(drop=True, inplace=True)
# Remove the 'ner_tags' column if it exists
if 'ner_tags' in valid_df.columns:
    valid_df.drop(columns=['ner_tags'], inplace=True)
# Add the 'label' column based on label mapping
valid_df['label'] = valid_df['labels'].map(label_mapping)

# For testing dataset
# Shuffle the DataFrame
test_df = shuffle(test_df, random_state=42)
# Reset the index after shuffling
test_df.reset_index(drop=True, inplace=True)
# Remove the 'ner_tags' column if it exists
if 'ner_tags' in test_df.columns:
    test_df.drop(columns=['ner_tags'], inplace=True)
# Add the 'label' column based on label mapping
test_df['label'] = test_df['labels'].map(label_mapping)

train_df.head()

Unnamed: 0,text,labels,label
0,that PRON,B-O,0
1,PUNCT,B-O,0
2,CI NOUN,B-AC,3
3,reduction NOUN,B-O,0
4,mapping NOUN,B-O,0


In [9]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)


(2196470, 3)
(293025, 3)
(226745, 3)


In [10]:
print(train_df['labels'].unique())
print(train_df['labels'].value_counts())

['B-O' 'B-AC' 'I-LF' 'B-LF']
labels
B-O     1842151
I-LF     165281
B-AC     117001
B-LF      72037
Name: count, dtype: int64


In [32]:
# # Calculate the minimum sample size for each label group
# min_sample_size_train = min(len(group) for label, group in train_df.groupby('label'))
# min_sample_size_valid = min(len(group) for label, group in valid_df.groupby('label'))
# min_sample_size_test = min(len(group) for label, group in test_df.groupby('label'))

# # Sample each label group with the minimum sample size
# train_df = train_df.groupby('label').apply(lambda x: x.sample(20000)).reset_index(drop=True)
# valid_df = valid_df.groupby('label').apply(lambda x: x.sample(8000)).reset_index(drop=True)
# test_df = test_df.groupby('label').apply(lambda x: x.sample(3000)).reset_index(drop=True)


In [34]:
# Sample size for each group
sample_size_train = 20000
sample_size_valid = 8000
sample_size_test = 3000

# Group by 'label' and sample from each group
train_df = train_df.groupby('label').apply(lambda x: x.sample(min(sample_size_train, len(x)))).reset_index(drop=True)
valid_df = valid_df.groupby('label').apply(lambda x: x.sample(min(sample_size_valid, len(x)))).reset_index(drop=True)
test_df = test_df.groupby('label').apply(lambda x: x.sample(min(sample_size_test, len(x)))).reset_index(drop=True)


In [35]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

(80000, 3)
(30284, 3)
(12000, 3)


In [37]:
print(train_df['label'].value_counts())
print(valid_df['label'].value_counts())
print(test_df['label'].value_counts())


label
0    20000
1    20000
2    20000
3    20000
Name: count, dtype: int64
label
0    7571
1    7571
2    7571
3    7571
Name: count, dtype: int64
label
0    3000
1    3000
2    3000
3    3000
Name: count, dtype: int64


In [38]:
from transformers import BertTokenizer
PRETRAINED_LM = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_LM, do_lower_case=True)
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [39]:
def encode(docs):
    '''
    This function takes list of texts and returns input_ids and attention_mask of texts
    '''
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, max_length=128, padding='max_length',
                            return_attention_mask=True, truncation=True, return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return input_ids, attention_masks

In [40]:
train_input_ids, train_att_masks = encode(train_df['text'].values.tolist())
valid_input_ids, valid_att_masks = encode(valid_df['text'].values.tolist())
test_input_ids, test_att_masks = encode(test_df['text'].values.tolist())

In [50]:
import torch
train_y = torch.LongTensor(train_df['label'].values.tolist())
valid_y = torch.LongTensor(valid_df['label'].values.tolist())
test_y = torch.LongTensor(test_df['label'].values.tolist())
train_y.size(),valid_y.size(),test_y.size()

(torch.Size([80000]), torch.Size([30284]), torch.Size([12000]))

In [51]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 32
train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [52]:
N_labels = len(train_df.label.unique())
N_labels

4

In [53]:
from transformers import BertForSequenceClassification
N_labels = len(train_df.label.unique())
model = BertForSequenceClassification.from_pretrained(PRETRAINED_LM,
                                                      num_labels=N_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [55]:
model = model.cpu()

In [56]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3
LEARNING_RATE = 2e-6

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer,
             num_warmup_steps=0,
            num_training_steps=len(train_dataloader)*EPOCHS )

In [None]:
#collapse-output
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm
import numpy as np
import math

train_loss_per_epoch = []
val_loss_per_epoch = []


for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)
    '''
    Training
    '''
    model.train()
    train_loss = 0
    for step_num, batch_data in enumerate(tqdm(train_dataloader,desc='Training')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

        loss = output.loss
        train_loss += loss.item()

        model.zero_grad()
        loss.backward()
        del loss

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    train_loss_per_epoch.append(train_loss / (step_num + 1))


    '''
    Validation
    '''
    model.eval()
    valid_loss = 0
    valid_pred = []
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(tqdm(valid_dataloader,desc='Validation')):
            input_ids, att_mask, labels = [data.to(device) for data in batch_data]
            output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

            loss = output.loss
            valid_loss += loss.item()

            valid_pred.append(np.argmax(output.logits.cpu().detach().numpy(),axis=-1))

    val_loss_per_epoch.append(valid_loss / (step_num_e + 1))
    valid_pred = np.concatenate(valid_pred)

    '''
    Loss message
    '''
    print("{0}/{1} train loss: {2} ".format(step_num+1, math.ceil(len(train_df) / BATCH_SIZE), train_loss / (step_num + 1)))
    print("{0}/{1} val loss: {2} ".format(step_num_e+1, math.ceil(len(valid_df) / BATCH_SIZE), valid_loss / (step_num_e + 1)))

Epoch:  1


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/947 [00:00<?, ?it/s]

2500/2500 train loss: 1.388851763343811 
947/947 val loss: 1.383148849828694 
Epoch:  2


Training:   0%|          | 0/2500 [00:00<?, ?it/s]