In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

# Do not truncate long cell contents (the nucleus/satellite text) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

Get and Preprocess Dataset


In [2]:
# Load the relation table from CSV; the columns used further down are
# 'doc_name', 'nucleus', 'satellite' and 'converted_relation'.
df = pd.read_csv('Datasets/GUM/rst/total_relations_with_doc_name.csv')
print(df.shape)
# Peek at a single row to check the column layout.
df.head(1)

(29339, 6)


Unnamed: 0,doc_name,nucleus,satellite,relation,word_count,converted_relation
0,GUM_academic_art,Aesthetic Appreciation and Spanish Art :,Insights from Eye - Tracking,elaboration-additional,13,Elaboration


In [3]:
# Shuffle all rows once. The fixed seed makes the resulting row order (and
# therefore the order of examples handed to the trainer) reproducible after a
# kernel restart; without it every run produced a different permutation.
# NOTE: this cell rebinds `df`, so re-running it without re-loading the CSV
# shuffles the already-shuffled frame again.
df = df.sample(frac=1, random_state=42, ignore_index=True)

In [4]:
# df_switch = df.copy() # switch positions of nucleus and satellite
# df_switch.rename(columns={'nucleus': 'satellite', 'satellite': 'nucleus'}, inplace=True)

In [5]:
# combined_df = pd.concat([df, df_switch], ignore_index=True)
# The concatenation with the swapped frame is disabled above, so `combined_df`
# is simply another name for `df` (same object, not a copy).
combined_df = df

In [6]:
def split_dataset_by_doc(dataset, test_range, doc_name_label='doc_name'):
    """Split a dataset into train and test sets without splitting any document.

    Rows are grouped by document name and the groups are taken in the order
    ``groupby`` yields them (sorted by key by default). The slice of that
    document list selected by ``test_range`` becomes the test set; all other
    documents form the train set. Which document lands in which set therefore
    does not depend on the row order of ``dataset``.

    Args:
        dataset (DataFrame): rows to split; must contain ``doc_name_label``.
        test_range (tuple): (start, end) fractions of the document list,
            e.g. (0, 0.2) means the first 20% of documents are the test set.
            An ``end`` beyond the last document is clipped by slicing.
        doc_name_label (str): name of the column holding each row's document name.

    Returns:
        tuple: ``(train_df, test_df)``, each with a fresh 0..n-1 index. A side
        that receives no documents is an empty DataFrame with the same columns
        as ``dataset``.

    Raises:
        ValueError: if ``test_range`` does not satisfy ``0 <= start <= end``.
    """
    start_frac, end_frac = test_range[0], test_range[1]
    # A negative or inverted range would make the train slices below overlap
    # (duplicated documents), so reject it up front with a clear message.
    if not 0 <= start_frac <= end_frac:
        raise ValueError(
            f"test_range must satisfy 0 <= start <= end, got {test_range!r}"
        )

    documents = [group for _, group in dataset.groupby(doc_name_label)]

    test_start_idx = int(len(documents) * start_frac)
    test_end_idx = int(len(documents) * end_frac)

    test_docs = documents[test_start_idx:test_end_idx]
    train_docs = documents[:test_start_idx] + documents[test_end_idx:]

    def _concat_docs(docs):
        # pd.concat raises on an empty list; return an empty frame that keeps
        # the original columns instead.
        if not docs:
            return dataset.iloc[0:0].reset_index(drop=True)
        return pd.concat(docs).reset_index(drop=True)

    train_df = _concat_docs(train_docs)
    test_df = _concat_docs(test_docs)

    # Sanity check: every document ended up in exactly one of the two sets.
    n_train_docs = len(set(train_df[doc_name_label]))
    n_test_docs = len(set(test_df[doc_name_label]))
    assert n_train_docs + n_test_docs == len(set(dataset[doc_name_label])), \
        "documents were lost or duplicated while splitting"

    return train_df, test_df

In [7]:
# Relation class names used as classification targets, listed alphabetically.
label_text = [
    'Attribution',
    'Background',
    'Cause',
    'Condition',
    'Contrast',
    'Elaboration',
    'Enablement',
    'Evaluation',
    'Explanation',
    'Joint',
    'Manner-Means',
    'Same-Unit',
    'Summary',
    'Temporal',
    'Textual-Organization',
    'Topic-Change',
    'Topic-Comment',
]

In [8]:
from sklearn.preprocessing import LabelEncoder

def preprocess_data(dataset, tokenizer, label_col='converted_relation', max_length=None):
    """Tokenize nucleus/satellite pairs and encode their relation labels.

    Each row's 'nucleus' and 'satellite' texts are encoded together as one
    sentence pair, padded to a fixed length and truncated if too long.

    Args:
        dataset (DataFrame): must contain 'nucleus', 'satellite' and
            ``label_col`` columns.
        tokenizer: tokenizer callable that accepts two parallel lists of texts
            plus ``padding`` / ``truncation`` / ``max_length`` /
            ``return_tensors`` keywords and returns a mapping with
            'input_ids', 'token_type_ids' and 'attention_mask' tensors.
        label_col (str): column holding the relation name of each row; every
            value must be one of the names in ``label_text``.
        max_length (int, optional): length to pad/truncate to. ``None`` (the
            default) leaves the choice to the tokenizer, as before.

    Returns:
        dict: 'input_ids', 'token_type_ids' and 'attention_mask' map to lists
        holding one 1-D tensor per row; 'labels' maps to the integer-encoded
        labels produced by a ``LabelEncoder`` fitted on ``label_text``.
    """
    if len(dataset) > 0:
        # One batched tokenizer call instead of one call per row: same
        # per-row result, without the Python-level loop over iterrows().
        tokens = tokenizer(
            dataset['nucleus'].tolist(),
            dataset['satellite'].tolist(),
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # Iterating a 2-D tensor yields its rows, so each list holds one
        # 1-D tensor per example, matching the previous return format.
        input_ids = list(tokens['input_ids'])
        token_type_ids = list(tokens['token_type_ids'])
        attention_mask = list(tokens['attention_mask'])
    else:
        # Nothing to tokenize; avoid calling the tokenizer on empty lists.
        input_ids, token_type_ids, attention_mask = [], [], []

    # Fit on the fixed label list (not on the data) so the label -> id mapping
    # is identical for every split, even if a split lacks some class.
    le = LabelEncoder()
    le.fit(label_text)
    labels = le.transform(dataset[label_col])

    return {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Tokenizer for the same "microsoft/deberta-base" checkpoint that the model is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# from sklearn.preprocessing import LabelEncoder

# def get_dataset(dataset, tokenizer, label_col='converted_relation'):
#     """Turn dataframe into list(dict), dict with keys "text" and "label" """

#     # get text    
#     separation_token = tokenizer.sep_token
#     input_sentences = dataset.apply(lambda x: ''.join([x['nucleus'], separation_token, x['satellite']]), axis=1)
#     np.array(input_sentences)

#     # get labels
#     le = LabelEncoder()
#     le.fit(label_text)
#     labels = le.transform(dataset[label_col])

#     data = []   
#     for text, label in zip(input_sentences, labels):
#         datapoint = {'text': text, 'label': label}
#         data.append(datapoint)
#     data = np.array(data)
    
#     return data

In [11]:
# split train/test set while preserving class distribution
# access split data like this: for i, (train_index, test_index) in enumerate(sss.split(X, y)):

# WORKING: may try incorporating this with doc split 

# from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
# stk = StratifiedKFold(n_splits=5)
# data_split = stk.split(input_sentences, labels)

Prepare for Training


In [12]:
# def freeze_encoder_layers(model, num_frozen_layers):
#   """Freezes the first `num_frozen_layers` of DeBERTa model.

#   Args:
#       model: The DeBERTa model to be fine-tuned.
#       num_frozen_layers: The number of layers to freeze.
#   """
#   for name, param in model.named_parameters():
#     if name.startswith("deberta.encoder.layer.") and int(name.split(".")[3]) < num_frozen_layers:
#       param.requires_grad = False

In [13]:
# def freeze_embeddings_layers(model):
#   """Freezes all embeddings-related layers of DeBERTa model. (no option for number of layers 'cause there's only one)

#   Args:
#       model: The DeBERTa model to be fine-tuned.
#   """
#   for name, param in model.named_parameters():
#     if name.startswith("deberta.embeddings."):
#       param.requires_grad = False

In [14]:
# freeze_encoder_layers(model, 12)
# freeze_embeddings_layers(model)

In [15]:
# for name, param in model.named_parameters():
#     print(param.requires_grad, '-', name)         

Trainer


In [16]:
# metric
import evaluate

# F1 scorer shared by every evaluation call.
metric = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class of each
            example is the arg-max over the last axis of ``logits``.

    Returns:
        The result of ``metric.compute`` with ``average='macro'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = metric.compute(
        predictions=predicted_classes,
        references=labels,
        average='macro',
    )
    return scores

In [17]:
# # tokenizer
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# def tokenize_function(dataset):
    # return tokenizer(dataset["text"], padding='max_length', truncation=True, return_tensors='pt')

In [18]:
import datasets
from transformers import Trainer
from transformers import TrainingArguments

# Classification head size follows the label list instead of a hard-coded 17,
# so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=len(label_text),
)
# Use the GPU when one is available; fall back to CPU instead of crashing.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# test_ranges = [(0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1)]

# Hold out the documents in the 50%-70% slice of the document list.
train_df, test_df = split_dataset_by_doc(combined_df, (0.5, 0.7))

tokenized_trainset = preprocess_data(train_df, tokenizer)
tokenized_testset = preprocess_data(test_df, tokenizer)

trainset = datasets.Dataset.from_dict(tokenized_trainset)
valset = datasets.Dataset.from_dict(tokenized_testset)

# training args
training_args = TrainingArguments(
    output_dir="./output_dataset_correct_order_correct_format",
    learning_rate=4e-6,
    num_train_epochs=5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    warmup_steps=500,
    weight_decay=0.01,
    # Keep the checkpoint with the lowest validation loss; evaluation and
    # checkpointing run on the same 500-step schedule.
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
    save_steps=500,
    eval_steps=500,
)

# trainer object to perform training-related tasks
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,F1
500,2.6935,2.499886,0.054031
1000,2.4289,2.266045,0.145723
1500,2.2168,2.071317,0.201409
2000,2.056,1.849533,0.273534
2500,1.831,1.653843,0.388036
3000,1.6827,1.513986,0.438738
3500,1.578,1.413495,0.476492
4000,1.4427,1.334139,0.507569
4500,1.3822,1.264921,0.526616
5000,1.2189,1.187122,0.57551


KeyboardInterrupt: 

In [None]:
# Evaluate the trainer's current model on the held-out split and print what it returns.
eval_results = trainer.evaluate(eval_dataset=valset)
print("Score: ", eval_results, '\n')

Inference


In [None]:
# Run on the model held by the trainer, on whatever device it already lives on.
# (`cur_model`, `separation_token` and `le` were previously undefined here, so
# this cell failed on a fresh kernel.)
cur_model = trainer.model
cur_model.eval()
device = cur_model.device

sample_n = """To work out the evolutionary history, development and relationships among groups of organisms,"""
sample_s = """biologists compare the characteristics of living species in a process called phylogenetic analysis"""
# Joined form is for display only; the model input is built as a sentence pair below.
sample = sample_n + tokenizer.sep_token + sample_s

with torch.no_grad():
    # Encode nucleus and satellite as a pair, the same way preprocess_data
    # does for training, rather than as one pre-joined string.
    tokens = tokenizer(sample_n, sample_s, truncation=True, return_tensors='pt').to(device)
    output = cur_model(**tokens)

logits = output.logits.cpu()
prediction = int(logits.argmax(dim=-1).item())
# preprocess_data encodes labels with a LabelEncoder fitted on label_text,
# which numbers classes in sorted order; sorted(label_text) reproduces it.
label = sorted(label_text)[prediction]
print("Sentence:", sample)
print("Label:", label)

Sentence: To work out the evolutionary history, development and relationships among groups of organisms,[SEP]biologists compare the characteristics of living species in a process called phylogenetic analysis
Label: Enablement


Helper Blocks


End Helper Blocks
