In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import yaml
import transformers
from transformers import AutoModel, BertTokenizerFast
from tqdm.notebook import tqdm
import torch.optim as optim
from ray import tune
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW

In [4]:
df = pd.read_json(r"../data/df_final_document.json")  # document

# Keep only submitted documents BEFORE encoding, so the encoded class ids are
# contiguous (0..k-1) for the rows that are actually used downstream.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a view of the unfiltered frame.
df = df[df['submit'] == 1].copy()

# One encoder per label column: refitting a single encoder would overwrite the
# classes learned for the first column and make its ids impossible to decode.
le_132 = LabelEncoder()
df['label.132'] = le_132.fit_transform(df['label_132'])

# `le` keeps its original name; as before, it ends up fitted on `label_134`.
le = LabelEncoder()
df['label.134'] = le.fit_transform(df['label_134'])

In [3]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Stratified split on `label.132`: 70 % of the rows go to training and the
# held-out 30 % is then halved into validation and test sets.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text_clean'],
    df['label.132'],
    test_size=0.3,
    stratify=df['label.132'],
    random_state=2018,
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

In [13]:
# Print the number of samples in the train, validation and test splits.
for split_texts in (train_text, val_text, test_text):
    print(len(split_texts))

51
11
11


In [6]:
# Load the pretrained BERT-base (uncased) encoder. With return_dict=False the
# forward pass returns a plain tuple, which BERT_Arch.forward unpacks.
bert = AutoModel.from_pretrained(
    'bert-base-uncased',
    return_dict=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Load the BERT tokenizer matching the 'bert-base-uncased' checkpoint.
# (`return_dict` is a model option and has no meaning for a tokenizer, so it is
# not passed here.)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [9]:
# Every sequence is padded / truncated to this many tokens.
MAX_SEQ_LEN = 25


def _encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a pandas Series of strings into fixed-length encodings.

    Uses ``padding='max_length'`` instead of the deprecated
    ``pad_to_max_length=True``; both pad every sequence to ``max_length``.

    Returns the tokenizer's batch encoding, from which the notebook reads
    ``'input_ids'`` and ``'attention_mask'``.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = _encode_texts(train_text)
tokens_val = _encode_texts(val_text)
tokens_test = _encode_texts(test_text)



In [10]:
## Convert the tokenizer output and the labels of each split to tensors.
## Each split yields a (token ids, attention mask, labels) triple.

train_seq, train_mask, train_y = (
    torch.tensor(tokens_train['input_ids']),
    torch.tensor(tokens_train['attention_mask']),
    torch.tensor(train_labels.tolist()),
)

val_seq, val_mask, val_y = (
    torch.tensor(tokens_val['input_ids']),
    torch.tensor(tokens_val['attention_mask']),
    torch.tensor(val_labels.tolist()),
)

test_seq, test_mask, test_y = (
    torch.tensor(tokens_test['input_ids']),
    torch.tensor(tokens_test['attention_mask']),
    torch.tensor(test_labels.tolist()),
)

In [11]:
# Define Model Architecture
class BERT_Arch(nn.Module):
    """Feed-forward classification head on top of a BERT encoder.

    The encoder's pooled output is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so the module returns
    log-probabilities (suitable for ``nn.NLLLoss``).

    Args:
        bert: Encoder module called as ``bert(sent_id, attention_mask=mask)``
            whose output has the pooled representation at index 1.
        num_classes: Number of output classes (default 2, as before).
        hidden_dim: Width of the intermediate layer (default 512, as before).
        dropout: Dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, num_classes=2, hidden_dim=512, dropout=0.1):
        super().__init__()

        self.bert = bert
        # Width of the pooled output: read from the encoder's config when it
        # has one, otherwise fall back to 768 (the previously hard-coded value).
        bert_dim = getattr(getattr(bert, "config", None), "hidden_size", 768)

        # Sub-modules keep their original names and order so that previously
        # saved state dicts still load.
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(bert_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token-id tensor passed to the encoder.
            mask: Attention-mask tensor passed as ``attention_mask``.
        """
        # Index the encoder output instead of tuple-unpacking it, so this works
        # whether the encoder returns a tuple or an indexable output object.
        encoder_out = self.bert(sent_id, attention_mask=mask)
        cls_hs = encoder_out[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)


In [12]:
# Helper that freezes a model's weights on request.
def set_parameter_requires_grad(model, freeze):
    """Disable gradients for every parameter of ``model`` when ``freeze`` is truthy.

    When ``freeze`` is falsy the model is left untouched (nothing is unfrozen).
    """
    if not freeze:
        return
    for weight in model.parameters():
        weight.requires_grad = False

In [14]:
# Freeze the pretrained encoder so that only the classification head trains.
set_parameter_requires_grad(model=bert, freeze=True)

# Choose the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Wrap the frozen BERT in the custom architecture and move it to the device.
model = BERT_Arch(bert).to(device)

In [110]:
def run_model(config, n_epochs):
    """Train for ``n_epochs`` epochs and checkpoint the best validation loss.

    Uses the notebook-level names ``model``, ``criterion``, ``train`` and
    ``evaluate``; ``train`` / ``evaluate`` are called as
    ``fn(model, criterion, config)`` and must return a pair whose first
    element is the epoch loss.

    Args:
        config: Passed through unchanged to ``train`` and ``evaluate``.
        n_epochs: Number of epochs to run.

    Returns:
        ``(train_losses, valid_losses)``: the per-epoch loss histories.
        (Previously these lists were built and then discarded.)
    """
    # Any finite validation loss beats this, so the first epoch always saves.
    best_valid_loss = float('inf')

    train_losses = []
    valid_losses = []

    for epoch in range(n_epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1, n_epochs))
        train_loss, _ = train(model, criterion, config)
        valid_loss, _ = evaluate(model, criterion, config)

        # Save the weights whenever the validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights_hp-tuned.pt')

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

    return train_losses, valid_losses

In [None]:
# Load the weights of the best model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
path = 'saved_weights_hp-tuned.pt'
model.load_state_dict(torch.load(path, map_location=device))


## Hyperparameter Tuning with `Ray Tune`

In [None]:
# Search space for the Ray Tune hyperparameter search: each trial samples one
# batch size and one learning rate from the listed choices.

config = dict(
    batch_size=tune.choice([16, 32]),
    lr=tune.choice([2e-5, 3e-5, 5e-5]),
)

## Approach 2

In [37]:
def train_bert(config, checkpoint_dir=None, data_dir=None):
    """Ray Tune trainable: train the notebook-level ``model`` and report losses.

    Uses the notebook-level names ``model``, ``train_data`` and ``val_data``.

    Args:
        config: Trial hyperparameters. Reads ``config["lr"]`` and
            ``config["batch_size"]``; an optional ``config["n_epochs"]``
            overrides the default of 10 epochs.
        checkpoint_dir: Unused; kept so the signature stays unchanged.
        data_dir: Unused; kept so the signature stays unchanged.

    After every epoch the mean validation loss is reported to Tune under both
    ``loss`` (the original key) and ``val_loss``, together with ``train_loss``.
    The weights with the lowest validation loss seen so far are written to
    ``saved_weights_hp-tuned.pt``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = int(config["batch_size"])
    n_epochs = int(config.get("n_epochs", 10))

    # Make sure the model and the batches live on the same device.
    model.to(device)

    criterion = nn.NLLLoss()
    optimizer = AdamW(model.parameters(), lr=config["lr"])

    train_dataloader = DataLoader(
        train_data,
        num_workers=2,
        shuffle=True,
        batch_size=batch_size,
    )

    # Validation order does not affect the mean loss, so no shuffling.
    val_dataloader = DataLoader(
        val_data,
        num_workers=2,
        shuffle=False,
        batch_size=batch_size,
    )

    best_val_loss = float("inf")

    for epoch in range(n_epochs):  # loop over the dataset multiple times
        # ---- training ----
        model.train()
        train_loss_sum = 0.0
        for batch in train_dataloader:
            # push the batch to the device
            sent_id, mask, labels = [t.to(device) for t in batch]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            preds = model(sent_id, mask)
            loss = criterion(preds, labels)
            train_loss_sum += loss.item()
            loss.backward()
            # prevent the exploding gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # max(..., 1) guards against an empty loader.
        train_loss = train_loss_sum / max(len(train_dataloader), 1)

        # ---- validation ----
        # The accumulator is separate from the training one, so the reported
        # validation loss is not contaminated by training batches.
        model.eval()
        val_loss_sum = 0.0
        with torch.no_grad():
            for batch in val_dataloader:
                sent_id, mask, labels = [t.to(device) for t in batch]
                preds = model(sent_id, mask)
                val_loss_sum += criterion(preds, labels).item()

        val_loss = val_loss_sum / max(len(val_dataloader), 1)

        # Keep the best weights seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'saved_weights_hp-tuned.pt')

        # Report under both keys so that either metric="loss" or
        # metric="val_loss" works in tune.run.
        tune.report(loss=val_loss, val_loss=val_loss, train_loss=train_loss)

    print("Finished Training")

In [35]:
# Bundle each split's token ids, attention masks and labels into one dataset.
train_data, val_data = (
    TensorDataset(seq, mask, y)
    for seq, mask, y in (
        (train_seq, train_mask, train_y),
        (val_seq, val_mask, val_y),
    )
)

In [38]:
# Search space: each trial samples one batch size and one learning rate.
config = {
    "batch_size": tune.choice([16, 32]),
    "lr": tune.choice([2e-5, 3e-5, 5e-5]),
}

# `metric` must name a key that the trainable actually reports: train_bert
# calls tune.report(loss=...), so the metric is "loss". `mode="min"` tells
# Tune that lower values are better.
analysis = tune.run(
    train_bert,
    config=config,
    metric="loss",
    mode="min",
    verbose=2,
)

# Get a dataframe for analyzing trial results.
df_analysis = analysis.dataframe()

0,1
Current time:,2022-12-12 18:21:46
Running for:,00:00:07.84
Memory:,11.6/16.0 GiB

Trial name,# failures,error file
train_bert_6f981_00000,1,"/Users/lukaswarode/ray_results/train_bert_2022-12-12_18-21-38/train_bert_6f981_00000_0_batch_size=32,lr=0.0000_2022-12-12_18-21-41/error.txt"

Trial name,status,loc,batch_size,lr
train_bert_6f981_00000,ERROR,,32,3e-05


2022-12-12 18:21:44,028	ERROR ray_trial_executor.py:580 -- Trial train_bert_6f981_00000: Unexpected error starting runner.
Traceback (most recent call last):
  File "/Users/lukaswarode/miniforge3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 573, in start_trial
    return self._start_trial(trial)
  File "/Users/lukaswarode/miniforge3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 473, in _start_trial
    runner = self._setup_remote_runner(trial)
  File "/Users/lukaswarode/miniforge3/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 414, in _setup_remote_runner
    return full_actor_class.remote(**kwargs)
  File "/Users/lukaswarode/miniforge3/lib/python3.9/site-packages/ray/actor.py", line 637, in remote
    return actor_cls._remote(args=args, kwargs=kwargs, **updated_options)
  File "/Users/lukaswarode/miniforge3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 387, in _invocation_

TuneError: ('Trials did not complete', [train_bert_6f981_00000])

In [118]:
def test_preds():
    """Predict class ids for the test split with the best saved weights.

    Uses the notebook-level names ``model``, ``device``, ``test_seq`` and
    ``test_mask``.

    Returns:
        A NumPy array with the index of the highest-scoring class per test
        sample (argmax over the model's output along axis 1).
    """
    # Load the weights of the best model; map_location lets a GPU-saved
    # checkpoint load on a CPU-only machine.
    path = 'saved_weights_hp-tuned.pt'
    model.load_state_dict(torch.load(path, map_location=device))

    # Disable dropout so predictions are deterministic.
    model.eval()

    # The model's forward takes (token ids, attention mask), not a dataset.
    with torch.no_grad():
        log_probs = model(test_seq.to(device), test_mask.to(device))

    return np.argmax(log_probs.detach().cpu().numpy(), axis=1)

In [None]:
# Compute the test-set predictions first: `preds` is only a local variable
# inside test_preds(), so it has to be bound here before it can be reported.
preds = test_preds()
print(classification_report(test_y, preds))