In [1]:
!pip install datasets



In [2]:
!pip install transformers



In [3]:
import shutil

In [4]:
import pandas as pd
import numpy as np

determinants = pd.read_csv("data_clean_determinants.csv")

In [5]:
from sklearn.utils import shuffle

determinants = shuffle(determinants)

In [6]:
determinants.columns

Index(['Unnamed: 0', 'FQText', 'C', 'C'', 'C'F', 'CF', 'E', 'EF', 'F', 'FC',
       'FC'', 'FE', 'K', 'kan'],
      dtype='object')

In [7]:
determinants_train = determinants.iloc[:-56].reset_index(drop=True)
determinants_test = determinants.iloc[-56:].reset_index(drop=True) #

In [8]:
print(len(determinants_train))

691


In [9]:
print(len(determinants_test))

56


In [10]:
# from datasets import Dataset

# ds_contents = Dataset.from_pandas(contents).train_test_split(0.15)
# ds_determinants = Dataset.from_pandas(determinants).train_test_split(0.15)

In [11]:
y_labels_determinants = list(determinants.columns)[2:]

In [12]:
print(len(y_labels_determinants))

12


In [13]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05

In [14]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
import torch

In [16]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['FQText']
        # self.labels = list(df.columns)[2:] # list of the target values
        self.targets = self.df[y_labels_determinants].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True, # do we actually need special tokens ??
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # pytorch tensors
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [17]:
train_size = 0.8

train_df = determinants_train.sample(frac=train_size, random_state=42).reset_index(drop=True).drop("Unnamed: 0", axis=1)
val_df = determinants_train.drop(train_df.index).reset_index(drop=True).drop("Unnamed: 0", axis=1)

In [18]:
train_df

Unnamed: 0,FQText,C,C',C'F,CF,E,EF,F,FC,FC',FE,K,kan
0,"Animal (In natural form, canine or feline, suc...",0,0,0,0,0,0,1,0,0,0,0,0
1,Pelvis,0,0,0,0,0,0,1,0,0,0,0,0
2,Frog (Dissected),0,0,0,0,0,0,0,0,0,0,0,1
3,Animals (Back to back),0,0,0,0,0,0,0,0,0,0,0,1
4,Helmet (Science fiction),0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,Crucifix (Abstract),0,0,0,0,1,0,0,0,0,0,1,0
549,Dogs,0,0,0,0,1,0,0,0,0,1,1,0
550,Dog,0,0,0,0,0,0,0,0,0,1,0,1
551,Head (Fish),0,0,0,0,0,0,0,0,0,1,1,0


In [19]:
val_df

Unnamed: 0,FQText,C,C',C'F,CF,E,EF,F,FC,FC',FE,K,kan
0,Dinosaur (Front view),0,0,0,0,0,0,0,0,1,0,0,0
1,Bone Structure,0,0,0,0,0,0,0,0,0,0,1,0
2,Groudhog,0,0,0,0,0,0,0,0,0,1,0,0
3,"Boat (With sails, front view)",0,0,0,0,0,0,0,0,0,1,0,0
4,"Head (Animal, long eared)",0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Lung(s),0,0,0,0,0,0,0,1,0,0,0,0
134,Vagina,0,0,0,0,0,0,0,0,0,0,1,0
135,Head (Monster),0,0,0,0,0,0,1,0,0,0,0,0
136,Cherub,0,0,0,0,0,0,0,0,0,0,1,0


In [20]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [21]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [23]:
print(device)

cuda


In [24]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [25]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, len(y_labels_determinants)) # have to changet he n of possible labels here

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )

        # the issue is that bert gets size 6 here?
        output_dropout = self.dropout(output.pooler_output)
        # print(output_dropout)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)



BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

https://discuss.huggingface.co/t/huggingface-transformers-bert-for-classification-dimensionality-of-output-with-classification-layer-is-expected-to-be-1-but-is-512-instead/21640

In [26]:
from sklearn.metrics import f1_score

In [27]:
# def f1_score_func(predictions, labels):
#     preds_flat = np.argmax(predictions, axis=1).flatten()
#     labels_flat = labels.flatten()
#     return f1_score(labels_flat, preds_flat, average="weighted")

In [28]:
# def accuracy_per_class(predictions, labels):
#     label_dict_inverse = {v: k for k, v in label_dict.items()}

#     predictions_flat = np.argmax(predictions, axis=1).flatten()
#     labels_flat = labels.flatten()

#     for label in np.unique(labels_flat):
#         y_predictions = predictions_flat[labels_flat==label]
#         y_true = labels_flat[labels_flat==label]
#         print(f"Class: {label_dict_inverse[label]}")
#         print(f"Accuracy: {len(y_predictions[y_predictions==label])}/{len(y_true)}\n")

LOSS FUNCTION MARK


In [29]:
from sklearn.metrics import accuracy_score

In [30]:
def loss_fn(outputs, targets):
    # print(outputs, targets)
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [31]:
val_targets=[]
val_outputs=[]

In [36]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):

  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf


  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        # print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        """targets are the y array of the original data. it contains the 25 labels.
        The issue is: the outputs predicted are of size 6 for some reason
        """

        outputs = model(ids, mask, token_type_ids)



        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #if batch_idx%5000==0:
         #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)

    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

            # print(val_targets, val_outputs)
            # logits = outputs[1]
            # logits = logits.detach().cpu().numpy()
            # label_ids = data['targets'].cpu().numpy()
            # predictions.append(logits)
            # true_vals.append(label_ids)

      predictions = []
      # print(val_outputs[0])
      for array_prob in val_outputs:
        pred = list(np.where(np.array(array_prob) > 0.5, 1.0, 0.0 ))
        if 1 not in pred:
          idx = np.argmax(array_prob)
          # print(idx)
          pred[idx] = 1.0

        predictions.append(pred)
      # print(predictions[0], len(predictions[0]))
      # print(val_targets[0], len(val_targets[0]))

      val_f1 = f1_score(predictions, val_targets, average="weighted")
      val_acc = accuracy_score(predictions, val_targets)
      print((f"Accuracy: {val_f1}"))
      print((f"F1 Score (Weighted): {val_f1}"))



      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }

        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)

      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [33]:
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"

In [37]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
Accuracy: 0.35345453586487957
F1 Score (Weighted): 0.35345453586487957
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.012019 	Average Validation Loss: 0.040351
Validation loss decreased (inf --> 0.040351).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
Accuracy: 0.3753594432942259
F1 Score (Weighted): 0.3753594432942259
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.010286 	Average Validation Loss: 0.035928
Validation loss decreased (0.040351 --> 0.035928).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #

In [None]:
# testing
# contents_test
encodings = tokenizer.encode_plus(
    contents_test,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])