In [1]:
!pip install datasets



In [2]:
!pip install transformers



In [34]:
import pandas as pd

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, BertModel
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.cluster import contingency_matrix

import shutil
import torch
import json
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
np.random.seed(42)


In [35]:
def pre_process_data(df, test_proportion, train_size):

    size_df = df.shape[0]
    df = shuffle(df, random_state = 42)
    X = df[["FQText"]]
    y = df.drop(["FQText"], axis=1).astype(np.float32)
    print(y.info())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_proportion, shuffle=True, random_state=42)

    df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    df_test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    df_labels = list(y.columns)

    print(df_labels)

    train_df = df_train.sample(frac=train_size, random_state=42).reset_index(drop=True)
    val_df = df_train.drop(train_df.index).reset_index(drop=True)

    print("Total amount of data: {}".format(size_df))
    print("Number of rows used to TRAIN: {}".format(train_df.shape[0]))
    print("Number of rows used to VALIDATE: {}".format(val_df.shape[0]))
    print("Number of rows used to TEST: {}".format(df_test.shape[0]))

    return train_df, val_df, df_test, df_labels

In [36]:
def set_hyperparams(hp_dictionary):

    MAX_LEN = hp_dictionary["MAX_LEN"]
    TRAIN_BATCH_SIZE = hp_dictionary["TRAIN_BATCH_SIZE"]
    VALID_BATCH_SIZE = hp_dictionary["VALID_BATCH_SIZE"]
    EPOCHS = hp_dictionary["EPOCHS"]
    LEARNING_RATE = hp_dictionary["LEARNING_RATE"]

    return MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE

In [37]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [38]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['FQText']
        # self.labels = list(df.columns)[2:] # list of the target values
        self.targets = self.df[df_labels].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True, # do we actually need special tokens ??
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # pytorch tensors
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

CAMBIAR EL PROBLEM TYPE AQUÍ

In [39]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True, problem_type="multi_label_classification")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, len(df_labels)) # have to changet he n of possible labels here

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )

        # the issue is that bert gets size 6 here?
        output_dropout = self.dropout(output.pooler_output)
        # print(output_dropout)
        output = self.linear(output_dropout)
        return output

In [40]:
def loss_fn(outputs, targets):
    # print(outputs, targets)
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [81]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, df_labels):
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf


  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        # print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        """targets are the y array of the original data.
        """

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)



    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    val_targets = []
    val_outputs = []
    softm = torch.nn.Softmax(dim=1) # needed to get the actual predictions

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
        val_targets.extend(targets.cpu().detach().numpy().tolist())
        val_outputs.extend(outputs.cpu().detach().numpy().tolist())


        # outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())



      #       metrics_targets.extend(torch.argmax(softm(targets), dim=1).cpu().detach().numpy().tolist())
      #       metrics_outputs.extend(torch.argmax(softm(outputs), dim=1).cpu().detach().numpy().tolist())

      # metrics_outputs = np.array(metrics_outputs, dtype=int)
      # val_f1 = f1_score(metrics_outputs, metrics_targets, average="weighted")
      # val_acc = accuracy_score(metrics_outputs, metrics_targets)
      # class_report = classification_report(metrics_outputs, metrics_targets, target_names = df_labels)

      # print((f"Accuracy: {val_f1}"))
      # print((f"F1 Score (Weighted): {val_f1}"))
      # print((f"Classification report: \n{class_report}"))



      print('############# Epoch {}: Validation End     #############'.format(epoch))

      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }

        # save checkpoint
      save_ckp(checkpoint, True, checkpoint_path, best_model_path)

      ## TODO: save the model if validation loss has decreased
      # if valid_loss <= valid_loss_min:
      #   print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
      #   # save checkpoint as best model
      #   save_ckp(checkpoint, True, checkpoint_path, best_model_path)
      #   valid_loss_min = valid_loss



    print('############# Epoch {}  Metrics   #############\n\n'.format(epoch))
    metrics = multi_labels_metrics(val_outputs, val_targets)
    # print((f"EVAL METRICS: {metrics}\n"))

    print('############# Epoch {}  Done   #############\n'.format(epoch))
  return model

In [42]:
# not used at the moment

# def compute_metrics(epoch, validation_loader):
#   model.eval()
#   metrics_targets = []
#   metrics_outputs = []
#   softm = torch.nn.Softmax(dim=1)
#   with torch.no_grad():
#         for _, data in enumerate(validation_loader, 0):
#               ids = data['input_ids'].to(device, dtype = torch.long)
#               mask = data['attention_mask'].to(device, dtype = torch.long)
#               token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#               targets = data['targets'].to(device, dtype = torch.float)

#               outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

#               metrics_targets.extend(torch.argmax(softm(targets), dim=1).cpu().detach().numpy().tolist())
#               metrics_outputs.extend(torch.argmax(softm(outputs), dim=1).cpu().detach().numpy().tolist())

#   return metrics_targets, metrics_outputs

TRAIN AND VALIDATION LOOP

# Determinants

In [86]:
df = pd.read_csv("nancy_determinants_grouped.csv")


GROUPED

In [87]:
df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

df.drop(['Déterminant', 'C', 'C\'', 'C\'F',
       'CF', 'CF\'', 'CLOB', 'CLOBF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FCLOB',
       'FE', 'K', 'KAN', 'KOB', 'KP', 'Réponse (French)'], axis=1, inplace=True)

In [88]:
# df.columns

INDIVIDUAL

In [89]:
# df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

# df.drop(['Déterminant', 'color_sum', 'threat_sum', 'fading_sum',
#        'form_sum', 'kinesthetics_sum', 'Réponse (French)'], axis=1, inplace=True)

In [90]:
# df.columns

# Contents

In [91]:
# df = pd.read_csv("nancy_contents_grouped.csv")


GROUPED

In [92]:
# df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

# df.drop(['Contenu', '(A)', '(AD)', '(H)', '(HD)',
#        'A', 'ABS', 'AD', 'ALIM', 'ANAT', 'ARCH', 'ART', 'BOT', 'ELEM', 'FRAG',
#        'GÉO', 'H', 'HD', 'MQ', 'NAT', 'OBJ', 'PAYS', 'RADIO', 'SC', 'SCÈNE',
#        'SEX', 'SG', 'VÊT', 'Réponse (French)'], axis=1, inplace=True)

In [93]:
# df.columns

INDIVIDUAL

In [94]:
# df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

# df.drop(['Contenu', 'animal_sum', 'human_sum', 'abs_sum',
#        'food_sum', 'art_arch_sum', 'nature_sum', 'fragment_sum', 'geo_sum',
#        'object_sum', 'science_sum', 'graphic_sum', 'Réponse (French)'], axis=1, inplace=True)

In [95]:
# df.columns

# MODEL

In [96]:
hp_dictionary = {"MAX_LEN":256 ,
    "TRAIN_BATCH_SIZE": 16,
    "VALID_BATCH_SIZE": 16,
    "EPOCHS": 4,
    "LEARNING_RATE": 1e-05
}

CHANGE PROBLEM TYPE AQUI

In [97]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', problem_type="multi_label_classification")



CHANGE PROBLEM TYPE AQUI?

In [98]:
MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE = set_hyperparams(hp_dictionary)

test_proportion = 0.1
test_size = 0.85

train_df, val_df, df_test, df_labels  = pre_process_data(df, test_proportion, test_size)


train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BERTClass()
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

proba_threshold = 0.5


ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"

<class 'pandas.core.frame.DataFrame'>
Index: 380 entries, 266 to 102
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   color_sum         380 non-null    float32
 1   threat_sum        380 non-null    float32
 2   fading_sum        380 non-null    float32
 3   form_sum          380 non-null    float32
 4   kinesthetics_sum  380 non-null    float32
dtypes: float32(5)
memory usage: 10.4 KB
None
['color_sum', 'threat_sum', 'fading_sum', 'form_sum', 'kinesthetics_sum']
Total amount of data: 380
Number of rows used to TRAIN: 291
Number of rows used to VALIDATE: 51
Number of rows used to TEST: 38




Counting the labels present in each train/test/val batch


In [99]:
# def count_instances(y):
#   total_labels = 0
#   for col in y:
#       if col == "FQText":
#           continue
#       else:
#           sum_examples = y[col].sum()
#           print(col, sum_examples)

In [100]:
# y_val = val_df.drop("FQText", axis=1)
# y_train = train_df.drop("FQText", axis=1)
# y_test = df_test.drop("FQText", axis=1)


In [101]:
# print("train class distribution".upper())
# print("")
# print(count_instances(y_train))
# print("total size of the test data", len(y_train))
# print("\nvalidation class distribution".upper())
# print("")
# print(count_instances(y_val))
# print("total size of the test data", len(y_val))
# print("\ntest class distribution".upper())
# print("")
# print(count_instances(y_test))
# print("total size of the test data", len(y_test))

https://discuss.huggingface.co/t/dataset-label-format-for-multi-label-text-classification/14998


https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=6d869uZsT9MH

https://discuss.huggingface.co/t/multi-class-using-dataset/8970/3

https://www.youtube.com/watch?v=ZYc9za75Chk

In [102]:
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, targets, threshold=0.355, df_labels=df_labels): # mess with threshold
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions)) #mapply activation function in the raw values

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = targets

  f1 = f1_score(y_true, y_pred, average = 'micro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
  hamming = hamming_loss(y_true, y_pred)
  class_report = classification_report(y_true, y_pred, target_names=df_labels)

  metrics = {
      "roc_auc": roc_auc,       # special multilabel metrics
      "hamming_loss": hamming,  # special multilabel metrics
      "f1": f1
  }
  print(metrics)
  print(class_report)
  return metrics

def compute_test_metrics(test_loader):

  model.eval()
  metrics_targets = []
  metrics_outputs = []
  # softm = torch.nn.Softmax(dim=1)
  with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
              ids = data['input_ids'].to(device, dtype = torch.long)
              mask = data['attention_mask'].to(device, dtype = torch.long)
              token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
              targets = data['targets'].to(device, dtype = torch.float)

              outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

              metrics_targets.extend(targets.cpu().detach().numpy().tolist())
              metrics_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

  metrics = multi_labels_metrics(metrics_outputs, metrics_targets)
  print((f"EVAL METRICS: {metrics}"))

  return metrics


In [103]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, df_labels)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.032807 	Average Validation Loss: 0.142531
############# Epoch 1  Metrics   #############


{'roc_auc': 0.6397058823529411, 'hamming_loss': 0.49411764705882355, 'f1': 0.5}
                  precision    recall  f1-score   support

       color_sum       0.43      1.00      0.60        22
      threat_sum       0.00      0.00      0.00         3
      fading_sum       0.14      1.00      0.24         6
        form_sum       0.43      0.89      0.58        18
kinesthetics_sum       0.37      1.00      0.54        19

       micro avg       0.34      0.93      0.50        68
       macro avg       0.27      0.78      0.39        68
    weighted avg       0.37      0.93      0.52        68
     samples avg       0.36      0.94      0.49  

  _warn_prf(average, modifier, msg_start, len(result))


############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epoch 3: Validation End     #############
Epoch: 3 	Avgerage Training Loss: 0.026712 	Average Validation Loss: 0.117788
############# Epoch 3  Metrics   #############


{'roc_auc': 0.7172459893048128, 'hamming_loss': 0.21568627450980393, 'f1': 0.5864661654135338}
                  precision    recall  f1-score   support

       color_sum       0.73      0.73      0.73        22
      threat_sum       0.00      0.00      0.00         3
      fading_sum       0.00      0.00      0.00         6
        form_sum       0.49      1.00      0.65        18
kinesthetics_sum       0.83      0.26      0.40        19

       micro avg       0.60      0.57      0.59        68
       macro avg       0.41      0.40      0.36        68
    weighted avg       0.60      0.57      0.52        68
     samples avg       0.62      0.62      0.60        68

############# Epoch 3  Done  

  _warn_prf(average, modifier, msg_start, len(result))


############# Epoch 4: Training End     #############
############# Epoch 4: Validation Start   #############
############# Epoch 4: Validation End     #############
Epoch: 4 	Avgerage Training Loss: 0.025446 	Average Validation Loss: 0.109557
############# Epoch 4  Metrics   #############


{'roc_auc': 0.7713903743315508, 'hamming_loss': 0.1843137254901961, 'f1': 0.6618705035971224}
                  precision    recall  f1-score   support

       color_sum       0.77      0.77      0.77        22
      threat_sum       0.00      0.00      0.00         3
      fading_sum       0.00      0.00      0.00         6
        form_sum       0.53      1.00      0.69        18
kinesthetics_sum       0.73      0.58      0.65        19

       micro avg       0.65      0.68      0.66        68
       macro avg       0.41      0.47      0.42        68
    weighted avg       0.60      0.68      0.61        68
     samples avg       0.64      0.70      0.65        68

############# Epoch 4  Done   

  _warn_prf(average, modifier, msg_start, len(result))


TEST

In [104]:
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

test_data_loader = torch.utils.data.DataLoader(test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [105]:
metrics =  compute_test_metrics(test_data_loader)

{'roc_auc': 0.5, 'hamming_loss': 0.7368421052631579, 'f1': 0.4166666666666667}
                  precision    recall  f1-score   support

       color_sum       0.39      1.00      0.57        15
      threat_sum       0.16      1.00      0.27         6
      fading_sum       0.08      1.00      0.15         3
        form_sum       0.34      1.00      0.51        13
kinesthetics_sum       0.34      1.00      0.51        13

       micro avg       0.26      1.00      0.42        50
       macro avg       0.26      1.00      0.40        50
    weighted avg       0.32      1.00      0.48        50
     samples avg       0.26      1.00      0.41        50

EVAL METRICS: {'roc_auc': 0.5, 'hamming_loss': 0.7368421052631579, 'f1': 0.4166666666666667}
