In [8]:
# !pip install datasets

In [9]:
# !pip install transformers

In [10]:
# !pip install scikit-multilearn

In [11]:
# !nvidia-smi

In [12]:
import pandas as pd

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, BertModel
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig

from skmultilearn.model_selection import iterative_train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.cluster import contingency_matrix

import shutil
import os
import torch
import json
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
np.random.seed(42)


  from .autonotebook import tqdm as notebook_tqdm


# Model helper functions and classes

In [13]:
def pre_process_data(df, test_proportion, train_size):

    df.columns.values[0] = "FQText"
    size_df = df.shape[0]
    df = shuffle(df, random_state = 42)

    X = df[["FQText"]].to_numpy()

    y_df = df.drop(["FQText"], axis=1).astype(np.float32)
    cols = df.columns
    df_labels = list(y_df.columns)

    y = y_df.to_numpy()
    print(y_df.info())

    test_proportion = 0.05
    train_size = 0.9

    X_model_train, y_model_train, X_test, y_test = iterative_train_test_split(X, y, test_size=test_proportion)
    X_train, y_train, X_val, y_val = iterative_train_test_split(X_model_train, y_model_train, test_size= 1-train_size)

    df_train = pd.concat([pd.DataFrame(X_train, columns=["FQText"]), pd.DataFrame(y_train, columns=[df_labels], dtype=np.float32)], axis=1).reset_index(drop=True)
    df_val = pd.concat([pd.DataFrame(X_val, columns=["FQText"]), pd.DataFrame(y_val, columns=[df_labels], dtype=np.float32)], axis=1).reset_index(drop=True)
    df_test = pd.concat([pd.DataFrame(X_test, columns=["FQText"]), pd.DataFrame(y_test, columns=[df_labels], dtype=np.float32)], axis=1).reset_index(drop=True)

    df_train.columns = cols
    df_val.columns = cols
    df_test.columns = cols

    print(df_labels)

    # train_df = df_train.sample(frac=train_size, random_state=42).reset_index(drop=True)
    # val_df = df_train.drop(train_df.index).reset_index(drop=True)

    print("Total amount of data: {}".format(size_df))
    print("Number of rows used to TRAIN: {}".format(df_train.shape[0]))
    print("Number of rows used to VALIDATE: {}".format(df_val.shape[0]))
    print("Number of rows used to TEST: {}".format(df_test.shape[0]))

    return df_train, df_val, df_test, df_labels

In [14]:
def set_hyperparams(hp_dictionary):

    MAX_LEN = hp_dictionary["MAX_LEN"]
    TRAIN_BATCH_SIZE = hp_dictionary["TRAIN_BATCH_SIZE"]
    VALID_BATCH_SIZE = hp_dictionary["VALID_BATCH_SIZE"]
    EPOCHS = hp_dictionary["EPOCHS"]
    LEARNING_RATE = hp_dictionary["LEARNING_RATE"]

    return MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE

In [15]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [16]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['FQText']
        # self.labels = list(df.columns)[2:] # list of the target values
        self.targets = self.df[df_labels].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True, # do we actually need special tokens ??
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # pytorch tensors
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [17]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        # self.bert_model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True, problem_type="multi_label_classification")
        self.bert_model = BertModel.from_pretrained("bert-base-uncased", return_dict=True, problem_type="multi_label_classification")

        self.dropout = torch.nn.Dropout(0.3) # why 0.3?
        self.linear = torch.nn.Linear(768, len(df_labels)) # have to changet he n of possible labels here

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )

        # the issue is that bert gets size 6 here?
        output_dropout = self.dropout(output.pooler_output)
        # print(output_dropout)
        output = self.linear(output_dropout)
        return output

In [18]:
def loss_fn(outputs, targets):
    # print(outputs, targets)
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [19]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, df_labels):
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf
  best_metrics = {}
  best_metrics[f"n_epochs"] = n_epochs + 1


  # val_targets_study= []
  # val_outputs_study= [] # I am going to extend all the validation probabilities to determine what the prob threshold should be

  for epoch in range(1, n_epochs+1):

    epoch_n = "epoch_{}".format(epoch)
    best_metrics[epoch_n] = {}

    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        # print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        """targets are the y array of the original data.
        """

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)



    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    val_targets = []
    val_outputs = []

    proba_outputs = []


    sigmoid = torch.nn.Sigmoid()

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
        val_targets.extend(targets.cpu().detach().numpy().tolist())
        val_outputs.extend(outputs.cpu().detach().numpy().tolist())

        probs = sigmoid(torch.Tensor(outputs))
        proba_outputs.append(probs.cpu().detach().numpy().tolist())

        # # print(probs)

        # val_targets_study.append(targets.cpu().detach().numpy().tolist())
        # val_outputs_study.append(probs.cpu().detach().numpy().tolist())



      print('############# Epoch {}  Metrics   #############\n\n'.format(epoch))
      metrics, f1, roc_auc, hamming, accuracy = multi_labels_metrics(val_outputs, val_targets, df_labels, threshold=0.5)

      print('############# Epoch {}: Validation End     #############'.format(epoch))

      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)


      best_metrics[epoch_n][f"train_loss_epoch_{epoch}"] = train_loss
      best_metrics[epoch_n][f"valid_loss_epoch_{epoch}"] = valid_loss
      best_metrics[epoch_n][f"f1_epoch_{epoch}"] = f1
      best_metrics[epoch_n][f"accuracy_epoch_{epoch}"] = accuracy
      best_metrics[epoch_n][f"roc_auc_epoch_{epoch}"] = roc_auc
      best_metrics[epoch_n][f"hamming_epoch_{epoch}"] = hamming
      best_metrics[epoch_n][f"proba_outputs_epoch_{epoch}"] = proba_outputs
      best_metrics[epoch_n][f"actual_classes_epoch_{epoch}"] = val_targets

      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      
      # checkpoint = {
      #       'epoch': epoch + 1,
      #       'valid_loss_min': valid_loss,
      #       'state_dict': model.state_dict(),
      #       'optimizer': optimizer.state_dict()
      # }

        # save checkpoint
      # save_ckp(checkpoint, True, checkpoint_path, best_model_path)

      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        # save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss
        best_metrics["best_metrics"] = {
                'epoch': epoch,
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'val_acc': accuracy,
                'val_f1': f1,
                'val_roc_auc': roc_auc,
                'val_hamming': hamming,
                'proba_outputs': proba_outputs,
                'actual_classes': val_targets
            }

    # print((f"EVAL METRICS: {metrics}\n"))

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model, best_metrics

In [20]:
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss, accuracy_score

def multi_labels_metrics(predictions, targets, df_labels, threshold=0.36): # mess with threshold

  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions)) #mapply activation function in the raw values

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = targets

  accuracy = accuracy_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred, average = 'micro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
  hamming = hamming_loss(y_true, y_pred)
  class_report = classification_report(y_true, y_pred, target_names=df_labels)

  metrics = {
      "roc_auc": roc_auc,       # special multilabel metrics
      "hamming_loss": hamming,  # special multilabel metrics
      "f1": f1,
      "accuracy_score": accuracy
  }
  print(metrics)
  print(class_report)
  return metrics, f1, roc_auc, hamming, accuracy

def compute_test_metrics(test_loader):

  model.eval()

  test_loss = 0
  metrics_targets = []
  metrics_outputs = []
  metrics_proba = []

  sigmoid = torch.nn.Sigmoid()
  with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
              ids = data['input_ids'].to(device, dtype = torch.long)
              mask = data['attention_mask'].to(device, dtype = torch.long)
              token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
              targets = data['targets'].to(device, dtype = torch.float)

              outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

              loss = loss_fn(outputs, targets)
              test_loss = test_loss + ((1 / (_ + 1)) * (loss.item() - test_loss))

              metrics_targets.extend(targets.cpu().detach().numpy().tolist())
              metrics_outputs.extend(outputs.cpu().detach().numpy().tolist())


              probs = sigmoid(torch.Tensor(outputs))

              # val_targets_study.append(targets.cpu().detach().numpy().tolist())
              metrics_proba.append(probs.cpu().detach().numpy().tolist())

  test_loss = test_loss/len(test_loader)

  metrics, f1, roc_auc, hamming, accuracy = multi_labels_metrics(metrics_outputs, metrics_targets, df_labels)
  metrics["test_loss"] = test_loss


  return metrics, f1, roc_auc, hamming, accuracy, test_loss, metrics_proba, metrics_targets


# GRIDSEARCH

In [21]:
# original param_grid

'''hp_dictionary = {"MAX_LEN":256 ,
    "TRAIN_BATCH_SIZE": 64,
    "VALID_BATCH_SIZE": 64,
    "EPOCHS": 4,
    "LEARNING_RATE": 1e-05
}'''

'hp_dictionary = {"MAX_LEN":256 ,\n    "TRAIN_BATCH_SIZE": 64,\n    "VALID_BATCH_SIZE": 64,\n    "EPOCHS": 4,\n    "LEARNING_RATE": 1e-05\n}'

In [22]:
'''for batch in batch_size:
  for rate in learning_rate:
    for epoch in num_epochs:
      hp_dictionary = {"MAX_LEN":256 ,
        "TRAIN_BATCH_SIZE": batch,
        "VALID_BATCH_SIZE": batch,
        "EPOCHS": epoch,
        "LEARNING_RATE": rate}
      hp_dictionaries.append(hp_dictionary)'''

'for batch in batch_size:\n  for rate in learning_rate:\n    for epoch in num_epochs:\n      hp_dictionary = {"MAX_LEN":256 ,\n        "TRAIN_BATCH_SIZE": batch,\n        "VALID_BATCH_SIZE": batch,\n        "EPOCHS": epoch,\n        "LEARNING_RATE": rate}\n      hp_dictionaries.append(hp_dictionary)'

In [23]:
import itertools

In [24]:
#max-len : [128]
#batch_size = [8,12,16,32,64]
#learning_rate = [2e-5, 3e-5, 5e-5]
#num_epochs = [2, 3, 4]

In [25]:
#initialize empty list to hold all hyperparameter combinations
hp_dictionaries = []

#build the grid
#LOW GRID NUMBERS FOR TESTING
#USE SETS ABOVE

# param_grid = {
#     'max-len' : [128, 256, 512],
#     'learning_rate': [1e-5, 2e-5, 3e-5, 5e-5],
#     'batch_size': [8, 12, 16, 32, 64],
#     'num_train_epochs': [16]
# }

# UPDATED PARAM GRID AFTER FIRST SEARCH
param_grid = {
    'max-len' : [128],
    'learning_rate': [2e-5, 3e-5, 5e-5],
    'batch_size': [8, 12, 16],
    'num_train_epochs': [14]
}

#create cartesian product of the parameters
param_combos = list(itertools.product(*param_grid.values()))

#add the combinations into the list of dictionaries
for params in param_combos:
    hp_dictionaries.append(dict(zip(param_grid.keys(), params)))


# Define the function to set hyperparameters
def set_hyperparams(param_grid):
    MAX_LEN = param_grid['max-len']
    TRAIN_BATCH_SIZE = param_grid['batch_size']
    VALID_BATCH_SIZE = param_grid['batch_size']
    EPOCHS = param_grid['num_train_epochs']
    LEARNING_RATE = param_grid['learning_rate']
    return MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE

#print statement for testing to see the cartesian product functions
print(hp_dictionaries)
print(len(hp_dictionaries))

[{'max-len': 128, 'learning_rate': 2e-05, 'batch_size': 8, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 2e-05, 'batch_size': 12, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 2e-05, 'batch_size': 16, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 3e-05, 'batch_size': 8, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 3e-05, 'batch_size': 12, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 3e-05, 'batch_size': 16, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 5e-05, 'batch_size': 8, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 5e-05, 'batch_size': 12, 'num_train_epochs': 14}, {'max-len': 128, 'learning_rate': 5e-05, 'batch_size': 16, 'num_train_epochs': 14}]
9


In [26]:
grid_metric_results = []

In [27]:
failed_grids = []

TRAIN AND VALIDATION LOOP

In [28]:
# os.listdir()

In [29]:
dfs = ['nancy_determinants_individual_labels_eng.csv',
 'nancy_contents_individual_labels_eng.csv',
 'nancy_contents_macro_labels_english.csv',
 'nancy_determinants_macro_labels_english.csv']

In [30]:
# dfs = ['nancy_determinants_individual_labels_eng.csv']

In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [32]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [34]:
n_grids = len(hp_dictionaries)
save_points = np.arange(10, n_grids, 10)

for df_name in dfs:

  data_model = df_name.split("_")
  data_model = "_".join(data_model[1:3]) #getting the name to name the different dictionaries
  df = pd.read_csv(df_name)

  test_proportion = 0.05
  train_size = 0.9

  train_df, val_df, df_test, df_labels  = pre_process_data(df, test_proportion, train_size)

  i = 0


  for hp_dictionary in hp_dictionaries:
    print(f"PARAM {i}/{n_grids}")
    print(hp_dictionary)

    try:

      MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE = set_hyperparams(hp_dictionary)

      train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
      valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
      test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

      train_data_loader = torch.utils.data.DataLoader(train_dataset,
          batch_size=TRAIN_BATCH_SIZE,
          shuffle=True,
          num_workers=0
      )

      val_data_loader = torch.utils.data.DataLoader(valid_dataset,
          batch_size=VALID_BATCH_SIZE,
          shuffle=False,
          num_workers=0
      )

      test_data_loader = torch.utils.data.DataLoader(test_dataset,
          batch_size=VALID_BATCH_SIZE,
          shuffle=False,
          num_workers=0
      )

      model = BERTClass()
      model.to(device)

      optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

      proba_threshold = 0.5
      val_targets=[]
      val_outputs=[]

      ckpt_path = "curr_ckpt"
      best_model_path = "best_model.pt"

      trained_model, best_metrics = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, df_labels)

      metrics, f1, roc_auc, hamming, accuracy, test_loss, metrics_proba, metrics_targets = compute_test_metrics(test_data_loader)

      best_metrics["test"] = {}
      best_metrics["test"]["test_loss"] = test_loss
      best_metrics["test"]["test_f1"] = f1
      best_metrics["test"]["test_roc_auc"] = roc_auc
      best_metrics["test"]["test_hamming"] = hamming
      best_metrics["test"]["test_accuracy"] = accuracy
      best_metrics["test"]["test_proba"] = metrics_proba
      best_metrics["test"]["test_actual_classes"] = metrics_targets


      result = {
            'hyperparameters': hp_dictionary,
            'metrics': best_metrics
        }

      grid_metric_results.append(result)

    except: # avoid crashing when out of memory

      print("Failed Grid")

      failed_grids.append(hp_dictionary)

    # # saving the data periodically
    # if i in save_points:

    with open(f"grid_results_{data_model}.json", "w") as f:
      json.dump(grid_metric_results, f)

    with open(f"failed_grid_results_{data_model}.json", "w") as f:
      json.dump(failed_grids, f)

    i = i + 1

  # when the gridsearch for each data ends, it will save again just in case
  # with open(f"grid_results_{data_model}.json", "w") as f:
  #   json.dump(grid_metric_results, f)

  # with open(f"failed_grid_results_{data_model}.json", "w") as f:
  #   json.dump(failed_grids, f)


<class 'pandas.core.frame.DataFrame'>
Index: 380 entries, 266 to 102
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C       380 non-null    float32
 1   C'      380 non-null    float32
 2   C'F     380 non-null    float32
 3   CF      380 non-null    float32
 4   CF'     380 non-null    float32
 5   CLOB    380 non-null    float32
 6   CLOBF   380 non-null    float32
 7   E       380 non-null    float32
 8   EF      380 non-null    float32
 9   F       380 non-null    float32
 10  FC      380 non-null    float32
 11  FC'     380 non-null    float32
 12  FCLOB   380 non-null    float32
 13  FE      380 non-null    float32
 14  K       380 non-null    float32
 15  KAN     380 non-null    float32
 16  KOB     380 non-null    float32
 17  KP      380 non-null    float32
dtypes: float32(18)
memory usage: 29.7 KB
None
['C', "C'", "C'F", 'CF', "CF'", 'CLOB', 'CLOBF', 'E', 'EF', 'F', 'FC', "FC'", 'FCLOB', 'FE', 'K', 'KAN', 'KOB',



############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
Failed Grid
PARAM 1/9
{'max-len': 128, 'learning_rate': 2e-05, 'batch_size': 12, 'num_train_epochs': 14}




############# Epoch 1: Training Start   #############
