In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [2]:
!pip install transformers



In [75]:
import pandas as pd

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, BertModel
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.cluster import contingency_matrix

import shutil
import torch
import json
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
np.random.seed(42)


In [35]:
def pre_process_data(df, test_proportion, train_size):

    size_df = df.shape[0]
    df = shuffle(df)
    X = df[["FQText"]]
    y = df.drop(["FQText"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_proportion, shuffle=True, random_state=42)

    df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    df_test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    df_labels = list(y.columns)

    print(df_labels)

    train_df = df_train.sample(frac=train_size, random_state=42).reset_index(drop=True)
    val_df = df_train.drop(train_df.index).reset_index(drop=True)

    print("Total amount of data: {}".format(size_df))
    print("Number of rows used to TRAIN: {}".format(train_df.shape[0]))
    print("Number of rows used to VALIDATE: {}".format(val_df.shape[0]))
    print("Number of rows used to TEST: {}".format(df_test.shape[0]))

    return train_df, val_df, df_test, df_labels

In [5]:
def set_hyperparams(hp_dictionary):

    MAX_LEN = hp_dictionary["MAX_LEN"]
    TRAIN_BATCH_SIZE = hp_dictionary["TRAIN_BATCH_SIZE"]
    VALID_BATCH_SIZE = hp_dictionary["VALID_BATCH_SIZE"]
    EPOCHS = hp_dictionary["EPOCHS"]
    LEARNING_RATE = hp_dictionary["LEARNING_RATE"]

    return MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE

In [6]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [10]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['FQText']
        # self.labels = list(df.columns)[2:] # list of the target values
        self.targets = self.df[df_labels].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True, # do we actually need special tokens ??
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # pytorch tensors
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [11]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, len(df_labels)) # have to changet he n of possible labels here

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )

        # the issue is that bert gets size 6 here?
        output_dropout = self.dropout(output.pooler_output)
        # print(output_dropout)
        output = self.linear(output_dropout)
        return output

In [12]:
def loss_fn(outputs, targets):
    # print(outputs, targets)
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [84]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf


  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        # print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        """targets are the y array of the original data.
        """

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)



    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    metrics_targets = []
    metrics_outputs = []
    softm = torch.nn.Softmax(dim=1) # needed to get the actual predictions

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


            outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

            metrics_targets.extend(torch.argmax(softm(targets), dim=1).cpu().detach().numpy().tolist())
            metrics_outputs.extend(torch.argmax(softm(outputs), dim=1).cpu().detach().numpy().tolist())

      metrics_outputs = np.array(metrics_outputs, dtype=int)
      val_f1 = f1_score(metrics_outputs, metrics_targets, average="weighted")
      val_acc = accuracy_score(metrics_outputs, metrics_targets)
      class_report = classification_report(metrics_outputs, metrics_targets)

      print((f"Accuracy: {val_f1}"))
      print((f"F1 Score (Weighted): {val_f1}"))
      print((f"Classification report: \n{class_report}"))



      print('############# Epoch {}: Validation End     #############'.format(epoch))

      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }

        # save checkpoint
      save_ckp(checkpoint, True, checkpoint_path, best_model_path)

      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [65]:
# not used at the moment

# def compute_metrics(epoch, validation_loader):
#   model.eval()
#   metrics_targets = []
#   metrics_outputs = []
#   softm = torch.nn.Softmax(dim=1)
#   with torch.no_grad():
#         for _, data in enumerate(validation_loader, 0):
#               ids = data['input_ids'].to(device, dtype = torch.long)
#               mask = data['attention_mask'].to(device, dtype = torch.long)
#               token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#               targets = data['targets'].to(device, dtype = torch.float)

#               outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

#               metrics_targets.extend(torch.argmax(softm(targets), dim=1).cpu().detach().numpy().tolist())
#               metrics_outputs.extend(torch.argmax(softm(outputs), dim=1).cpu().detach().numpy().tolist())

#   return metrics_targets, metrics_outputs

In [66]:
def compute_test_metrics(test_loader):
  model.eval()
  metrics_targets = []
  metrics_outputs = []
  softm = torch.nn.Softmax(dim=1)
  with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
              ids = data['input_ids'].to(device, dtype = torch.long)
              mask = data['attention_mask'].to(device, dtype = torch.long)
              token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
              targets = data['targets'].to(device, dtype = torch.float)

              outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())

              metrics_targets.extend(torch.argmax(softm(targets), dim=1).cpu().detach().numpy().tolist())
              metrics_outputs.extend(torch.argmax(softm(outputs), dim=1).cpu().detach().numpy().tolist())

  return metrics_targets, metrics_outputs

TRAIN AND VALIDATION LOOP

# Determinants

In [62]:
df = pd.read_csv("nancy_determinants_grouped.csv")


GROUPED

In [63]:
df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

df.drop(['Déterminant', 'C', 'C\'', 'C\'F',
       'CF', 'CF\'', 'CLOB', 'CLOBF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FCLOB',
       'FE', 'K', 'KAN', 'KOB', 'KP', 'Réponse (French)'], axis=1, inplace=True)

In [20]:
df.columns

Index(['FQText', 'color_sum', 'threat_sum', 'fading_sum', 'form_sum',
       'kinesthetics_sum'],
      dtype='object')

INDIVIDUAL

In [None]:
df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

df.drop(['Déterminant', 'color_sum', 'threat_sum', 'fading_sum',
       'form_sum', 'kinesthetics_sum', 'Réponse (French)'], axis=1, inplace=True)

In [None]:
df.columns

# Contents

In [54]:
df = pd.read_csv("nancy_contents_grouped.csv")


GROUPED

In [None]:
df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

df.drop(['Contenu', '(A)', '(AD)', '(H)', '(HD)',
       'A', 'ABS', 'AD', 'ALIM', 'ANAT', 'ARCH', 'ART', 'BOT', 'ELEM', 'FRAG',
       'GÉO', 'H', 'HD', 'MQ', 'NAT', 'OBJ', 'PAYS', 'RADIO', 'SC', 'SCÈNE',
       'SEX', 'SG', 'VÊT', 'Réponse (French)'], axis=1, inplace=True)

In [None]:
df.columns

Index(['FQText', 'animal_sum', 'human_sum', 'abs_sum', 'food_sum',
       'art_arch_sum', 'nature_sum', 'fragment_sum', 'geo_sum', 'object_sum',
       'science_sum', 'graphic_sum'],
      dtype='object')

INDIVIDUAL

In [55]:
df.rename(columns = {"Answer (English)": "FQText"}, inplace=True)

df.drop(['Contenu', 'animal_sum', 'human_sum', 'abs_sum',
       'food_sum', 'art_arch_sum', 'nature_sum', 'fragment_sum', 'geo_sum',
       'object_sum', 'science_sum', 'graphic_sum', 'Réponse (French)'], axis=1, inplace=True)

In [56]:
df.columns

Index(['FQText', '(A)', '(AD)', '(H)', '(HD)', 'A', 'ABS', 'AD', 'ALIM',
       'ANAT', 'ARCH', 'ART', 'BOT', 'ELEM', 'FRAG', 'GÉO', 'H', 'HD', 'MQ',
       'NAT', 'OBJ', 'PAYS', 'RADIO', 'SC', 'SCÈNE', 'SEX', 'SG', 'VÊT'],
      dtype='object')

# MODEL

In [67]:
hp_dictionary = {"MAX_LEN":256 ,
    "TRAIN_BATCH_SIZE": 16,
    "VALID_BATCH_SIZE": 16,
    "EPOCHS": 4,
    "LEARNING_RATE": 1e-05
}

In [68]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [69]:
MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE = set_hyperparams(hp_dictionary)

test_proportion = 0.05
test_size = 0.9

train_df, val_df, df_test, df_labels  = pre_process_data(df, test_proportion, test_size)


train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BERTClass()
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

proba_threshold = 0.5
val_targets=[]
val_outputs=[]

ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"

['color_sum', 'threat_sum', 'fading_sum', 'form_sum', 'kinesthetics_sum']
Total amount of data: 380
Number of rows used to TRAIN: 325
Number of rows used to VALIDATE: 36
Number of rows used to TEST: 19




In [72]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.466286799620133
F1 Score (Weighted): 0.466286799620133
Classification report: 
              precision    recall  f1-score   support

           0       0.20      0.60      0.30         5
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.85      0.48      0.61        23
           4       0.20      0.12      0.15         8

    accuracy                           0.42        36
   macro avg       0.25      0.24      0.21        36
weighted avg       0.61      0.42      0.47        36

############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.027545 	Average Validation Loss: 0.185199
Validation loss decreased (inf --> 0.185199).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5385802469135802
F1 Score (Weighted): 0.5385802469135802
Classification report: 
              precision    recall  f1-score   support

           0       0.73      0.52      0.61        21
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.46      0.43      0.44        14
           4       0.20      1.00      0.33         1

    accuracy                           0.50        36
   macro avg       0.28      0.39      0.28        36
weighted avg       0.61      0.50      0.54        36

############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.025147 	Average Validation Loss: 0.175182
Validation loss decreased (0.185199 --> 0.175182).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5723223223223222
F1 Score (Weighted): 0.5723223223223222
Classification report: 
              precision    recall  f1-score   support

           0       0.80      0.55      0.65        22
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.46      0.46      0.46        13
           4       0.20      1.00      0.33         1

    accuracy                           0.53        36
   macro avg       0.29      0.40      0.29        36
weighted avg       0.66      0.53      0.57        36

############# Epoch 3: Validation End     #############
Epoch: 3 	Avgerage Training Loss: 0.024294 	Average Validation Loss: 0.172149
Validation loss decreased (0.175182 --> 0.172149).  Saving model ...
############# Epoch 3  Done   #############

############# Epoch 4: Training Start   #############
############# Epoch 4: Training End     #############
############# Epoch 4: Validation Start   #############


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5954715219421103
F1 Score (Weighted): 0.5954715219421103
Classification report: 
              precision    recall  f1-score   support

           0       0.73      0.58      0.65        19
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.62      0.53      0.57        15
           4       0.20      0.50      0.29         2

    accuracy                           0.56        36
   macro avg       0.31      0.32      0.30        36
weighted avg       0.65      0.56      0.60        36

############# Epoch 4: Validation End     #############
Epoch: 4 	Avgerage Training Loss: 0.023690 	Average Validation Loss: 0.166850
Validation loss decreased (0.172149 --> 0.166850).  Saving model ...
############# Epoch 4  Done   #############



TEST

In [73]:
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

test_data_loader = torch.utils.data.DataLoader(test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [74]:
metrics_targets, metrics_outputs =  compute_test_metrics(test_data_loader)

metrics_outputs = np.array(metrics_outputs, dtype=int)

val_f1 = f1_score(metrics_outputs, metrics_targets, average="weighted")
val_acc = accuracy_score(metrics_outputs, metrics_targets)
class_report = classification_report(metrics_outputs, metrics_targets)
print((f"Accuracy: {val_f1}"))
print((f"F1 Score (Weighted): {val_f1}"))
print((f"Classification report: \n{class_report}"))

Accuracy: 0.593440122044241
F1 Score (Weighted): 0.593440122044241
Classification report: 
              precision    recall  f1-score   support

           0       0.67      0.33      0.44         6
           1       0.00      0.00      0.00         0
           3       0.75      0.82      0.78        11
           4       0.00      0.00      0.00         2

    accuracy                           0.58        19
   macro avg       0.35      0.29      0.31        19
weighted avg       0.64      0.58      0.59        19



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
