# Google Colab implementation of Roberta_zh for classification

To run, first set `ROOT` to the directory containing the `load_data.py`, `model.py`, and `utils.py` files, as well as the `roberta_config.json` configuration file.

Within the configuration file, set the data path and inference data path. 

Finally, alterations are necessary within the data loader. Ensure that the column names of the imported dataframes and the one-hot encoding schemes match the data used.

In [3]:
# Directory that holds load_data.py, model.py, utils.py and the JSON config file.
# It is appended to sys.path further down, and the Experiment class resolves the
# configuration file and its output folder relative to it.
ROOT = 'insert directory containing files'

In [None]:
!pip install transformers

In [5]:
import sys
from google.colab import drive

# Mount Google Drive at /content/gdrive (ROOT is presumably located there).
drive.mount('/content/gdrive')

# Make the project modules under ROOT importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if ROOT not in sys.path:
    sys.path.append(ROOT)

Mounted at /content/gdrive


In [None]:
#Import py files
from load_data import *
from model import *
from utils import *

# The Experiment class

In [None]:
from cmath import inf
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from datetime import datetime
import gc
import os

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class Experiment(object):
    """Fine-tuning, evaluation and inference driver for the classifier.

    The experiment is configured from ``<name>.json`` located under ``ROOT``.
    Checkpoints, loss logs and the loss plot are written to
    ``ROOT/<experiment_name>``, where ``experiment_name`` comes from the
    configuration file.
    """

    def __init__(self, name):
        """Load the configuration and build model, data loaders and optimizer.

        Args:
            name: Configuration file name without the ``.json`` extension.

        Raises:
            FileNotFoundError: If ``read_file`` returns ``None`` for the
                configuration path.
        """
        # os.path.join works whether or not ROOT ends with a path separator,
        # matching how the experiment directory is built below.
        config_path = os.path.join(ROOT, name + '.json')
        experiment_config = read_file(config_path)
        if experiment_config is None:
            raise FileNotFoundError(f"Configuration file doesn't exist: {config_path}")

        # Name, raw configuration and output directory of this experiment
        self.__name = experiment_config['experiment_name']
        self.__experiment_config = experiment_config
        self.__experiment_dir = os.path.join(ROOT, self.__name)

        # General model, tokenizer, and specifically configured model
        self.__model, self.__tokenizer, self.__config_model = classification_model(experiment_config)

        # Loaders and dataframes; the third and fourth arguments specify the
        # one-hot schema for the labels
        (self.__train_loader, self.__val_loader, self.__test_loader,
         self.__train_df, self.__val_df, self.__test_df) = get_dataset(
            experiment_config,
            self.__tokenizer,
            ['m', 'T', 'lab', 'temp'],
            [0, 1, 1, 1],
            'lab',
        )
        self.__infer_loader, self.__infer_df = inference_data_processing(experiment_config, self.__tokenizer)

        # Hyper-parameters and storage for training output
        self.__num_labels = experiment_config['dataset']['num_labels']
        self.__epochs = experiment_config['experiment']['num_epochs']
        self.__learning_rate = experiment_config['experiment']['learning_rate']
        self.__current_epoch = 0
        self.__training_losses = []
        self.__val_losses = []
        self.__best_model = None  # CPU snapshot of the best weights seen during run()
        self.__best_f1 = 0
        self.__best_f1_scores = None

        self.__device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.__criterion = nn.CrossEntropyLoss()

        # Move the model to its final device before the optimizer is created
        self.__init_model()

        # A list-valued learning rate is a per-epoch schedule; start with its
        # first entry (run() applies the remaining ones).
        if isinstance(self.__learning_rate, list):
            initial_lr = self.__learning_rate[0]
        else:
            initial_lr = self.__learning_rate
        self.__optimizer = torch.optim.Adam(params=self.__model.parameters(), lr=initial_lr)

        # Make sure the output directory exists
        os.makedirs(self.__experiment_dir, exist_ok=True)

    def __init_model(self):
        """Move the model (as float32) and the criterion to the GPU if present."""
        if torch.cuda.is_available():
            self.__model = self.__model.cuda().float()
            self.__criterion = self.__criterion.cuda()

    def run(self):
        """Train and validate for the configured number of epochs.

        After every epoch the latest checkpoint, the loss files and the loss
        plot are refreshed; ``best_model.pt`` is rewritten whenever the
        validation F1 is at least as good as the best one so far.

        Returns:
            Tuple ``(labels, preds, cnkis, tr_labels, tr_preds, tr_cnkis,
            val_df, train_df)``. The first three are per-epoch lists of the
            validation labels, predictions and ``cnki_id`` values, the next
            three are the same for training, followed by the validation and
            training dataframes.
        """
        f1_scores = []
        tr_labels, tr_preds, tr_cnkis = [], [], []
        labels, preds, cnkis = [], [], []

        for epoch in range(self.__current_epoch, self.__epochs):
            start_time = datetime.now()
            # Keep the epoch counter current so the epoch log reports the
            # right epoch number and ETA.
            self.__current_epoch = epoch
            print(f"### Training epoch: {epoch + 1}")

            # With a list of learning rates, use the one for this epoch. If
            # the list is shorter than the number of epochs, keep using its
            # last entry instead of failing mid-training.
            if isinstance(self.__learning_rate, list):
                schedule = self.__learning_rate
                epoch_lr = schedule[min(epoch, len(schedule) - 1)]
                for group in self.__optimizer.param_groups:
                    group['lr'] = epoch_lr
                lr = self.__optimizer.param_groups[0]['lr']
                print(f'### LR = {lr}\n')

            # Losses, predictions, and corresponding article IDs
            train_loss, train_f1, tr_lab_list, tr_pred_list, tr_cnki_list = self.__train()
            val_loss, val_f1, lab_list, pred_list, cnki_list = self.__val()

            labels.append(lab_list)
            preds.append(pred_list)
            cnkis.append(cnki_list)

            tr_labels.append(tr_lab_list)
            tr_preds.append(tr_pred_list)
            tr_cnkis.append(tr_cnki_list)

            # Track the best validation F1 and persist the matching weights
            f1_scores.append(val_f1)
            if val_f1 >= self.__best_f1:
                self.__best_f1 = val_f1
                # state_dict() hands out live references, so copy the tensors;
                # otherwise later epochs would overwrite the "best" weights.
                self.__best_model = {
                    key: value.detach().cpu().clone()
                    for key, value in self.__model.state_dict().items()
                }
                self.__save_model(model_path='best_model.pt')

            # Store stats and latest model
            self.__record_stats(train_loss, val_loss)
            self.__log_epoch_stats(start_time)
            self.__save_model()

        # Conclude by printing best F1 and list of all F1 scores
        print('Training Ended')
        print('Best F1:')
        print(self.__best_f1)
        print('f1 list:')
        print(f1_scores)
        print()

        # housekeeping
        gc.collect()
        torch.cuda.empty_cache()

        return labels, preds, cnkis, tr_labels, tr_preds, tr_cnkis, self.__val_df, self.__train_df

    def __train(self):
        """Run one optimisation pass over the training loader.

        Returns:
            Tuple ``(train_loss, train_f1, lab_list, pred_list, cnki_list)``:
            mean batch loss, F1 over all training examples, and the flat lists
            of labels, predictions and ``cnki_id`` values.
        """
        self.__model.train()
        max_grad_norm = self.__experiment_config['model']['max_grad_norm']

        loss_list, lab_list, pred_list, cnki_list = [], [], [], []

        for idx, batch in enumerate(self.__train_loader):
            # Move the batch to the compute device
            ids = batch['input_ids'].to(self.__device, dtype=torch.long)
            mask = batch['attention_mask'].to(self.__device, dtype=torch.long)
            labels = batch['label'].to(self.__device, dtype=torch.long)
            cnkis = batch['cnki_id']

            # Loss and logits for the batch
            loss, tr_logits = self.__model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
            loss_list.append(loss.item())

            # Print the running mean loss every 200 steps
            if idx % 200 == 0:
                print(f"Training loss after {idx:04d} training steps: {np.mean(loss_list)}")

            # Flatten targets and take the argmax over the label dimension
            flattened_targets = labels.view(-1).cpu().numpy()
            curr_logits = tr_logits.view(-1, self.__model.num_labels)
            predictions = torch.argmax(curr_logits, dim=1).cpu().numpy()

            lab_list.extend(flattened_targets)
            pred_list.extend(predictions)
            cnki_list.extend(cnkis)

            # Backward pass. Clipping must happen after backward() (when the
            # gradients of this step exist) and before step().
            self.__optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                parameters=self.__model.parameters(), max_norm=max_grad_norm
            )
            self.__optimizer.step()

        train_loss = np.mean(loss_list)
        # NOTE(review): f1_score is called with its default averaging mode;
        # confirm that matches the label set when num_labels > 2.
        train_f1 = f1_score(lab_list, pred_list)
        print(f"Training loss epoch: {train_loss}")
        print(f"Training f1 for epoch: {train_f1}")

        return train_loss, train_f1, lab_list, pred_list, cnki_list

    def __evaluate(self, loader):
        """Score the current model on a labelled loader without gradients.

        Args:
            loader: Iterable of batches with ``input_ids``, ``attention_mask``,
                ``label`` and ``cnki_id`` entries.

        Returns:
            Tuple ``(mean_loss, f1, label_list, pred_list, cnki_list)`` with
            the three lists flattened across batches.
        """
        self.__model.eval()
        loss_list, label_list, pred_list, cnki_list = [], [], [], []

        with torch.no_grad():
            for batch in loader:
                ids = batch['input_ids'].to(self.__device, dtype=torch.long)
                mask = batch['attention_mask'].to(self.__device, dtype=torch.long)
                labels = batch['label'].to(self.__device, dtype=torch.long)

                loss, logits = self.__model(ids, attention_mask=mask, labels=labels, return_dict=False)
                loss_list.append(loss.item())
                label_list.extend(labels.view(-1).cpu().numpy())
                pred_list.extend(torch.argmax(logits, dim=-1).cpu().numpy())
                cnki_list.extend(batch['cnki_id'])

        return np.mean(loss_list), f1_score(label_list, pred_list), label_list, pred_list, cnki_list

    def __load_best_checkpoint(self, model_loc=None):
        """Load checkpoint weights into the model and switch it to eval mode.

        Args:
            model_loc: Path of the checkpoint file. ``None`` selects
                ``best_model.pt`` in the experiment directory.
        """
        if model_loc is None:
            model_loc = os.path.join(self.__experiment_dir, 'best_model.pt')
        # map_location lets a checkpoint saved on GPU load on a CPU-only runtime
        best_checkpoint = torch.load(model_loc, map_location=self.__device)
        self.__model.load_state_dict(best_checkpoint['model'])
        self.__model = self.__model.to(self.__device)
        self.__model.eval()
        self.__best_model = self.__model

    def __val(self):
        """Evaluate on the validation loader and print loss and F1.

        Returns:
            Tuple ``(val_loss, val_f1, label_list, pred_list, cnki_list)``.
        """
        val_loss, val_f1, label_list, pred_list, cnki_list = self.__evaluate(self.__val_loader)

        print(f"Validation loss epoch: {val_loss}")
        print(f"Validation f1 for epoch: {val_f1}")
        return val_loss, val_f1, label_list, pred_list, cnki_list

    def test(self, model_loc=None):
        """Evaluate a saved checkpoint on the test loader.

        Args:
            model_loc: Path of the checkpoint to load. ``None`` selects
                ``best_model.pt`` in the experiment directory.

        Returns:
            Tuple ``(test_loss, test_f1, label_list, pred_list, cnki_list)``.
        """
        self.__load_best_checkpoint(model_loc)
        test_loss, test_f1, label_list, pred_list, cnki_list = self.__evaluate(self.__test_loader)

        print(f"Test loss: {test_loss}")
        print(f"Test f1: {test_f1}")
        return test_loss, test_f1, label_list, pred_list, cnki_list

    def infer(self, model_loc=None):
        """Predict labels for the unlabelled inference loader.

        Args:
            model_loc: Path of the checkpoint to load. ``None`` selects
                ``best_model.pt`` in the experiment directory.

        Returns:
            Tuple ``(pred_list, cnki_list)`` holding one entry per batch: the
            array of predicted class indices and the batch's ``cnki_id`` values.
        """
        self.__load_best_checkpoint(model_loc)

        pred_list, cnki_list = [], []
        with torch.no_grad():
            for batch in self.__infer_loader:
                ids = batch['input_ids'].to(self.__device, dtype=torch.long)
                mask = batch['attention_mask'].to(self.__device, dtype=torch.long)

                # Without labels the model returns a tuple whose first item
                # holds the logits.
                outputs = self.__model(ids, attention_mask=mask, return_dict=False)
                all_preds = torch.argmax(outputs[0], dim=-1).cpu().numpy()
                cnki_list.append(batch['cnki_id'])
                pred_list.append(all_preds)

        return pred_list, cnki_list

    # Auxiliary functions
    def __save_model(self, model_path='latest_model.pt'):
        """Write model and optimizer state to ``model_path`` in the experiment dir."""
        root_model_path = os.path.join(self.__experiment_dir, model_path)
        state_dict = {
            'model': self.__model.state_dict(),
            'optimizer': self.__optimizer.state_dict(),
        }
        torch.save(state_dict, root_model_path)

    def __record_stats(self, train_loss, val_loss):
        """Append the epoch losses, refresh the plot and rewrite the loss files."""
        self.__training_losses.append(train_loss)
        self.__val_losses.append(val_loss)

        self.plot_stats()

        write_to_file_in_dir(self.__experiment_dir, 'training_losses.txt', self.__training_losses)
        write_to_file_in_dir(self.__experiment_dir, 'val_losses.txt', self.__val_losses)

    def __log(self, log_str, file_name=None):
        """Print ``log_str`` and append it to ``all.log`` (and ``file_name`` if given)."""
        print(log_str)
        log_to_file_in_dir(self.__experiment_dir, 'all.log', log_str)
        if file_name is not None:
            log_to_file_in_dir(self.__experiment_dir, file_name, log_str)

    def __log_epoch_stats(self, start_time):
        """Log losses, duration and ETA of the epoch that just finished.

        Args:
            start_time: ``datetime`` taken when the epoch started.
        """
        time_elapsed = datetime.now() - start_time
        time_to_completion = time_elapsed * (self.__epochs - self.__current_epoch - 1)
        # The most recently recorded losses belong to the epoch being logged
        train_loss = self.__training_losses[-1]
        val_loss = self.__val_losses[-1]
        summary_str = "Epoch: {}, Train Loss: {}, Val Loss: {}, Took {}, ETA: {}\n"
        summary_str = summary_str.format(self.__current_epoch + 1, train_loss, val_loss, str(time_elapsed),
                                         str(time_to_completion))
        self.__log(summary_str, 'epoch.log')

    def plot_stats(self):
        """Plot training and validation loss per epoch and save ``stat_plot.png``."""
        e = len(self.__training_losses)
        x_axis = np.arange(1, e + 1, 1)
        fig = plt.figure()
        plt.plot(x_axis, self.__training_losses, label="Training Loss")
        plt.plot(x_axis, self.__val_losses, label="Validation Loss")
        plt.xlabel("Epochs")
        plt.legend(loc='best')
        plt.title(self.__name + " Stats Plot")
        plt.savefig(os.path.join(self.__experiment_dir, "stat_plot.png"))
        plt.show()
        # A new figure is opened every epoch; close it so figures do not pile up
        plt.close(fig)

In [None]:
# Train using the settings in roberta_config.json. run() hands back the per-epoch
# validation and training labels / predictions / ids plus the two dataframes.
exp = Experiment('roberta_config')
run_outputs = exp.run()
labels, preds, cnkis, tr_labels, tr_preds, tr_cnkis, val_df, tr_df = run_outputs

# Score the saved best checkpoint on the test split
test_outputs = exp.test()
test_loss, test_f1, label_list, pred_list, cnki_list = test_outputs

In [None]:
# Fresh experiment object plus the checkpoint file whose weights should be used
exp = Experiment('roberta_config')
model_loc = 'BEST MODEL DIR HERE'

# Predict labels for the new instances; results come back one entry per batch
inference_outputs = exp.infer(model_loc=model_loc)
inf_pred_list, inf_cnki_list = inference_outputs

In [None]:
# Convert the per-batch model output into one dataframe row per instance.
# pandas is imported here explicitly rather than relying on a star import
# from the project modules to provide `pd`.
import pandas as pd

# Flatten the per-batch prediction arrays and id collections
all_preds = [pred for batch_preds in inf_pred_list for pred in batch_preds]
all_cnkis = [cnki for batch_cnkis in inf_cnki_list for cnki in batch_cnkis]

df = pd.DataFrame({'cnki_id': all_cnkis, 'inf_lab': all_preds})