In [None]:
! pip install transformers
! pip install datasets



In [None]:
# imports
import pandas as pd
import numpy as np
import pdb
import os
os.chdir('/content/drive/MyDrive/CAMemBERT2')
import re
from datasets import Dataset,DatasetDict,load_dataset,concatenate_datasets
from transformers import AutoTokenizer
from torch import FloatTensor
from torch.cuda import is_available
from math import ceil,floor
import random
import json
import time

# A class that merges the FCE grammatical error detection (GED) dataset with the FCE automated essay scoring (AES) dataset
# so that grammar tags and essay scores can be used together.
# For further notes on how this class works, and for additional essay-matching methods, see the notes at the bottom of this cell.
class LinkGedDatasetToEssayDataset:
    """Attach GED token labels to the essays of one FCE split.

    On construction the essay file ``data/fce.<set_type>.json`` and the token file
    ``data/fce-public.<set_type>.original.tsv`` are read (both paths are relative to
    the current working directory). Every essay whose whitespace-free text occurs
    verbatim in the whitespace-free concatenation of all GED words is kept: its text
    is rebuilt from the GED words (joined by single spaces) and paired with the
    corresponding GED labels. Essays without an exact match are dropped.

    Parameters
    ----------
    set_type : str
        Name of the split used to build the two file names ('train', 'dev' or 'test').
    """

    # mapping for string labels to numerical ones in ged dataset
    _map_labels_2_ids = {'c':0,'i':1}

    def __init__(self,set_type='train'):
        # counter for the number of differences where characters appear in the aes dataset but not ged dataset
        self.errors_count=0
        # set type can be train, test, dev
        self.set_type = set_type
        self.essays = self.read_and_parse_essay_data()
        # mapping of column names to positions in an itertuples() row (position 0 is the index, hence i+1)
        self.essay_col_index = {col:i+1 for i,col in enumerate(self.essays)}
        self.ged = self.read_and_parse_ged_data()
        # string containing all words in ged dataset with no whitespace
        self.all_words_no_ws = ''.join(self.ged.word.tolist())
        # final dataframe with essay text, essay scores, essay ids and essay grammar labels
        self.updated_df = self.match_essays_and_grammar_labels()

    def get_updated_df(self):
        """Return the merged dataframe (columns: text, labels, answer-s, script-s, id)."""
        return self.updated_df

    def read_and_parse_essay_data(self):
        """Read the JSON-lines essay file and add whitespace-free text plus character offsets.

        Returns
        -------
        pandas.DataFrame
            The essay records with the extra columns ``text_no_ws``, ``essay_char_len``,
            ``end_word_ind`` (cumulative character count), ``start_ind`` and ``ind_combined``.
        """
        essays = pd.read_json(f'data/fce.{self.set_type}.json',lines=True)
        # literal replacement: newlines inside an essay become single spaces
        essays['text'] = essays.text.str.replace('\n',' ',regex=False)
        essays['text_no_ws'] = essays.text.str.split().str.join('')
        essays['essay_char_len'] = essays['text_no_ws'].apply(len)
        essays['end_word_ind'] = essays['essay_char_len'].cumsum()
        essays['start_ind'] = essays['end_word_ind'] - essays['essay_char_len']
        essays['ind_combined'] = essays.apply(lambda x: list([x['start_ind'],x['end_word_ind']]),axis=1)
        return essays

    def read_and_parse_ged_data(self):
        """Read the GED file into one row per token with a numeric label and a cumulative end offset.

        Returns
        -------
        pandas.DataFrame
            Columns ``word``, ``labels`` (mapped through ``_map_labels_2_ids``) and
            ``end_word_ind`` (cumulative sum of the word lengths).
        """
        # The two-space separator keeps every line in a single column (this assumes no line
        # contains two consecutive spaces); the line is split into word and label below.
        # engine='python' is what pandas falls back to for a multi-character separator anyway.
        raw_lines = pd.read_csv(f'data/fce-public.{self.set_type}.original.tsv',sep='  ',names=['word'],engine='python')
        # `n` must be passed by keyword: it is keyword-only from pandas 2.0 onwards
        word_and_label = raw_lines.word.str.split('\t',n=1).tolist()
        ged = pd.DataFrame(word_and_label,columns = ['word','labels'])
        ged['labels'] = ged['labels'].map(self._map_labels_2_ids)
        ged['end_word_ind'] = ged.word.apply(len).cumsum()
        return ged

    def match_essays_and_grammar_labels(self):
        """Pair every exactly-matching essay with its GED words and labels.

        Returns
        -------
        pandas.DataFrame
            One row per matched essay with columns ``text`` (GED words joined by spaces),
            ``labels`` (list of label ids), ``answer-s``, ``script-s`` and ``id``.
            An empty frame with these columns is returned when nothing matches.
        """
        word_ends = self.ged['end_word_ind']
        kept_rows,essays_from_ged,labels_from_ged = [],[],[]
        for row,essay_no_ws in enumerate(self.essays['text_no_ws'].tolist()):
            # first occurrence of the essay inside the joined ged text (-1 when absent)
            start = self.all_words_no_ws.find(essay_no_ws)
            if start == -1:
                continue
            stop = start + len(essay_no_ws)
            # a ged word belongs to the essay when its end offset lies in (start, stop]
            ged_rows = self.ged.loc[(word_ends > start) & (word_ends <= stop)]
            kept_rows.append(row)
            essays_from_ged.append(' '.join(ged_rows['word'].tolist()))
            labels_from_ged.append(ged_rows['labels'].tolist())

        if not kept_rows:
            return pd.DataFrame(columns=['text','labels','answer-s','script-s','id'])

        # make sure essays dataframe only contains essays that have matched
        essays = self.essays.iloc[kept_rows].reset_index()
        return pd.concat([pd.DataFrame({'text':essays_from_ged,'labels':labels_from_ged}),essays[['answer-s','script-s','id']]],axis=1)


# Class to create a Hugging Face DatasetDict for the FCE dataset for the tasks of AES and GED
class CreateHuggingFaceDictFce:
    """Build a tokenized Hugging Face ``DatasetDict`` of the FCE data for AES and GED.

    Construction loads the tokenizer, merges the train/dev/test splits produced by
    ``LinkGedDatasetToEssayDataset`` into one dataframe, converts it to a
    ``DatasetDict`` with the columns listed in ``_cols_to_keep`` and computes class
    weights for the two GED classes.

    Parameters
    ----------
    pretrained_model : str
        Name passed to ``AutoTokenizer.from_pretrained``.
    max_length : int
        Length that input ids, attention masks and labels are padded/truncated to.
    scoring : str
        'script' (one row per script, using ``script-s``) or 'answer'
        (one row per answer, using ``answer-s`` mapped through ``_answer_score_mapping``).
    """

    # class variables
    # possible set types
    _set_types = ['train','dev','test']
    _cols_to_keep = ['attention_mask','labels','input_ids','scores']
    _answer_score_mapping = {
                      0.0:0,
                      1.1:1,1.2:4,1.3:8,
                      2.1:9,2.2:10,2.3:11,
                      3.1:12,3.2:13,3.3:14,
                      4.1:15,4.2:16,4.3:17,
                      5.1:18,5.2:19,5.3:20,
                  }

    def __init__(self,pretrained_model= 'distilroberta-base',max_length=512,scoring='script'):
        # max length for tokenization
        self.max_length = max_length
        # huggingface tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.scoring = scoring
        self.fce_df = self.parse_data()
        self.dataset_dict = self.create_hf_dataset_dict()
        self.set_weights()

    def get_df(self):
        """Return the merged dataframe built by ``parse_data``."""
        # the original returned `self.df_for_sent`, an attribute that is never assigned
        return self.fce_df

    def get_dataset_dict(self):
        """Return the tokenized ``DatasetDict``."""
        return self.dataset_dict

    def get_weights(self):
        """Return the class weights computed by ``set_weights``."""
        return self.class_weights

    def parse_data(self):
        """Merge all splits into one dataframe with columns text, labels, scores, set_type.

        Raises
        ------
        ValueError
            If ``self.scoring`` is neither 'script' nor 'answer'.
        """
        if self.scoring not in ('script','answer'):
            raise ValueError(f"scoring must be 'script' or 'answer', got {self.scoring!r}")
        # create one dataframe containing all samples from all train, test and dev set
        combined_df = pd.concat([self.add_col(LinkGedDatasetToEssayDataset(set_type).get_updated_df(),'set_type',set_type) for set_type in self._set_types],axis=0)
        if self.scoring == 'script':
            df = combined_df.rename(columns={'script-s':'scores'})
            df['scores'] = df['scores'].astype(float)
            # group all essays by id and combine the text and labels as well as merge the scores and set types
            df = df.groupby('id').agg({'text':list,'labels':list,'scores':list,'set_type':list})
            df['text'] = df['text'].str.join(' ')
            # concatenate the labels of every answer so they stay aligned with the joined text
            # (the original only concatenated the first two answers)
            df['labels'] = df['labels'].apply(lambda per_answer: [tag for tags in per_answer for tag in tags])
            # the script score and set type are taken from the first answer of the script
            df['scores'] = df['scores'].apply(lambda x: x[0])
            df['set_type'] = df['set_type'].apply(lambda x: x[0])
        else:
            df = combined_df.rename(columns={'answer-s':'scores'})
            # correct for errors in scoring: '/' is read as a decimal point and a 'T' marker is removed.
            # astype(str) guards against a split whose scores were already parsed as numbers.
            cleaned = df['scores'].astype(str).str.replace('/','.',regex=False).str.replace('T','',regex=False)
            # non-numeric values become NaN and therefore never match a mapping key
            numeric = pd.to_numeric(cleaned,errors='coerce')
            keep = numeric.isin(list(self._answer_score_mapping)).to_numpy()
            df = df.loc[keep].copy()
            # map scores to new values (positional assignment, the concatenated index is not unique)
            df['scores'] = numeric[keep].map(self._answer_score_mapping).to_numpy()
        # return only certain columns from the original dataset
        return df.reset_index()[['text','labels','scores','set_type']]

    # used to add set type column to each of the train test and dev dataframes
    def add_col(self,df,col,val):
        """Set column ``col`` of ``df`` to ``val`` in place and return ``df``."""
        df[col] = val
        return df

    def create_hf_dataset_dict(self):
        """Create the per-split datasets, extend the labels, tokenize and drop unused columns."""
        by_set_type = self.fce_df.groupby('set_type')
        # create a hugging face dataset for each of the train test and dev samples and combine them to create a huggingface dataset dictionary
        dataset_dict = DatasetDict({set_type:Dataset.from_pandas(by_set_type.get_group(set_type)) for set_type in self._set_types})
        # apply the method to extend the labels for grammatical error detection and tokenize each essay
        dataset_dict = dataset_dict.map(self.extend_labels_for_tokenizer).map(self.tokenize_text)
        # find the columns to drop from the new dataset dict
        cols_to_drop = set(dataset_dict.column_names['train']) - set(self._cols_to_keep)
        return dataset_dict.remove_columns(list(cols_to_drop))

    def extend_labels_for_tokenizer(self,example):
        """Expand word-level labels to sub-word tokens, padded with -100 up to ``max_length``.

        The first sub-token of a word receives the word's label; every further
        sub-token of the same word receives -100.

        NOTE(review): the labels are built with ``add_special_tokens=False`` whereas
        ``tokenize_text`` calls the tokenizer with its defaults, which presumably adds
        special tokens - confirm that labels and input ids line up downstream.
        NOTE(review): for '[SEP]' tokenizers the substring test also fires on a whole
        word that happens to be contained in the previous word - confirm intended.
        """
        # split text by white space to create individual tokens that correspond to labels
        tokens,labels = example['text'].split(),example['labels']
        sub_tokens = self.tokenizer.tokenize( ' '.join( tokens ) , truncation = True , padding = False , add_special_tokens = False , max_length = self.max_length )
        # the separator token is used to tell the two supported tokenizer families apart
        roberta_style = self.tokenizer.sep_token == '</s>'
        bert_style = self.tokenizer.sep_token == '[SEP]'
        labels_for_tokens = []
        # number of sub-tokens seen so far that continue a word
        split_count = 0
        for index, token in enumerate( sub_tokens ):
            # index of the most recent word that has already received a label
            prev_word = index - split_count - 1
            if roberta_style:
                continues_word = ( ( not token.startswith( "Ġ" ) and index != 0 )
                                   or ( prev_word >= 0 and token in tokens[ prev_word ].lower() ) )
            elif bert_style:
                continues_word = ( token.startswith( "##" )
                                   or ( prev_word >= 0 and token in tokens[ prev_word ].lower() ) )
            else:
                continues_word = False
            if continues_word:
                # add a padding label for the remaining pieces of a split word
                labels_for_tokens.append( -100 )
                split_count += 1
            else:
                # whole words and the first piece of a split word carry the word's label
                labels_for_tokens.append( labels[ index - split_count ] )
        # pad with -100 up to max_length; max(0, ...) keeps np.pad valid if the list is already long enough
        pad_width = max( 0 , self.max_length - len( labels_for_tokens ) )
        return {'labels':np.pad( labels_for_tokens , ( 0 , pad_width ) , 'constant' , constant_values = ( 0 , -100 ) )[:self.max_length]}

    # get the padded and truncated input ids and attention masks for each text (essay)
    def tokenize_text(self,example):
        """Tokenize ``example['text']`` padded/truncated to ``max_length``."""
        return self.tokenizer( example['text'] , truncation=True , padding = 'max_length' , max_length = self.max_length )

    # set weights to apply to the cross entropy loss function to penalise for under represented classes
    def set_weights(self):
        """Compute balanced weights for the correct (0) and incorrect (1) classes of the train split.

        Labels other than 0 and 1 (such as the -100 padding) are ignored.

        Raises
        ------
        ValueError
            If one of the two classes does not occur in the training labels.
        """
        train_labels = np.concatenate(self.get_dataset_dict()['train']['labels'])
        n_c = int(np.count_nonzero(train_labels == 0))
        n_i = int(np.count_nonzero(train_labels == 1))
        if n_c == 0 or n_i == 0:
            raise ValueError('training labels must contain both classes 0 and 1 to compute class weights')
        total = n_c + n_i
        class_weights = FloatTensor([total/(2.0 * n_c),total/(2.0 * n_i)]).to('cuda' if is_available() else 'cpu')
        self.class_weights = class_weights

# Class to create a huggingface DatasetDict for the conll2003 dataset for task of Named Entity Recognition NER
class CreateHuggingFaceDictNerandAesDataset(CreateHuggingFaceDictFce):
    """Extend the FCE dataset builder with a tokenized NER ``DatasetDict``.

    The NER sentences are flattened into one long token/tag stream (a separator
    token is appended to every sentence first) and re-cut into rows so that the NER
    train split has as many rows as the FCE train split. The result is tokenized
    with the same tokenizer and label-extension logic as the FCE data.
    """

    # class variables
    # columns read from the ner dataset before it is reshaped
    _cols_to_keep_before_dataset_conversion = ['tokens','ner_tags']
    ner_dataset = "conll2003"

    def __init__(self,pretrained_model= 'distilroberta-base',max_length=512,scoring='script'):
        super().__init__(pretrained_model,max_length,scoring)
        # load in ner dataset from huggingface
        dataset = load_dataset(self.ner_dataset).map(self.append_sep_and_pad)
        self._set_types_ner = list(dataset.keys())
        dataset_dict = self.reshape_to_match_to_fce_dataset(dataset)
        # apply tokenization
        dataset_dict = dataset_dict.map(self.extend_labels_for_tokenizer).map(self.tokenize_text).remove_columns(['text'])
        self.dataset_dict_ner = dataset_dict

    def get_ner_dataset_dict(self):
        """Return the tokenized NER ``DatasetDict``."""
        return self.dataset_dict_ner

    def get_fce_dataset_dict(self):
        """Return the tokenized FCE ``DatasetDict`` built by the parent class."""
        return self.dataset_dict

    # append a sep token to the end of each sample from the ner dataset and a pad label to the end of each tag list
    def append_sep_and_pad(self,example):
        """Append the tokenizer's separator token and a -100 tag to one NER sample (lists are extended in place)."""
        tokens,tags = example['tokens'],example['ner_tags']
        tokens.append(self.tokenizer.sep_token)
        tags.append(-100)
        return {'tokens':tokens,'ner_tags':tags}

    def _pad_or_truncate_row(self,row,fill_value):
        """Return ``row`` right-padded with ``fill_value`` or cut to ``max_length_for_training`` entries."""
        target = self.max_length_for_training
        # np.pad rejects negative widths, so rows longer than the target are only truncated
        missing = max(0,target - len(row))
        return np.pad(row,(0,missing),constant_values = (fill_value,fill_value))[:target]

    def reshape_to_match_to_fce_dataset(self,dataset):
        """Re-cut the flattened NER data into fixed-width rows and return it as a ``DatasetDict``.

        The train split is cut into as many rows as the FCE train split has; the row
        width found there (capped at ``max_length``) is stored in
        ``self.max_length_for_training`` and reused for the remaining splits, which
        get as many rows as needed. The 'train' split therefore has to be visited
        before the others.
        """
        dataset_dict = {}
        # row width of the reshaped ner data, fixed while processing the train split
        self.max_length_for_training = None
        # the fce set type paired by zip is only consulted for the train split
        for set_type,set_type_fce in zip(self._set_types_ner,self._set_types):
            # one Series per kept column: [texts, labels]
            type_data = []
            for col in self._cols_to_keep_before_dataset_conversion:
                # flatten all the values in the current column to a 1d array
                flattened_values = np.concatenate(dataset[set_type][col])
                if set_type=='train':
                    # cut the stream into exactly as many rows as the fce train split has
                    split_arr = np.array_split(flattened_values,self.get_fce_dataset_dict()[set_type_fce].num_rows)
                    if self.max_length_for_training is None:
                        longest_row = max(len(row) for row in split_arr)
                        self.max_length_for_training = min(longest_row,self.max_length)
                else:
                    # keep the row width of the train split and use as many rows as required
                    length_of_new_arr = ceil(len(flattened_values)/self.max_length_for_training )
                    split_arr = np.array_split(flattened_values,length_of_new_arr)

                # pad each row in the split array to have the same length
                is_text = col == 'tokens'
                fill_value = self.tokenizer.pad_token if is_text else -100
                padded_array = []
                for row in split_arr:
                    padded_row = list(self._pad_or_truncate_row(row,fill_value))
                    padded_array.append(' '.join(padded_row) if is_text else padded_row)
                type_data.append(pd.Series(padded_array))
            # create a hugging face dataset from the set type
            tmp_df = pd.DataFrame({'text':type_data[0],'labels':type_data[1]})
            dataset_dict[set_type] = Dataset.from_pandas(tmp_df)
        # combine all data into datasetdict
        return DatasetDict(dataset_dict)

# to note:
# 			- the NER training dataset is padded to the same width and length as the FCE dataset so that it can be loaded into a model
#			  without the need for over- or under-sampling.
class CreateHuggingFaceMultiTask(CreateHuggingFaceDictNerandAesDataset):
    """Combine the FCE and NER datasets into one multi-task ``DatasetDict``.

    Every row gets a ``dataset`` column (0 for FCE rows, 1 for NER rows); NER rows
    additionally get a ``scores`` column filled with -100. For the train split the
    rows are ordered so that mini-batches of the two datasets alternate; the other
    splits are plain concatenations (FCE rows first).

    Parameters
    ----------
    pretrained_model, max_length, scoring
        Forwarded to the parent class.
    batch_size : int
        Number of rows per alternating mini-batch in the train split.
    """

    # class variables
    _col_to_add_to_ner = ['score']
    _set_types = ['train','test','dev']
    _map_fce_set_to_ner = {'train':'train','test':'test','dev':'validation'}
    _map_ner_set_to_fce = {'train':'train','test':'test','validation':'dev'}
    _tasks = ['aes','ged','ner']

    def __init__(self,pretrained_model='distilroberta-base',max_length=512, scoring='script',batch_size=8):
        super().__init__(pretrained_model,max_length, scoring)
        self.fce_dataset_dict = self.get_fce_dataset_dict()
        self.ner_dataset_dict = self.get_ner_dataset_dict()
        self.batch_size = batch_size
        for set_type in self._set_types:
            # add a dataset column to fce dataset
            self.fce_dataset_dict[set_type] = self.fce_dataset_dict[set_type].add_column('dataset',[0]*self.fce_dataset_dict[set_type].num_rows)
        for set_type in self._set_types_ner:
            # add a score column to ner dataset and pad values
            self.ner_dataset_dict[set_type] = self.ner_dataset_dict[set_type].add_column('scores',[-100]*self.ner_dataset_dict[set_type].num_rows)
            # add a dataset column to ner dataset
            self.ner_dataset_dict[set_type] = self.ner_dataset_dict[set_type].add_column('dataset',[1]*self.ner_dataset_dict[set_type].num_rows)
            # make sure all columns are of the same dataset type
            self.ner_dataset_dict[set_type] = self.ner_dataset_dict[set_type].cast(self.fce_dataset_dict[self._map_ner_set_to_fce[set_type]].features)
        # combine the two datasets through concatenation
        self.combined_dataset_dict = self.combine_datasets()

    def get_combined_dataset_dict(self):
        """Return the combined multi-task ``DatasetDict``."""
        return self.combined_dataset_dict

    # order the data so that tasks alternate batch by batch for training
    # and all samples from one task follow all samples of the other for testing and dev.
    def combine_datasets(self):
        """Concatenate the FCE and NER splits; interleave them batch-wise for 'train'.

        For the train split the first ``min(lengths)`` positions of each dataset are
        shuffled once and cut into chunks of ``batch_size`` (the last chunk may be
        shorter). Each chunk is emitted first for the FCE rows and then, with the
        same positions, for the NER rows. Rows of the longer dataset beyond
        ``min(lengths)`` are not selected.
        """
        dataset_dict = {}
        for set_type in self._set_types:
            # concatenate datasets so one follows another from a list of all the datasets
            dataset_lst = [self.get_fce_dataset_dict()[set_type],self.get_ner_dataset_dict()[self._map_fce_set_to_ner[set_type]]]
            concatenated_datasets = concatenate_datasets(dataset_lst)
            if set_type != 'train':
                dataset_dict[set_type] = concatenated_datasets
                continue
            # get the length of each dataset
            lengths = [dset.num_rows for dset in dataset_lst]
            # offset of each dataset inside the concatenated dataset
            offsets = np.cumsum([0] + lengths[:-1])
            n_shared = min(lengths)
            # one random permutation cut into chunks gives the same kind of batches as repeatedly
            # sampling without replacement, without a bare except and without rebuilding the index list
            shuffled = random.sample(range(n_shared),n_shared)
            batch_order = [shuffled[start:start + self.batch_size] for start in range(0,n_shared,self.batch_size)]
            # for every mini batch emit the rows of each dataset in turn
            ordered_rows = [int(offset) + position
                            for mini_batch in batch_order
                            for offset in offsets
                            for position in mini_batch]
            # select samples in order defined above
            dataset_dict[set_type] = concatenated_datasets.select( np.array(ordered_rows,dtype=np.int64) )
        return DatasetDict(dataset_dict)



 


# further notes:
#           - each row in the ged dataset represents one token and its corresponding error label,
#             whereas one row in the aes dataset represents one essay and its score.
#           - each token has been formed using rasp tokenization.
#           - words in the ged dataset appear in the same order as they do in the essay dataset.
#           - some essays in the ged dataset contain extra words or are missing words when compared to the original dataset.
#             these essays are omitted from the dataset.
# Works by: 
#           - indexing the end of every word in the ged dataset
#           - joining all the words in the ged dataset together with no whitespace 
#             (creating one string of all the words in the ged dataset (referred to as ged text))
#           - joining all the words in each individual essay together with no whitespace
#           - locating where the essay with no whitespace matches the sequence of words in the joined ged text 
#           - using the start and end index of the essay's occurrence in the ged text to locate the rows in the ged dataset 
#             that correspond to the words of the essay; these rows supply the words and error labels for each essay, which are then merged with the essay scores and ids.
# Further notes:
#           - however, this does mean the tokenization used to split up words in the essay by the ged dataset does affect the final appearance of the essay
#             (as more whitespace appears due to splitting of individual words by the tokenizer in the ged dataset), which was seen to negatively impact essay predictions.
#           - but, using the original essay leads to incorrect tagging of the words in each essay, as a result of labels needing to be extended for transformer tokenization
#             (which was seen to negatively impact grammar predictions).
#           - the impact of incorrect tagging was considered to be of great importance to the overall validity of the model, hence the joined words from the ged
#             dataset were used as opposed to the words of the original essay dataset.
#           - attempts at making the essays and tags match exactly by locating differences between the two datasets were somewhat successful but were at risk of error,
#             so 70 scripts were omitted from the original training set, 7 from the original development set and 9 from the original test set.
#             Additionally, attempts to perform exact matches were futile for the test dataset as essays in the aes dataset appeared in a different order to the ged dataset. 

# Following code block omitted from experimentation due to possible errors
    ###################################################################################################################################
class LinkGedDatasetToEssayDatasetWithAdditionalMatching(LinkGedDatasetToEssayDataset):
    """Experimental offset-based matching between the essay (AES) and GED datasets.

    Instead of searching for each essay in the joined GED text, the cumulative
    character offsets of the essays are used, shifted by the number of characters
    found so far that occur in the AES text but not in the GED text.

    NOTE(review): nothing in this class calls ``additional_matching`` automatically;
    it has to be invoked explicitly after construction.
    """

    def __init__(self,set_type='train'):
        # `super` must be called to obtain the bound proxy; `super.__init__(...)` raises a TypeError
        super().__init__(set_type)

    def additional_matching(self):
        """Build the merged dataframe from the offset-corrected essay boundaries.

        The cumulative sum of the essay lengths gives each essay's start and end offset
        in the aes dataset; these offsets are shifted by the number of differences found
        so far and then used to select the rows of the ged dataset.
        Characters that appear in the ged dataset but not in the essay dataset are not handled.

        NOTE(review): ``find_differences`` only records offsets for essays that are
        equal to their GED slice, so the rows built here may not line up with
        ``self.essays`` in the final concat - confirm before use.
        """
        self.find_differences()
        rows_from_ged = [self.ged.loc[(self.ged['end_word_ind']>m[0])& (self.ged['end_word_ind']<=m[1])] for m in self.new_indexes]
        # get the words from the ged dataset (with whitespace) to form each essay
        essays_from_ged = [' '.join(tmp_df['word']) for tmp_df in rows_from_ged]
        # get the labels from the ged dataset corresponding to each essay
        labels_from_ged = [tmp_df['labels'] for tmp_df in rows_from_ged]
        return pd.concat([pd.DataFrame({'text':essays_from_ged,'labels':labels_from_ged}),self.essays[['answer-s','script-s','id']]],axis=1)

    def find_differences(self):
        """Walk through the essays and compare each with its offset-based slice of the GED text."""
        # store list of text for essays with differences removed
        self.new_text = []
        # store new indexes for essays with differences removed
        self.new_indexes = []
        # store list of indexes of differences
        self.new_indexes_errors = []
        # iterate through the aes dataset with itertuples
        for row in self.essays.itertuples():
            # get essay with no whitespace and essay length (characters) from the aes dataset
            aes_essay_no_ws , char_len = row[self.essay_col_index['text_no_ws']] , row[self.essay_col_index['essay_char_len']]
            # start and end of the essay as it appears in the aes set, shifted back by the
            # number of differences that have appeared so far between the two datasets
            start_ind , end_ind = row[self.essay_col_index['start_ind']]-self.errors_count,row[self.essay_col_index['end_word_ind']]-self.errors_count
            # use the start and end index to get the potentially matching characters between the ged and aes datasets
            ged_essay = self.all_words_no_ws[start_ind:end_ind]
            # method to locate errors between the two datasets
            self.locate_chars_in_aes_but_not_ged(aes_essay_no_ws,ged_essay,char_len,start_ind,end_ind)

    def locate_chars_in_aes_but_not_ged(self,aes_essay_no_ws,ged_essay,char_len,start_ind,end_ind):
        """Record matching offsets, or count the characters present in the AES essay but not the GED slice.

        Equal essays have ``[start_ind, end_ind]`` appended to ``self.new_indexes``.
        Otherwise the number of extra AES characters found is added to ``self.errors_count``.

        NOTE(review): the element-wise comparisons assume both strings have the same
        length; ``correct_text`` is built but never stored - confirm intended.
        """
        # if the two essays are equal append the indexes to the new_index list
        if aes_essay_no_ws==ged_essay:
            self.new_indexes.append([start_ind,end_ind])
            return
        # counter for number of characters in aes essay but not the ged dataset
        aes_essay_errors_count = 0
        # find the first character in aes essay but not the ged dataset
        current_error = min(np.nonzero(np.invert(np.array(list(aes_essay_no_ws))==np.array(list(ged_essay))))[0])
        # get the text which is in the ged essay up to the first difference
        correct_text = ged_essay[:current_error]
        while True:
            # get the characters in the aes essay beyond the current character which is found to be a difference between the two datasets
            tmp_aes_essay = aes_essay_no_ws[current_error+aes_essay_errors_count+1:char_len]
            # get the characters beyond the current difference in the ged essay
            tmp_ged = ged_essay[current_error:char_len-aes_essay_errors_count-1]
            if tmp_aes_essay == tmp_ged:
                # update the corrected text to contain the ged essay text before the current error
                correct_text = correct_text + tmp_ged
                # add the number of errors (at least one) to the errors count
                aes_essay_errors_count = aes_essay_errors_count if aes_essay_errors_count!= 0 else 1
                self.errors_count+=aes_essay_errors_count
                break
            # find the number of characters between the current difference and next one
            char_to_next_error = min(np.nonzero(np.invert(np.array(list(tmp_aes_essay))==np.array(list(tmp_ged))))[0])
            # get the index of the next difference in the essay
            current_error += char_to_next_error
            # update the corrected text to contain the ged essay text before the current error
            correct_text = correct_text + tmp_ged[:char_to_next_error]
            # add an error to the current essay difference count
            aes_essay_errors_count+=1
    ###################################################################################################################################

# Build the combined FCE + NER multi-task dataset with the constructor defaults
# (distilroberta-base tokenizer, max_length=512, script-level scoring, batch_size=8).
dataset_obj = CreateHuggingFaceMultiTask()
# DatasetDict with the 'train', 'test' and 'dev' splits produced by combine_datasets().
dataset_dict = dataset_obj.get_combined_dataset_dict()



  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


  0%|          | 0/14041 [00:00<?, ?ex/s]

  0%|          | 0/3250 [00:00<?, ?ex/s]

  0%|          | 0/3453 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/266 [00:00<?, ?ex/s]

  0%|          | 0/243 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/266 [00:00<?, ?ex/s]

  0%|          | 0/243 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
import pandas as pd
import numpy as np
import pdb
import os
os.chdir('/content/drive/MyDrive/CAMemBERT2')
import re
import string
import copy
from transformers import AutoModelForSequenceClassification,TrainingArguments,Trainer,AutoTokenizer,AutoModelForTokenClassification,AutoModel,AutoConfig,EarlyStoppingCallback
from transformers.models.bert.modeling_bert import TokenClassifierOutput
from datasets import DatasetDict,Dataset,load_dataset
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score
from math import floor
import torch 
from torch import nn

# class ClassificationHead(nn.Module):

#     def __init__(self,task_name,mini_task_dict,pretrained_model_name='distilroberta-base',shared_encoder_layer=None):
#         super().__init__()
#         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         model = mini_task_dict['model'].from_pretrained(pretrained_model_name,
#                                                         num_labels=mini_task_dict['n_labels'],
#                                                         output_hidden_states=True)
#         self.dropout = nn.Dropout(model.config.hidden_dropout_prob)
#         self.classifier = model.classifier
#         self.loss_fct = nn.MSELoss()

#     def forward(self,original_model_output,inputs):
#         sequence_output = self.dropout(original_model_output)
#         logits = self.classifier(sequence_output)
#         loss = loss_fct(logits,inputs['labels'])
#         return {'loss':loss,'preds':logits,'labels':inputs['labels']}

# class TaggingHead(nn.Module):

#     def __init__(self,task_name,mini_task_dict,pretrained_model_name='distilroberta-base',shared_encoder_layer=None):
#         super().__init__()
#         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         config = AutoConfig.from_pretrained(pretrained_model_name,
#                                                         num_labels=mini_task_dict['n_labels'],
#                                                         output_hidden_states=True)
#         self.config = config
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
#         self.class_weights = mini_task_dict['class_weights']
#         self.loss_fct = nn.CrossEntropyLoss(weight = self.class_weights) if self.class_weights else nn.CrossEntropyLoss()
#         self.shared_encoder_layer = shared_encoder_layer

#     def set_shared_encoder_layer(self,encoder_layer):
#         self.shared_encoder_layer = encoder_layer

#     def forward(self,original_model_output,inputs):
#         output = self.shared_encoder_layer(original_model_output).last_hidden_state if self.shared_encoder_layer else original_model_output
#         sequence_output = self.dropout(output)
#         logits = self.classifier(sequence_output)
#         loss = None
#         labels = inputs['labels']
#         # Only keep active parts of the loss
#         active_loss = inputs['attention_mask'].view(-1) == 1
#         active_logits = logits.view(-1, self.config.num_labels)
#         active_labels = torch.where(active_loss, labels.view(-1), torch.tensor(self.loss_fct.ignore_index).type_as(labels))
#         non_padded_mask = active_labels!=-100
#         loss = self.loss_fct(active_logits[non_padded_mask], active_labels[non_padded_mask])
#         active_preds_mask = torch.logical_and(active_loss,non_padded_mask)
#         return {'loss':loss,'preds':torch.argmax(logits,2).flatten()[active_preds_mask],'labels':active_labels[non_padded_mask]}

# class MultiTaskModel(nn.Module):

#     def __init__(self,pretrained_model='distilroberta-base',kwargs_dict=None,current_task=None):
#         super().__init__()
#         self.kwargs_dict = kwargs_dict
#         self.primary_task = self.kwargs_dict['task_priorities_priority_as_key']['primary_task']
#         self.model = self.kwargs_dict[self.primary_task]['model'].from_pretrained(pretrained_model,
#                                                                num_labels=self.kwargs_dict[self.primary_task]['n_labels'],
#                                                                output_hidden_states=True)
#         self.shared_encoder_layer = None
#         self.current_task = current_task
#         self.decoder_dict = {}
#         if self.kwargs_dict['shared_encoder_n_layers']>0 and self.shared_encoder_layer==None:
#             config = AutoConfig.from_pretrained(pretrained_model,num_hidden_layers=self.kwargs_dict['shared_encoder_n_layers'],output_hidden_states=True)
#             shared_encoder = AutoModel.from_pretrained(pretrained_model,config=config).encoder
#             self.shared_encoder_layer = shared_encoder
#         for task in self.kwargs_dict['tasks']:
#             if task in self.kwargs_dict['classification_tasks']:
#                 self.decoder_dict[task] = TaggingHead(task,self.kwargs_dict[task])
#                 if self.kwargs_dict[task]['shares_encoder'] and self.shared_encoder_layer:
#                     self.decoder_dict[task].set_shared_encoder_layer(self.shared_encoder_layer)
#             elif task in self.kwargs_dict['regression_tasks'] and task != self.primary_task :
#                 self.decoder_dict[task] = ClassificationHead(task_name,self.kwargs_dict[task])

#     def forward(self,**inputs):
#         dataset = torch.unique(inputs['dataset'])[0] if len(torch.unique(inputs['dataset']))==1 else 'mix'
#         if dataset!='mix':
#             outputs_dict = self.get_outputs(inputs,dataset)
#         else:
#             outputs_dict = {}
#             for task in self.split_inputs(inputs):
#                 outputs = self.get_outputs(self,inputs=task[1],dataset=0) if task[0]=='fce_task' else self.get_outputs(self,inputs=task[1],dataset=1) 
#                 outputs_dict = {**outputs_dict,**outputs}
#         return outputs_dict


#     def get_outputs(self,inputs,dataset):
#         if dataset==0:
#             if self.primary_task in self.kwargs_dict['regression_tasks']:
#                 model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'],labels=inputs[self.kwargs_dict['labels'][self.primary_task]].float())
#                 outputs_dict = {
#                                 f'{self.primary_task}_loss':model_output.loss,
#                                 f'{self.primary_task}_preds':model_output.logits.flatten(),
#                                 f'{self.primary_task}_labels':inputs[self.kwargs_dict['labels'][self.primary_task]]
#                                 }
#             else:
#                 model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'])
#                 primary_output = self.decoder_dict[self.primary_task](model_output.hidden_states[self.kwargs_dict[self.primary_task]['output_layer']])
#                 outputs_dict = {}

#             tasks = list(set(self.kwargs_dict['tasks'])-{'ner'}) if self.primary_task not in self.kwargs_dict['regression_tasks'] else ['ged']
#             for task in tasks:
#                 output = self.decoder_dict[task](model_output.hidden_states[self.kwargs_dict[task]['output_layer']])
#                 outputs_dict = self.get_output_dict(task,output,outputs_dict)
#         else:
#             model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'])
#             task='ner'
#             output = self.decoder_dict[task](model_output.hidden_states[self.kwargs_dict[task]['output_layer']])
#             outputs_dict = self.get_output_dict(task,output,outputs_dict={})
#         return outputs_dict

#     def get_output_dict(self,task,output,output_dict):
#         return {**outputs_dict,**{f'{task}_loss':output['loss'],f'{task}_preds':output['preds'],f'{task}_labels':inputs[self.kwargs_dict['labels'][task]]}}

#     def split_inputs(self,inputs):
#         split_mask_1 = inputs['dataset']=='fce'
#         return ((['fce_task'],{k:v[split_mask_1] for k,v in inputs.items()}),('ner',{k:v[~split_mask_1] for k,v in inputs.items()}))



In [None]:
from transformers import TrainingArguments,Trainer,AutoModelForSequenceClassification,AutoModel
import numpy as np
from math import floor
from torch import tensor
from torch.cuda import is_available
from collections import defaultdict
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score,classification_report,fbeta_score

class ClassificationHead(nn.Module):
    """Regression head that borrows the ``classifier`` module of a pretrained model.

    Args:
        task_name: name of the task this head serves (not used by the head itself).
        mini_task_dict: per-task settings; must provide ``'model'`` (a class exposing
            ``from_pretrained``) and ``'n_labels'``. An optional ``'labels'`` entry names the
            key of ``inputs`` that holds the regression targets (defaults to ``'labels'``).
        pretrained_model_name: checkpoint name passed to ``from_pretrained``.
        shared_encoder_layer: accepted for signature parity with ``TaggingHead``; unused.
    """

    def __init__(self,task_name,mini_task_dict,pretrained_model_name='distilroberta-base',shared_encoder_layer=None):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # the full model is only loaded to read its dropout probability and reuse its classifier
        model = mini_task_dict['model'].from_pretrained(pretrained_model_name,
                                                        num_labels=mini_task_dict['n_labels'],
                                                        output_hidden_states=True)
        self.dropout = nn.Dropout(model.config.hidden_dropout_prob)
        self.classifier = model.classifier
        self.loss_fct = nn.MSELoss()
        # same lookup the primary-task path uses: the task dict names its own target column
        self.labels_key = mini_task_dict.get('labels','labels')

    def forward(self,original_model_output,inputs):
        """Score ``original_model_output`` and return ``{'loss', 'preds', 'labels'}``.

        ``preds`` are the raw classifier outputs; ``labels`` are the targets read from ``inputs``.
        """
        sequence_output = self.dropout(original_model_output)
        logits = self.classifier(sequence_output)
        labels = inputs[self.labels_key]
        # MSELoss would silently broadcast e.g. (batch,1) against (batch,), so align the shapes
        # whenever both tensors hold the same number of elements
        if logits.shape != labels.shape and logits.numel() == labels.numel():
            loss_logits = logits.reshape(labels.shape)
        else:
            loss_logits = logits
        loss = self.loss_fct(loss_logits,labels.to(loss_logits.dtype))
        return {'loss':loss,'preds':logits,'labels':labels}

class TaggingHead(nn.Module):
    """Token-level classification head: dropout + linear layer + cross-entropy loss.

    Args:
        task_name: name of the task this head serves (not used by the head itself).
        mini_task_dict: per-task settings; must provide ``'n_labels'`` and ``'class_weights'``
            (``None`` or a weight tensor handed to ``nn.CrossEntropyLoss``).
        pretrained_model_name: checkpoint whose config supplies hidden size and dropout.
        shared_encoder_layer: optional module applied to the hidden states before the classifier.
    """

    def __init__(self,task_name,mini_task_dict,pretrained_model_name='distilroberta-base',shared_encoder_layer=None):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        config = AutoConfig.from_pretrained(pretrained_model_name,
                                                        num_labels=mini_task_dict['n_labels'],
                                                        output_hidden_states=True)
        self.config = config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels).to(self.device)
        self.class_weights = mini_task_dict['class_weights']
        # identity check: comparing a tensor with `!= None` relies on tensor comparison semantics
        if self.class_weights is not None:
            self.loss_fct = nn.CrossEntropyLoss(weight = self.class_weights)
        else:
            self.loss_fct = nn.CrossEntropyLoss()
        self.shared_encoder_layer = shared_encoder_layer

    def set_shared_encoder_layer(self,encoder_layer):
        """Attach the encoder module that is applied before the classifier."""
        self.shared_encoder_layer = encoder_layer

    def forward(self,original_model_output,inputs):
        """Tag every token and return ``{'loss', 'preds', 'labels'}``.

        Reads ``inputs['labels']`` and ``inputs['attention_mask']``. ``preds`` and ``labels`` are
        flattened and restricted to positions that are attended to and whose label is not the
        loss function's ``ignore_index``. The class prediction is the argmax over dimension 2
        of the logits.
        """
        if self.shared_encoder_layer is not None:
            output = self.shared_encoder_layer(original_model_output).last_hidden_state
        else:
            output = original_model_output
        logits = self.classifier(self.dropout(output))
        ignore_index = self.loss_fct.ignore_index
        labels = inputs['labels']
        # positions outside the attention mask are given the ignore label
        active_loss = inputs['attention_mask'].view(-1) == 1
        active_logits = logits.view(-1, self.config.num_labels)
        active_labels = torch.where(active_loss, labels.view(-1), torch.tensor(ignore_index).type_as(labels))
        # masked-out positions already carry ignore_index, so this mask also implies active_loss
        labelled_mask = active_labels != ignore_index
        if labelled_mask.any():
            loss = self.loss_fct(active_logits[labelled_mask], active_labels[labelled_mask])
        else:
            # cross-entropy over zero targets is NaN; return a zero that stays attached to the graph
            loss = active_logits.sum() * 0.0
        return {'loss':loss,'preds':torch.argmax(logits,2).flatten()[labelled_mask],'labels':active_labels[labelled_mask]}

class MultiTaskModel(nn.Module):
    """Pretrained primary-task model plus one head per additional task.

    Args:
        pretrained_model: checkpoint name used for the primary model, the optional shared
            encoder and every head.
        kwargs_dict: dict returned by ``training_kwargs(trainer=False)``; holds the global
            settings and one sub-dict per task.
        current_task: stored on the instance; not read anywhere in this class.
    """

    def __init__(self,pretrained_model='distilroberta-base',kwargs_dict=None,current_task=None):
        super().__init__()
        self.kwargs_dict = kwargs_dict
        self.primary_task = self.kwargs_dict['task_priorities_priority_as_key']['primary_task']
        self.model = self.kwargs_dict[self.primary_task]['model'].from_pretrained(pretrained_model,
                                                               num_labels=self.kwargs_dict[self.primary_task]['n_labels'],
                                                               output_hidden_states=True)
        self.shared_encoder_layer = None
        self.current_task = current_task
        # ModuleDict (not a plain dict) so the heads are registered sub-modules: their parameters
        # reach the optimizer, follow .to(device) and are included in the state dict
        self.decoder_dict = nn.ModuleDict()
        if self.kwargs_dict['shared_encoder_n_layers']>0:
            config = AutoConfig.from_pretrained(pretrained_model,num_hidden_layers=self.kwargs_dict['shared_encoder_n_layers'],output_hidden_states=True)
            self.shared_encoder_layer = AutoModel.from_pretrained(pretrained_model,config=config).encoder
        for task in self.kwargs_dict['tasks']:
            if task in self.kwargs_dict['classification_tasks']:
                head = TaggingHead(task,self.kwargs_dict[task],pretrained_model_name=pretrained_model)
                if self.kwargs_dict[task]['shares_encoder'] and self.shared_encoder_layer is not None:
                    head.set_shared_encoder_layer(self.shared_encoder_layer)
                self.decoder_dict[task] = head
            elif task in self.kwargs_dict['regression_tasks'] and task != self.primary_task :
                self.decoder_dict[task] = ClassificationHead(task,self.kwargs_dict[task],pretrained_model_name=pretrained_model)

    def forward(self,**inputs):
        """Run the batch and return a flat dict of ``{task}_loss / {task}_preds / {task}_labels``.

        ``inputs['dataset']`` holds one dataset id per row. A batch with a single id is processed
        in one pass; a mixed batch is split with ``split_inputs`` and the per-split outputs merged.
        """
        dataset_ids = torch.unique(inputs['dataset'])
        if len(dataset_ids)==1:
            return self.get_outputs(inputs,dataset_ids[0].item())
        outputs_dict = {}
        for split_name,split_batch in self.split_inputs(inputs):
            dataset = 0 if split_name=='fce_task' else 1
            outputs = self.get_outputs(inputs=split_batch,dataset=dataset)
            outputs_dict = {**outputs_dict,**outputs}
        return outputs_dict


    def get_outputs(self,inputs,dataset):
        """Compute task outputs for a batch that comes from a single dataset.

        ``dataset == 0`` runs the primary task (when it is a regression task) together with the
        heads of the other tasks of that dataset; any other value runs only the ``'ner'`` head.
        """
        if dataset==0:
            if self.primary_task in self.kwargs_dict['regression_tasks']:
                label_key = self.kwargs_dict[self.primary_task]['labels']
                model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'],labels=inputs[label_key].float())
                outputs_dict = {
                                f'{self.primary_task}_loss':model_output.loss,
                                f'{self.primary_task}_preds':model_output.logits.flatten(),
                                f'{self.primary_task}_labels':inputs[label_key]
                                }
                tasks = ['ged']
            else:
                model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'])
                outputs_dict = {}
                tasks = list(set(self.kwargs_dict['tasks'])-{'ner'})

            for task in tasks:
                hidden = model_output.hidden_states[self.kwargs_dict[task]['output_layer']]
                output = self.decoder_dict[task](hidden,inputs)
                outputs_dict = self.get_output_dict(task,output,outputs_dict)
        else:
            model_output = self.model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'])
            task='ner'
            hidden = model_output.hidden_states[self.kwargs_dict[task]['output_layer']]
            output = self.decoder_dict[task](hidden,inputs)
            outputs_dict = self.get_output_dict(task,output,outputs_dict={})
        return outputs_dict

    def get_output_dict(self,task,output,outputs_dict):
        """Return ``outputs_dict`` extended with the head output under ``{task}_``-prefixed keys."""
        # heads always answer with the keys 'loss', 'preds' and 'labels', whatever the name of
        # the dataset column their targets were read from
        return {**outputs_dict,**{f'{task}_loss':output['loss'],f'{task}_preds':output['preds'],f'{task}_labels':output['labels']}}

    def split_inputs(self,inputs):
        """Split a mixed batch into ``('fce_task', rows with dataset==0)`` and ``('ner', the rest)``."""
        split_mask = inputs['dataset']==0
        fce_rows = {k:v[split_mask] for k,v in inputs.items()}
        ner_rows = {k:v[~split_mask] for k,v in inputs.items()}
        return (('fce_task',fce_rows),('ner',ner_rows))

def training_args(batch_size=8,save_strategy='no',output_dir='/',lr= 5e-4,epochs=30,weight_decay=0.01):
    """Build the ``TrainingArguments`` for a run.

    The same ``batch_size`` is used for the per-device train and eval batch size, and the
    built-in evaluation strategy is always ``'no'``.
    """
    options = dict(
        output_dir=output_dir,
        save_strategy=save_strategy,
        evaluation_strategy='no',
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
    )
    return TrainingArguments(**options)

def training_kwargs(
    tasks=['aes','ged','ner'],outputs=['loss','labels','preds'],optimizer_weighting='fixed',
    init_task_weightings={'aes':1.0,'ged':1.0,'ner':1.0},task_priorities={'aes':'primary_task','ged':'secondary_task','ner':'aux_task'},
    temp = 2,metrics_to_track_by_task={'aes':'pearson_aes','ged':'f_0_5_ged','ner':'f1_score_avg_ner'},
    regression_tasks=['aes'],classification_tasks=['ner','ged'],fce_tasks=['aes','ged'], 
    class_weights={'aes':None,'ged':None,'ner':None},n_labels={'aes':1,'ged':2,'ner':9},labels={'aes':'scores','ged':'labels','ner':'labels'},
    model={'aes':AutoModelForSequenceClassification,'ged':AutoModel,'ner':AutoModel},
    shares_encoder={'aes':False,'ged':True,'ner':True},output_layer={'aes':-1,'ged':-1,'ner':-1},shared_encoder_n_layers = 1,trainer=True,
    early_stopping_patience=3,early_stopping_metric=None,pretrained_model='distilroberta-base',frozen_layers='all',scoring='script'
    ):
    """Assemble the settings dict shared by the trainer and the model.

    Args:
        init_task_weightings: either a ``{task: weight}`` dict, or a single number taken as the
            weight of the primary task; in the latter case the remaining weights are derived
            with ``calculate_init_loss_hyper_params``.
        trainer: when True only the global settings are returned; when False the result also
            holds one sub-dict per task (class weights, label count, label column, model class,
            encoder sharing, output layer) plus the shared-encoder settings.
        (all other arguments are copied into the returned dict under their own name)

    Returns:
        dict of settings; see ``trainer`` above for the two layouts.
    """
    # a bare number (int or float, but not a bool) is shorthand for the primary-task weight;
    # floats must be accepted too, otherwise a fractional weight would be passed on unconverted
    if isinstance(init_task_weightings,(int,float)) and not isinstance(init_task_weightings,bool):
        init_task_weightings = calculate_init_loss_hyper_params(init_task_weightings,tasks,task_priorities)

    map_task_index = {task:i for i,task in enumerate(tasks)}
    # '<task>_<output>' -> task and '<task>_<output>' -> output lookups for the model's output keys
    map_out_to_task={f'{task}_{output}':task  for task in tasks for output in outputs}
    map_out_to_out={f'{task}_{output}':output  for task in tasks for output in outputs}
    task_priorities_priority_as_key = {v:k for k,v in task_priorities.items()}
    kwargs = {
        'tasks':tasks,
        'outputs':outputs,
        'map_out_to_task':map_out_to_task,
        'map_out_to_out':map_out_to_out,
        'map_task_index':map_task_index,
        'map_index_task':{v:k for k,v in map_task_index.items()},
        'optimizer_weighting':optimizer_weighting,
        'epochs_to_avg_over':2,
        'temp':temp,
        'init_task_weightings':init_task_weightings,
        'metrics_to_track_by_task':metrics_to_track_by_task,
        'regression_tasks':regression_tasks,
        'classification_tasks':classification_tasks,
        'fce_tasks':fce_tasks,
        'task_priorities_priority_as_key':task_priorities_priority_as_key,
        'early_stopping_patience':early_stopping_patience,
        'early_stopping_metric':early_stopping_metric,
        'pretrained_model':pretrained_model,
        'frozen_layers':frozen_layers,
        'scoring':scoring
    }
    if trainer:
        return kwargs
    else:
        task_dict = {
            task:{
              'class_weights':class_weights[task],
              'n_labels':n_labels[task],
              'labels':labels[task],
              'model':model[task],
              'shares_encoder':shares_encoder[task],
              'output_layer':output_layer[task]
              } 
            for task in tasks
            }
        task_dict['shared_encoder']=True
        task_dict['shared_encoder_n_layers']=shared_encoder_n_layers
        task_dict['fce_tasks'] = fce_tasks
        return {**task_dict,**kwargs}

def calculate_init_loss_hyper_params(primary_task_weight,tasks,task_priorities):
    """Derive an initial loss weight per task from the weight of the primary task.

    The primary task gets ``primary_task_weight``, a secondary task the remainder
    ``1-primary_task_weight`` and an auxiliary task a tenth of that remainder. Tasks with any
    other priority are left out of the result.
    """
    weight_rules = {
        'primary_task': lambda weight: weight,
        'secondary_task': lambda weight: 1-weight,
        'aux_task': lambda weight: (1-weight)*0.1,
    }
    return {
        task: weight_rules[task_priorities[task]](primary_task_weight)
        for task in tasks
        if task_priorities[task] in weight_rules
    }

class MultiTaskModelTrainer(Trainer):
    """Trainer that optimises a weighted sum of per-task losses and evaluates after every epoch.

    Two attributes must be set on the instance before ``train()`` is called:
    ``kwargs_`` (the dict returned by ``training_kwargs(trainer=True)``) and ``test_dataset``
    (evaluated once when the history is saved). Task weights live in ``lambda_weights``, a
    tensor with one row per epoch and one column per task; ``optimizer_weighting`` selects
    whether the rows stay fixed, are re-estimated from the loss history ('dwa') or follow the
    schedule in ``update_te`` ('dynamic').
    """

    prev_epoch = 0
    device = 'cuda' if is_available() else 'cpu'

    def _mt_steps_per_epoch(self):
        """Steps per epoch as derived from the trainer state (can be fractional)."""
        return self.state.max_steps/self.state.num_train_epochs

    def _mt_eval_log_entries(self):
        """Log entries without a 'learning_rate' key, each exactly once and in logging order."""
        return [entry for entry in self.state.log_history if 'learning_rate' not in entry]

    @staticmethod
    def _mt_unique_entries(entries):
        """Drop repeated references to the same log entry while keeping the order."""
        seen = set()
        unique = []
        for entry in entries:
            if id(entry) not in seen:
                seen.add(id(entry))
                unique.append(entry)
        return unique

    def compute_loss(self,model,inputs,return_outputs=False):
        """Return the weighted multi-task loss for one batch.

        Also maintains the per-epoch task weights and, on the last step of every epoch, runs
        ``compute_metrics_2`` on the evaluation set. When ``return_outputs`` is True the model
        outputs are returned alongside the loss.
        """
        # round down epoch
        epoch = floor(self.state.epoch)
        n_epochs = self.state.num_train_epochs
        n_tasks = len(self.kwargs_['tasks'])
        steps_per_epoch = self._mt_steps_per_epoch()
        # calculate the current step within an epoch
        step_in_epoch = self.state.global_step+1-(epoch*steps_per_epoch)
        first_step_of_epoch = step_in_epoch==1

        if first_step_of_epoch:
            self.model.train()
            # inits for start of training
            if self.state.global_step == 0:
                # running average of the training loss per (epoch, task); used by dwa
                self.avg_cost = torch.zeros([n_epochs, n_tasks]).float().to(self.device)
                # one row per epoch, one column per task (columns ordered by task index)
                indexes , weights = zip(*sorted([(self.kwargs_['map_task_index'][k],v) for k,v in self.kwargs_['init_task_weightings'].items()]))
                # repeat() instead of expand(): expanded rows share memory, so writing one
                # epoch's row in place would be invalid
                self.lambda_weights = tensor(list(weights)).float().repeat(n_epochs,1).to(self.device)

            if self.kwargs_['optimizer_weighting']=='dwa':
                window = self.kwargs_['epochs_to_avg_over']
                # needs `window` finished epochs of loss history
                if epoch >= window:
                    # change in loss over the past `window` epochs for each task
                    loss_ratio = self.avg_cost[epoch - 1] / self.avg_cost[epoch - window]
                    ws = torch.exp(loss_ratio/self.kwargs_['temp'])
                    # updates weights in accordance with change in loss
                    self.lambda_weights[epoch, :] = n_tasks * ws / ws.sum()

            if self.kwargs_['optimizer_weighting']=='dynamic':
                self.update_te(epoch)
                self.lambda_weights[epoch, :] = tensor([self.calc_dynamic_loss(task,epoch) for task in self.kwargs_['tasks']])

        outputs = self.model(**inputs)
        # get training loss for current task based off of current mini batch
        loss =  self.sum_losses(outputs,epoch)

        # if final step in epoch perform evaluation
        if (self.state.global_step+1)%steps_per_epoch==0:
            self.compute_metrics_2(eval_dataset=self.eval_dataset,epoch=floor(self.state.epoch),testing=False)
            self.prev_epoch = self.state.epoch

        return (loss,outputs) if return_outputs else loss

    def sum_losses(self,outputs,epoch,eval=False):
        """Weight every ``{task}_loss`` found in ``outputs`` with that task's weight for ``epoch``.

        Returns the dict of weighted losses when ``eval`` is True, otherwise their sum. Under
        'dwa' the unweighted training loss is also accumulated into ``avg_cost``.
        """
        losses = {}
        for task in self.kwargs_['tasks']:
            loss_key = f'{task}_loss'
            # a batch only carries the tasks of its own dataset
            if loss_key not in outputs:
                continue
            task_loss = outputs[loss_key]
            task_index = self.kwargs_['map_task_index'][task]
            losses[task] = self.lambda_weights[epoch, task_index] * task_loss
            # only training batches feed the dwa history; avg_cost is indexed by task column
            if self.kwargs_['optimizer_weighting']=='dwa' and not eval:
                self.avg_cost[epoch, task_index] += task_loss.item() / self._mt_steps_per_epoch()
        if eval:
            return losses
        else:
            return sum(losses.values())


    def compute_metrics_2(self,eval_dataset,epoch,testing=False):
        """Evaluate the model on ``eval_dataset`` using the task weights of ``epoch``.

        With ``testing`` False the metrics are logged, the best epoch per tracked metric is
        recorded and early stopping is checked. With ``testing`` True nothing is logged and
        ``(metrics, {task: {'preds', 'labels'}})`` is returned for the regression tasks.
        """
        self.model.eval()
        history = {task:defaultdict(list) for task in self.kwargs_['tasks']}
        losses = defaultdict(int)
        total_loss = 0.0
        n_batches = 0
        # evaluation needs no autograd graph
        with torch.no_grad():
            for inputs in self.get_eval_dataloader(eval_dataset):
                n_batches += 1
                # add all inputs to device
                for key in inputs:
                    inputs[key] = inputs[key].to(self.device)
                # get output from model using mini batch
                outputs = self.model(**inputs)
                calculated_losses = self.sum_losses(outputs,epoch,eval=True)

                for task,value in calculated_losses.items():
                    # add loss to the total loss of the model and individual task losses
                    losses[task] += value.item()
                    total_loss += value.item()
                for key,value in outputs.items():
                    # append the predictions / labels to the list of their task and output
                    if 'loss' not in key and 'dataset' != key:
                        task_name = self.kwargs_['map_out_to_task'][key]
                        output_name = self.kwargs_['map_out_to_out'][key]
                        history[task_name][output_name].extend(value.cpu().detach().numpy().tolist())

        if self.kwargs_['optimizer_weighting'] in['dwa','dynamic']:
            for task in self.kwargs_['tasks']:
                # lambda_weights is indexed [epoch, task]
                print( f'{task}_weight_coef : {self.lambda_weights[epoch, self.kwargs_["map_task_index"][task]]}')
            print()
        # average over the number of batches (guarded so an empty loader does not divide by zero)
        divisor = max(n_batches,1)
        avg_losses = {k:v/divisor for k,v in losses.items()}
        avg_losses['total_loss'] = total_loss/divisor
        print('losses',avg_losses)
        print()

        metrics_dics = []
        for task,values in history.items():
            if task in self.kwargs_['regression_tasks']:
                metrics_dics.append(self.calc_regression_metrics(np.rint(values['preds']),values['labels'],task))
            elif task in self.kwargs_['classification_tasks']:
                metrics_dics.append(self.calc_classification_metrics(values['preds'],values['labels'],task))
        logs = {k:v for dic in metrics_dics for k,v in dic.items()}
        logs = {**logs,**avg_losses}

        if testing==False:
            self.log(logs)
            # one entry per evaluation, so list positions are epoch numbers
            log_hist = self._mt_eval_log_entries()
            primary_task = self.kwargs_['task_priorities_priority_as_key']['primary_task']
            self.best_metrics_values = {task : max([entry[metric] for entry in log_hist]) for task,metric in self.kwargs_['metrics_to_track_by_task'].items() }
            self.best_metrics_epoch = {task : int(np.argmax([entry[metric] for entry in log_hist])) for task,metric in self.kwargs_['metrics_to_track_by_task'].items() }
            # avg_losses stores each task's loss under the bare task name
            self.lowest_loss_epoch = int(np.argmin([entry[f'{primary_task}'] for entry in log_hist]))
            if self.kwargs_['early_stopping_patience']:
                if self.kwargs_['early_stopping_metric'] == 'loss':
                    if (floor(self.state.epoch) - self.lowest_loss_epoch) == self.kwargs_['early_stopping_patience']:
                        self.save_hist_and_stop_training(log_hist)

                else:
                    if (floor(self.state.epoch) - self.best_metrics_epoch[primary_task]) == self.kwargs_['early_stopping_patience']:
                        self.save_hist_and_stop_training(log_hist)
        else:
            return logs,{task:{'preds':values['preds'],'labels':values['labels']} for task,values in history.items() if task in self.kwargs_['regression_tasks']}
    
    def save_hist_and_stop_training(self,log_hist):
        """Evaluate on ``test_dataset``, write all histories to ``results/<timestamp>.json`` and stop.

        ``log_hist`` is the list of evaluation log entries; repeated references to the same
        entry are collapsed first so that positions line up with ``best_metrics_epoch``.
        Training is stopped by setting ``state.max_steps`` to the current global step.
        """
        log_hist = self._mt_unique_entries(log_hist)
        best_hist = {task : {metric:value for metric,value in log_hist[epoch].items() if task in metric} for task,epoch in self.best_metrics_epoch.items()}
        eval_hist = defaultdict(list)
        for entry in log_hist:
            for k,v in entry.items():
                eval_hist[k].append(v)
        eval_hist.pop('total_flos',None)
        # move whatever training-summary values are present out of the evaluation history
        train_keys = ('train_loss','train_runtime','train_samples_per_second','train_steps_per_second')
        train_hist = {key:eval_hist.pop(key) for key in train_keys if key in eval_hist}
        primary_task = self.kwargs_["task_priorities_priority_as_key"]["primary_task"]
        test_hist,test_preds = self.compute_metrics_2(eval_dataset=self.test_dataset,epoch=self.best_metrics_epoch[primary_task],testing=True)
        info_dict = self.kwargs_
        hist = {}
        hist['best'],hist['train'],hist['eval'],hist['test'],hist['info'],hist['preds'] = best_hist,train_hist,eval_hist,test_hist,info_dict,test_preds
        self.history_dict = hist
        os.makedirs('results',exist_ok=True)
        with open(f'results/{time.time()}.json', 'w') as fp:
            # default=str keeps a non-JSON value from aborting the dump at the end of a run
            json.dump(hist, fp, default=str)
        self.state.max_steps = self.state.global_step


    def calc_regression_metrics(self,preds,labels,task):
        """Print and return rmse, Pearson, Spearman and quadratic-weighted kappa for ``task``."""
        preds = np.asarray(preds)
        labels = np.asarray(labels)
        metrics_dic = {
            f"rmse_{task}": np.sqrt(np.mean((preds-labels)**2)),
            f"pearson_{task}": np.corrcoef(preds,labels)[0,1],
            f"spearman_{task}" : spearmanr(preds, labels)[0],
            f"kappa_{task}":cohen_kappa_score(preds,labels,weights='quadratic')
            }
        print(f'{task}_metrics',metrics_dic)
        print()
        return metrics_dic

    def calc_classification_metrics(self,preds,labels,task):
        """Print the classification report for ``task`` and return its summary scores.

        ``f1_score_avg_{task}`` holds the report's accuracy. Every task except 'ner' also gets
        an F0.5 score.
        """
        digits = 9 if task == 'ner' else 2
        print(task,classification_report(labels, preds, digits=digits))
        report_output = classification_report(labels, preds, digits=digits, output_dict=True)
        metrics_dic = {
          f'f1_score_avg_{task}' : report_output['accuracy'],
          f'f1_score_macro_{task}' : report_output['macro avg']['f1-score'],
          f'f1_score_weighted_{task}' : report_output['weighted avg']['f1-score'],
          }
        if digits==2:
            # fbeta_score takes (y_true, y_pred); swapping them swaps precision and recall
            metrics_dic[f'f_0_5_{task}'] = fbeta_score(labels,preds,beta=0.5)
        print(f'{task}_metrics',metrics_dic )
        print()
        return metrics_dic

    def calc_dynamic_loss(self,task,epoch):
        """Weight of ``task`` under 'dynamic' weighting: ``te`` for regression, ``1-te`` for classification."""
        if task in self.kwargs_['regression_tasks']:
            return self.te
        elif task in self.kwargs_['classification_tasks']:
            return (1-self.te)

    def update_te(self,epoch):
        """Recompute the schedule value ``te`` for ``epoch``; the slope ``gamma`` is fixed at epoch 0."""
        if epoch==0:
            self.gamma = np.log((1/1e-6)-1)/((self.args.num_train_epochs/2)-1)
        # te depends on the epoch, so it has to be refreshed every epoch and not only at epoch 0
        self.te = 1/(1+np.exp(self.gamma*((self.args.num_train_epochs/2)-epoch)))

    def get_train_dataloader(self):
        """DataLoader over ``train_dataset`` in dataset order (no sampler and no shuffling are passed)."""
        train_dataset = self.train_dataset
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

    def get_eval_dataloader(self,eval_dataset=None):
        """DataLoader over ``eval_dataset`` (falls back to ``self.eval_dataset`` when None)."""
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        return torch.utils.data.DataLoader(
            eval_dataset,
            # the evaluation batch size; training_args() sets it equal to the training one
            batch_size=self.args.eval_batch_size,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# Build the model from the per-task settings (trainer=False), passing the class weights that
# dataset_obj reports for the 'ged' task.
model = MultiTaskModel(kwargs_dict = training_kwargs(class_weights={'aes':None,'ged':dataset_obj.get_weights(),'ner':None},trainer=False))
# Freeze every parameter of the wrapped pretrained model whose name does not contain 'classifier'.
for name,params in model.model.named_parameters():
    if 'classifier' not in name:
        params.requires_grad = False
args = training_args()
trainer = MultiTaskModelTrainer(
    model,
    args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['dev'],
)
# The trainer reads its settings from `kwargs_` and evaluates `test_dataset` when it saves the
# history, so both attributes have to be attached before training starts.
trainer.kwargs_ = training_kwargs(trainer=True)
trainer.test_dataset = dataset_dict['test']
trainer.train()

loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab

Step,Training Loss


losses {'aes': 7.006794596827308, 'ged': 0.13928976862929587, 'ner': 0.16011982116588327, 'total_loss': 7.3062041866224865}

aes_metrics {'rmse_aes': 5.779513628133234, 'pearson_aes': 0.37877302864959617, 'spearman_aes': 0.39060947837014537, 'kappa_aes': 0.06899128268991284}

ged               precision    recall  f1-score   support

           0       0.89      0.87      0.88     25174
           1       0.28      0.33      0.30      3917

    accuracy                           0.80     29091
   macro avg       0.59      0.60      0.59     29091
weighted avg       0.81      0.80      0.80     29091

ged_metrics {'f1_score_avg_ged': 0.7986318792753773, 'f1_score_macro_ged': 0.5934380378238653, 'f1_score_weighted_ged': 0.8044897380094166, 'f_0_5_ged': 0.3179677819083023}

ner               precision    recall  f1-score   support

           0  0.960108135 0.988400103 0.974048722     42759
           1  0.916269571 0.730727470 0.813047418      1842
           2  0.985248447 0.970925784 0

KeyboardInterrupt: ignored

In [None]:
# Keep only the log-history entries that do NOT carry a 'learning_rate' key
# (the entries with that key are the periodic training-step logs; the others
# hold the per-epoch losses and metrics).
# Iterate over the entries themselves, not over each entry's key/value pairs,
# so that every retained entry appears exactly once in log_hist.
log_hist = [entry for entry in trainer.state.log_history if 'learning_rate' not in entry]

In [None]:
# Pass the filtered log history to the trainer's save_hist_and_stop_training
# method (presumably defined on MultiTaskModelTrainer — not visible in this cell).
trainer.save_hist_and_stop_training(log_hist)

losses {'aes': 9.145910399300712, 'ged': 0.19631987526303246, 'ner': 0.15572395778837658, 'total_loss': 9.49795423235212}

aes_metrics {'rmse_aes': 5.62175832517431, 'pearson_aes': 0.6513049882002766, 'spearman_aes': 0.6600941274408868, 'kappa_aes': 0.17117951091568306}

ged               precision    recall  f1-score   support

           0       0.88      0.92      0.90     31755
           1       0.38      0.28      0.32      5670

    accuracy                           0.82     37425
   macro avg       0.63      0.60      0.61     37425
weighted avg       0.80      0.82      0.81     37425

ged_metrics {'f1_score_avg_ged': 0.821376085504342, 'f1_score_macro_ged': 0.6083254454882531, 'f1_score_weighted_ged': 0.809666868023565, 'f_0_5_ged': 0.29236388029664967}

ner               precision    recall  f1-score   support

           0  0.961643416 0.983926102 0.972657157     38323
           1  0.882768362 0.773036487 0.824266403      1617
           2  0.959697733 0.988754325 0.97400

In [None]:
# Display the five history objects together as a tuple.
# NOTE(review): none of these names is assigned in the visible cells, so this
# relies on earlier kernel state — confirm where they are defined before a fresh run.
best_scores,train_hist,eval_hist,test_hist,info_dict


[{'aes': 7.0047174054522845,
  'epoch': 1.0,
  'f1_score_avg_ged': 0.748925784606923,
  'f1_score_avg_ner': 0.9352828939683034,
  'f1_score_macro_ged': 0.5825031745614321,
  'f1_score_macro_ner': 0.6992174443121277,
  'f1_score_weighted_ged': 0.7751119523551384,
  'f1_score_weighted_ner': 0.9280180117469411,
  'f_0_5_ged': 0.38042269187986644,
  'ged': 0.13772699028946633,
  'kappa_aes': 0.06308401511690687,
  'ner': 0.16539328181466392,
  'pearson_aes': 0.3380663611717876,
  'rmse_aes': 5.792715732327589,
  'spearman_aes': 0.33919986498837235,
  'step': 264,
  'total_loss': 7.307837677556415},
 {'epoch': 1.89,
  'learning_rate': 0.0004685534591194969,
  'loss': 37.5006,
  'step': 500},
 {'aes': 5.951165908990904,
  'epoch': 2.0,
  'f1_score_avg_ged': 0.8358942628304287,
  'f1_score_avg_ner': 0.941785756006386,
  'f1_score_macro_ged': 0.6191960677203276,
  'f1_score_macro_ner': 0.7672190406167556,
  'f1_score_weighted_ged': 0.8291005349545686,
  'f1_score_weighted_ner': 0.9399376925319

In [None]:
# NOTE(review): neither `self` nor `hist` is assigned in the visible cells; this
# looks like a line copied out of a method body — confirm whether this cell
# is still needed and which object it is meant to update.
self.history_dict = hist



In [None]:
# Re-create the training arguments object (same call as in the training cell),
# rebinding the notebook-level name `args`.
args = training_args()

In [None]:
# Show the current training arguments object as the cell output.
args