In [429]:
# Install the `transformers` and `datasets` packages that the import cell below relies on.
# NOTE(review): neither install is version-pinned; the captured log below shows
# datasets 1.11.0 being resolved at the time this cell was run.
! pip install transformers
! pip install datasets

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 4.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 50.6 MB/s 
[?25hCollecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 48.0 MB/s 
Installing collected packages: xxhash, fsspec, datasets
Successfully installed datasets-1.11.0 fsspec-2021.7.0 xxhash-2.0.2


In [434]:
import pandas as pd
import numpy as np
import pdb
import os
os.chdir('/content/drive/MyDrive/CAMemBERT2')
import re
import string
import copy
from transformers import AutoTokenizer
from datasets import DatasetDict,Dataset

In [461]:
class PreProcessing:
    """Turn the `fce` JSON-lines files into tokenised `datasets` objects.

    Constructing an instance runs the whole pipeline: read the train/test/dev
    files, clean edits, text and scores, derive word-level labels, build a
    per-`id` aggregate, and tokenise both views into `DatasetDict` objects.
    Intermediate results stay available through the `get_*` accessors.
    """

    _dataset = 'fce'
    _set_types = ['train','test','dev']
    _path_to_dataset = 'data/'
    # Maps a raw answer score (as a float) to an integer between 1 and 20.
    _script_score_mapping = {

                      1.1:1,1.2:4,1.3:8,

                      2.1:9,2.2:10,2.3:11,

                      3.1:12,3.2:13,3.3:14,

                      4.1:15,4.2:16,4.3:17,

                      5.1:18,5.2:19,5.3:20,

                  }
    # Mapped answer scores span 1..20, so (score - 1) / 19 lands in [0, 1].
    _answer_score_offset = 1
    _answer_score_range = 19
    # Divisor applied to script scores.
    # NOTE(review): presumably the maximum attainable script score - confirm.
    _script_score_divisor = 40

    # Punctuation is escaped before being placed in a character class; unescaped,
    # the backslash in `string.punctuation` is consumed as an escape for `]`
    # and is therefore never matched itself.
    _punc_class = re.escape(string.punctuation)
    _no_ws = re.compile(r'\S+')
    _punc_strt_or_end = re.compile(f'(^[{_punc_class}]+|[{_punc_class}]+$)')
    _only_punc = re.compile(f'^[{_punc_class}]+$')

    # initialise the class so that all pre-processing steps are completed
    def __init__(self,pretrained_model='distilroberta-base',max_length=512):
        """Load the tokenizer and run every pre-processing stage.

        Args:
            pretrained_model: name passed to `AutoTokenizer.from_pretrained`;
                also used as the last path component in `save_dataset`.
            max_length: length every label sequence is padded/truncated to and
                the `max_length` handed to the tokenizer.
        """
        self.pretrained_model = pretrained_model
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.max_length = max_length
        self.data = self.read_data()
        self.cleaned_data = self.clean_data(self.data.copy())
        self.labeled_data = self.create_labels(self.cleaned_data.copy())
        self.data_for_script_scoring = self.prepare_for_script_scoring(self.labeled_data)
        self.dataset = self.create_dataset(self.labeled_data.copy())
        self.dataset_script = self.create_dataset(self.data_for_script_scoring.copy())

    def get_data(self):
        """Return the raw concatenated frame produced by `read_data`."""
        return self.data

    def get_cleaned_data(self):
        """Return the frame produced by `clean_data`."""
        return self.cleaned_data

    def get_labeled_data(self):
        """Return the frame produced by `create_labels`."""
        return self.labeled_data

    def get_dataset(self):
        """Return the answer-level `DatasetDict`."""
        return self.dataset

    def get_script_dataset(self):
        """Return the script-level `DatasetDict`."""
        return self.dataset_script

    # read each dataset into one pandas dataframe
    def read_data(self):
        """Read every split's JSON-lines file and stack them into one frame.

        Each split is read from `<_path_to_dataset><_dataset>.<set_type>.json`
        and gets a constant `set_type` column. The original per-file row
        indexes are kept, so the combined index is not unique.
        """
        frames = []
        for set_type in self._set_types:
            frame = pd.read_json(f'{self._path_to_dataset}{self._dataset}.{set_type}.json',lines=True)
            frames.append(self.add_col(frame,'set_type',set_type))
        return pd.concat(frames,axis=0)

    # create a new column in a dataframe with one constant value
    def add_col(self,df,col,val):
        """Set column `col` of `df` to the constant `val` (in place) and return `df`."""
        df[col] = val
        return df

    # method to apply all steps that clean the data.
    def clean_data(self,data):
        """Apply every cleaning step in order and return the cleaned frame."""
        data = self.turn_edits_to_one_list_of_lists(data)
        data = self.remove_new_line_breaks_from_text(data)
        data = self.clean_answer_scores(data)
        data = self.normalise_script_scores(data)
        return data

    def turn_edits_to_one_list_of_lists(self,df):
        """Reduce each `edits` cell to a list of two-item spans.

        For every cell this takes element `[0][1]`, keeps the first two items
        of each entry (presumably start/end character offsets - confirm against
        the data files) and filters the result with `remove_overlapping_groups`.
        Returns a new frame; the input is not modified.
        """
        def simplify(cell):
            spans = [entry[:2] for entry in cell[0][1]]
            return self.remove_overlapping_groups(spans)

        return df.assign(edits=df['edits'].apply(simplify))

    def remove_overlapping_groups(self,x):
        """Drop every span that fully covers another span in `x`.

        Spans are treated as inclusive index ranges `ind[0]..ind[1]`. A span is
        removed when any other span's indexes are a subset of its own, so two
        identical spans remove each other. Survivors are returned as
        `[first, last]` pairs in their original order.
        """
        if len(x) == 0:
            return x
        ranges = [list(range(ind[0],ind[1]+1)) for ind in x]
        index_sets = [set(inds) for inds in ranges]
        non_overlapping = []
        for i,inds in enumerate(ranges):
            covers_another = any(
                other <= index_sets[i]
                for j,other in enumerate(index_sets) if j != i
            )
            if not covers_another:
                non_overlapping.append([inds[0],inds[-1]])
        return non_overlapping

    # cleaning text
    def remove_new_line_breaks_from_text(self,df):
        """Return a copy of `df` with newline characters in `text` replaced by spaces."""
        return df.assign(text=df['text'].str.replace('\n',' ',regex=False))

    # clean script scores by correcting typos '/' should represent '.'
    # remove scores that can't be converted to float and are not in the score mapping dictionary.
    # these scores have either been tagged with a T or S suggesting that the essay has not adhered to the prompt
    # or do not have a clear mapping and thus might negatively impact on results.
    # applies score mapping and normalises scores.
    def clean_answer_scores(self,df):
        """Clean, filter, map and normalise the `answer-s` column.

        '/' is replaced by '.' and 'T' is removed; rows whose score is not
        numeric or not a key of `_script_score_mapping` are dropped; the
        remaining scores are mapped and scaled with (score - 1) / 19. The
        column is renamed to `answer_scores`.

        Returns a new frame. The input frame is left untouched, and the result
        is built on an explicit copy so no assignment is made into a slice.
        """
        cleaned = df['answer-s'].str.replace('/','.',regex=False).str.replace('T','',regex=False)
        df = df.assign(**{'answer-s':cleaned})
        df = df[pd.to_numeric(df['answer-s'],errors='coerce').notna()]
        numeric = df['answer-s'].astype(float)
        in_mapping = numeric.isin(list(self._script_score_mapping))
        df = df[in_mapping].copy()
        mapped = numeric[in_mapping].map(self._script_score_mapping)
        # Assign raw values: the index may contain duplicate labels after the
        # split files were concatenated, so label alignment is avoided.
        df['answer-s'] = ((mapped - self._answer_score_offset) / self._answer_score_range).to_numpy()
        return df.rename(columns={'answer-s':'answer_scores'})

    def normalise_script_scores(self,df):
        """Return a copy of `df` with `script-s` cast to int, divided by 40 and renamed `script_scores`."""
        scaled = df['script-s'].astype(int) / self._script_score_divisor
        return df.assign(**{'script-s':scaled}).rename(columns={'script-s':'script_scores'})


    def create_labels(self,df):
        """Run `create_tags` on every row and keep only the columns used downstream."""
        df = df.apply(self.create_tags,axis=1)
        return df[['cleaned_text','answer_scores','script_scores','labels','set_type','id']]

    def create_tags(self,row):
        """Add `labels` and `cleaned_text` to one row.

        With more than one edit, the text is cut into tagged spans, split into
        words, and each word gets the tag of its span (1 inside an edit, 0
        otherwise). Labels are truncated to `max_length` and right-padded with
        -100. Otherwise every whitespace-separated word gets label 0 with the
        same padding.

        The row's own `edits` lists are not modified: the span logic rewrites
        boundaries, so it operates on copies.
        """
        text = row['text']
        # NOTE(review): `> 1` sends rows with exactly one edit down the
        # "no edits" path, labelling them all 0. This looks like it should be
        # `> 0`; left unchanged because it alters the generated labels.
        if len(row['edits'])>1:
            edits = [list(ind) for ind in row['edits']]
            updated_indexes = self._build_tagged_spans(text,edits)

            word_and_tags = [(word,ind[-1]) for ind in updated_indexes for word in text[ind[0]:ind[1]].split()]
            tokens , labels = zip(*word_and_tags)

            # Truncate before padding: a negative pad width makes np.pad raise.
            r_tags = self._word_level_labels(tokens,labels)[:self.max_length]
            row['labels'] = np.pad( r_tags , ( 0 , self.max_length - len( r_tags ) ) , 'constant' , constant_values = ( 0 , -100 ) )
            row['cleaned_text'] = ' '.join( tokens )
            return row

        split_text = text.split()
        kept = split_text[:self.max_length]
        row['labels'] = [0]*len(kept) + [-100]*(self.max_length - len(kept))
        row['cleaned_text'] = ' '.join( split_text )
        return row

    def _build_tagged_spans(self,text,edits):
        """Convert edit spans into consecutive `[start, end, tag]` slices of `text`.

        `edits` is modified in place (callers pass a private copy). Tag 1 marks
        an edit span, tag 0 the text between edits. The statement order mirrors
        the original implementation exactly.
        """
        previous_tagged_word = None
        for i,ind in enumerate(edits):
            # A zero-width edit is widened to one character.
            if ind[0]==ind[1]:
                ind[1] += 1
            # An edit sitting on a single space is moved onto the next word.
            if text[ind[0]:ind[1]]==' ':
                ind[0],ind[1] = self.tag_to_next_word(ind,text)
                if previous_tagged_word==text[ind[0]:ind[1]] and (prev_index == ind[0] or prev_index == ind[1]):
                    # Same word already tagged by the previous edit: skip it.
                    # NOTE(review): when this is the last edit, the trailing
                    # text handling at the bottom of the loop is skipped too.
                    prev_index = ind[1]
                    continue
                previous_tagged_word = text[ind[0]:ind[1]]
                # NOTE(review): prev_index is overwritten before the gap check
                # below, so for i > 0 the untagged text between the previous
                # edit and this word is never emitted.
                prev_index = ind[1]

            if i == 0:
                if ind[0]!=0:
                    if ind[0]!=1:
                        first_ind,next_ind = 0,ind[0]
                    else:
                        first_ind,next_ind = 0,2
                    updated_indexes = [[first_ind,next_ind-1,0]] + [ind+[1]]
                else:
                    updated_indexes = [ind+[1]]
                prev_index = ind[1]
            else:
                first_ind = ind[0]
                if first_ind-prev_index>1:
                    updated_indexes = updated_indexes + [[prev_index+1,first_ind-1,0]] + [ind+[1]]
                else:
                    updated_indexes = updated_indexes + [ind+[1]]
                prev_index = ind[1]
            if i==len(edits)-1:
                if prev_index!=len(text):
                    updated_indexes = updated_indexes + [[prev_index+1,len(text),0]]
        return updated_indexes

    def _word_level_labels(self,words,labels):
        """Return one label per tokenizer piece that starts a new word.

        A piece counts as a continuation (and gets no label) when:
          * the tokenizer's `sep_token` is '</s>' and the piece is not the first
            one and lacks the 'Ġ' prefix, or is a substring of the lower-cased
            previous word;
          * the tokenizer's `sep_token` is '[SEP]' and the piece starts with
            '##', or is a substring of the lower-cased previous word.
        With any other `sep_token` no piece is treated as a continuation.

        Raises:
            IndexError: when the pieces imply more words than `labels` holds.
        """
        sep_token = self.tokenizer.sep_token
        pieces = self.tokenizer.tokenize( ' '.join( words ) , truncation = True , padding = False , add_special_tokens = False , max_length = self.max_length )
        word_labels = []
        count = 0  # continuation pieces seen so far
        for index, token in enumerate( pieces ):
            prev_word = index - count - 1
            in_prev_word = 0 <= prev_word < len( words ) and token in words[ prev_word ].lower()

            if sep_token == '</s>':
                is_continuation = ( not token.startswith( "Ġ" ) and index != 0 ) or in_prev_word
            elif sep_token == '[SEP]':
                is_continuation = token.startswith( "##" ) or in_prev_word
            else:
                is_continuation = False

            if is_continuation:
                count += 1
                continue

            try:
                word_labels.append( labels[ index - count ] )
            except IndexError as err:
                raise IndexError(
                    f'token {token!r} at position {index} maps to word {index - count}, '
                    f'but only {len(labels)} word labels exist'
                ) from err
        return word_labels

    def tag_to_next_word(self,ind,text):
        """Return the (start, end) offsets of the first non-space run at or after `ind[0]`.

        When no such run exists, the span is returned unchanged instead of
        failing on a missing match.
        """
        m = self._no_ws.search(text[ind[0]:])
        if m is None:
            return ind[0],ind[1]
        return ind[0]+m.start(),ind[0]+m.end()

    def prepare_for_script_scoring(self,df):
        """Aggregate rows per `id` into one script-level row.

        Texts are joined with a space; `script_scores` and `set_type` take the
        first value of the group; labels are merged by `_merge_answer_labels`.
        """
        df = df.groupby('id').agg({'cleaned_text':list,'script_scores':list,'set_type':list,'labels':list})
        df[ 'cleaned_text' ] = df[ 'cleaned_text' ].str.join(' ')
        df[ 'script_scores' ] = df[ 'script_scores' ].apply(lambda x : x[0])
        df[ 'set_type' ] = df[ 'set_type' ].apply(lambda x : x[0])
        df[ 'labels' ] = df[ 'labels' ].apply(self._merge_answer_labels)
        return df

    def _merge_answer_labels(self,label_lists):
        """Merge the label sequences of one group.

        With exactly two sequences, the first one's non -100 entries are
        followed by the whole second sequence, cut to `max_length`. In every
        other case only the first sequence is returned.
        """
        if len(label_lists) == 2:
            first = np.array(label_lists[0])
            second = np.array(label_lists[1])
            return np.concatenate([first[first != -100],second])[:self.max_length]
        return label_lists[0]


    def get_dataset_dict(self,df):
        """Return `{set_type: Dataset}` built from the rows of each split in `df`."""
        by_set_type = df.groupby('set_type')
        return {set_type:Dataset.from_pandas(by_set_type.get_group(set_type)) for set_type in self._set_types}

    def pre_processing_func(self,example):
        """Tokenise `example['cleaned_text']`, truncated and padded to `max_length`."""
        return self.tokenizer( example[ 'cleaned_text' ] , truncation=True , padding = 'max_length' , max_length = self.max_length )

    def create_dataset(self,df):
        """Build a tokenised `DatasetDict` from `df` and drop the text, split and id columns."""
        dataset = DatasetDict(self.get_dataset_dict(df))
        dataset = dataset.map(self.pre_processing_func)
        dataset = dataset.remove_columns(['cleaned_text','set_type','id'])
        return dataset

    def save_dataset(self,scoring_level='answer'):
        """Save the answer- or script-level dataset to disk.

        The target is
        `<_path_to_dataset>word_level_dataset/<scoring_level>/<pretrained_model>`.
        Any `scoring_level` other than 'answer' or 'script' prints a message
        and saves nothing.
        """
        if scoring_level=='answer':
            dataset = self.get_dataset()
        elif scoring_level=='script':
            dataset = self.get_script_dataset()
        else:
            print('score level not implemented, select: answer or script')
            return
        dataset.save_to_disk(f'{self._path_to_dataset}word_level_dataset/{scoring_level}/{self.pretrained_model}')

# Run the full pipeline once, then write both dataset variants to disk.
data = PreProcessing(max_length=512)
data.save_dataset(scoring_level='answer')
data.save_dataset(scoring_level='script')

UnboundLocalError: ignored