# Imports

In [25]:
import os 
import csv
import pickle

from tqdm.notebook import tqdm

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

from transformers import AutoTokenizer

In [26]:
## Utilities variables

#Sample mode 
SAMPLE_MODE = None

#Max len of essay
SEQ_LEN = 1024

#path
PATH_RAW_DATA='/Users/arthurcollard/code/arthurcol/feedback_prize/raw_data/'

VERSION = 3
NAME_OUTPUT_FILE = f'preprocessed_v{VERSION}.csv'
NAME_TEST_FILE = f'test_preprocessed_v{VERSION}.csv'

# Data loading and preparation

## Loading training set

In [27]:
#load data from csv file 
df = pd.read_csv(PATH_RAW_DATA+'train.csv',nrows=SAMPLE_MODE)

## Preparation of the training data

In [28]:
## Def a function for labelling discourses per word

def labelizer(label,len_,flag):
    """Repeat the label according to the length of the sentence. Makes use of B/I notation according to the position of the word within the sentence and the sentence within the essay.

    Args:
        label (str): NER label of the sentence.
        len_ (int): Length of the sentence (n° of words).
        flag (int): 1 if the sentence follows a sentence with the same label. 0 otherwise.

    Returns:
        str: Returns a string of length (n° of words) len_ with B/I-label repeated len_ times.
    """
    if flag==0:
        label_first = f'B-{label} '
    else:
        label_first = f'I-{label} '
        
    return (label_first + f'I-{label} '*(len_-1)).strip()

In [29]:
## Creating features for labeling needs : 

    #Flag if the discourse is the same as the previous one
df['previous_discourse_flag']=np.where(df['discourse_type'].shift(1)==df['discourse_type'],1,0)

    #Get length of predictionstring
df['predictionstring_len'] = df['predictionstring'].apply(lambda txt:len(txt.split()))

    # Remove spaces in labels
df['discourse_type']=df['discourse_type'].str.replace('Concluding Statement','Concluding_Statement')

In [30]:
# vectorize labelizer func and apply to our df 
labelizer_vect = np.vectorize(labelizer)
df['label']=labelizer_vect(df['discourse_type'],df['predictionstring_len'],df['previous_discourse_flag'])
df.head(1)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,previous_discourse_flag,predictionstring_len,label
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,0,44,B-Lead I-Lead I-Lead I-Lead I-Lead I-Lead I-Le...


In [31]:
## Groupby ID to get predictionstrings and labels as a unique string

df_essays = df.groupby('id').agg({'predictionstring':' '.join,'label':' '.join})

## Transform into lists

df_essays['label'] = df_essays['label'].apply(lambda txt : txt.split())
df_essays['predictionstring'] = df_essays['predictionstring'].apply(lambda txt : txt.split())

#remove utilities columns created in the original df
df.drop(['previous_discourse_flag','predictionstring_len','label'],axis=1,inplace=True)

#reset index
df_essays.reset_index(inplace=True)

display(df.head(3),df_essays.head(3))

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75


Unnamed: 0,id,predictionstring,label
0,0000D23A521A,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,"[2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ...","[B-Position, I-Position, I-Position, I-Positio..."


## Create dataframe for the test set

In [32]:
ids = [t.split('.')[0] for t in os.listdir(os.path.join(PATH_RAW_DATA,'test'))]
df_test = pd.DataFrame(ids,columns=['id'])
df_test

Unnamed: 0,id
0,DF920E0A7337
1,0FB0700DAF44
2,D46BCB48440A
3,18409261F5C2
4,D72CB1C11673


# Retrieve full text properly

In [33]:
#Function

def get_essay(id_,mode='train'):
    """Function to get the full text of an essay from the .txt file.

    Args:
        id_ (str): id of the essay
        mode (str, optional): determines whether to access *train* or *test* texts. \
        Defaults to 'train'.

    Returns:
        str: Returns the full text of the id
    """
    with open(os.path.join(PATH_RAW_DATA,mode,f'{id_}.txt'),'r') as file:
        txt = file.read()
        return txt.strip()

# Tokenizer

In [34]:
#Function

def tokenize_labelize(essay,tokenizer,predictionstring=None,labels=None,max_len=SEQ_LEN):
    """Tokenize an essay and match each token with the corresponding label.

    Args:
        essay (str): Text to tokenize
        tokenizer (tokenizer): Tokenizer from HF.
        predictionstring (pandas.Series | numpy.array, optional): As a unique string, list of index position of words with a label. Must be provided with labels. Defaults to None.
        labels (pandas.Series | numpy.array, optional): As a unique string, list of labels of each word. Must be provided with labels. Defaults to None.
        max_len (int): Maximum sequence length for padding/truncating.
        

    Returns:
        dict : Returns a dictionnary with input_ids,attention_mask and labels if passed.
    """
    
    tokens = tokenizer(essay,
                       return_attention_mask = True,
                       return_token_type_ids = False,
                       padding = 'max_length',
                       max_length = SEQ_LEN,
                       truncation = True,
                       return_tensors='np'
                      )
    
    word_ids=tokens.word_ids()
    
    labels_mapping = {
                      'B-Lead' : 0,
                      'B-Position' : 1,
                      'B-Evidence' : 2,
                      'B-Claim' : 3,
                      'B-Concluding_Statement' : 4,
                      'B-Counterclaim' : 5,
                      'B-Rebuttal' : 6,
                      'I-Lead' : 7,
                      'I-Position' : 8,
                      'I-Evidence' : 9,
                      'I-Claim' : 10,
                      'I-Concluding_Statement' : 11,
                      'I-Counterclaim' : 12,
                      'I-Rebuttal': 13
                        }
    
    if labels:
        match = {p:labels_mapping[l] for p,l in zip(predictionstring,labels)}
        labels_matched = [15 if (w==None or w==word_ids[i-1]) \
                            else match.get(str(w),14) \
                            for i,w in enumerate(word_ids)]
                            
        
        return {
        'input_ids' : tokens['input_ids'][0],
        'attention_mask' : tokens['attention_mask'][0],
        'labels': np.array(labels_matched), 
        'predictionstring':np.array(word_ids)
        }
    
    return {
        'input_ids' : tokens['input_ids'][0],
        'attention_mask' : tokens['attention_mask'][0],
        'predictionstring':np.array(word_ids)
        }

# Create preprocessed data

Working per batch for RAM issues

In [43]:
## Instantiate tokenizer from HF
tokenizer = AutoTokenizer.from_pretrained('backbone')

In [44]:
## vectorize the function tokenizer above
tokenize_labelize_vect = np.vectorize(tokenize_labelize,excluded=['SEQ_LEN'],otypes=['object'])

In [45]:
# Create CSV file with tokens (input_ids, attention_mask, predictionstring, labels)
# also stored in an array tokens

batch_size = 50
nbatch = int(len(df_essays)/batch_size)+1

fieldnames = ['id','predictionstring','label','essays']
tokens = np.array([])

with open(PATH_RAW_DATA+NAME_OUTPUT_FILE,'w') as file :
    writer = csv.DictWriter(file,fieldnames = fieldnames)
    writer.writeheader()

for i in tqdm(range(nbatch+1),desc='Processing...'):
    df_ = df_essays.loc[i*batch_size:(i+1)*batch_size-1].copy()
    df_['essays'] = df_['id'].apply(get_essay)
    tokens = np.append(tokens,tokenize_labelize_vect(df_.essays,tokenizer,
                                           df_.predictionstring, df_.label ,max_len=SEQ_LEN))
    df_.to_csv(PATH_RAW_DATA+NAME_OUTPUT_FILE,mode='a',header=False)


Processing...:   0%|          | 0/313 [00:00<?, ?it/s]

In [46]:
## sanity check
result = pd.read_csv(PATH_RAW_DATA+NAME_OUTPUT_FILE)
assert(tokens.shape[0]==result.shape[0])

In [47]:
##create tokens_test array

df_test['essays'] = df_test['id'].apply(get_essay,mode='test')
tokens_test = tokenize_labelize_vect(df_test.essays,tokenizer,max_len=SEQ_LEN)

## saving as csv
df_test.to_csv(PATH_RAW_DATA+f'preprocessed_inf_v{VERSION}.csv')

# Build dataset

In [48]:
def dataset_creator(tokens):
    """
    Creates a dictionnary with tokens attributes as a numpy array.

    Args:
        tokens (list): list of dictionnaries, outputs from tokeniner

    Returns:
        dict: dict with list of size BATCH_SIZE of inputs_id, attention mask, predictionstring and labels if provided.
    """
    
    keys = tokens[0].keys()

    inputs = {
        'input_ids':[],
        'attention_mask':[]
        }
    predictionstring = []
    labels = []
       
    for t in tqdm(tokens,desc='Aggregating dataset'):
        inputs['input_ids'].append(t['input_ids'])
        inputs['attention_mask'].append(t['attention_mask'])
        predictionstring.append(t['predictionstring'])
        if 'labels' in keys:
                labels.append(t['labels'])

        
    inputs['input_ids'] = np.array(inputs['input_ids'])
    inputs['attention_mask'] = np.array(inputs['attention_mask'])
    predictionstring = np.array(predictionstring)
    labels = np.array(labels)
    
    if 'labels' in tokens[0].keys():
        
        #OHE labels
        labels_ohe = np.zeros((len(labels),SEQ_LEN,16))
        
        dim1 = np.arange(len(labels))
        dim2 = np.arange(SEQ_LEN)
        
        labels_ohe[dim1[:,None,None],dim2[None,:,None],labels[:,:,None]] = 1
        
        return inputs, labels_ohe, predictionstring
    
    return inputs, predictionstring

In [49]:
#training dataset
if 'labels' in tokens[0].keys():
    inputs,labels,predictionstrings = dataset_creator(tokens)
else:
    inputs,predictionstrings = dataset_creator(tokens)

Aggregating dataset:   0%|          | 0/15594 [00:00<?, ?it/s]

In [50]:
## creating test dataset
inputs_test,ps_test = dataset_creator(tokens_test)

Aggregating dataset:   0%|          | 0/5 [00:00<?, ?it/s]

# Save datasets

In [51]:
## Store all objects in a single dictionnary for training

if 'labels' in tokens[0].keys():
    dataset = {
        'inputs':inputs,
        'labels':labels,
        'predictionstrings':predictionstrings
    }
else:
    dataset = {
        'inputs':inputs,
        'predictionstrings':predictionstrings
    }

In [52]:
## store test objects stored in a dict

dataset_test = {
        'inputs':inputs_test,
        'predictionstrings':ps_test
    }

In [53]:
## dump dataset dictionnary with as a pickle file

with open(f'../raw_data/dataset_v{VERSION}.pickle','wb') as file : 
    pickle.dump(dataset,file)
    
with open(f'../raw_data/dataset_test_v{VERSION}.pickle','wb') as file : 
    pickle.dump(dataset_test,file)

In [22]:
### the end ###