# Imports

In [142]:
import os 
import csv
from tqdm.notebook import tqdm

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

from transformers import AutoTokenizer

In [194]:
## Utilities variables

# Sample mode: passed as `nrows` when reading train.csv (None = read every row)
SAMPLE_MODE = None

# Max len of essay (number of tokens after padding/truncation)
SEQ_LEN = 1024

# Train, val, test split proportion.
# These are cumulative cut points over the essays, not split sizes:
#   [0, VAL_SPLIT) -> train, [VAL_SPLIT, TEST_SPLIT) -> validation, [TEST_SPLIT, 1] -> test
VAL_SPLIT = 0.8
TEST_SPLIT = 0.9

# path
# The raw-data folder can be overridden with the FEEDBACK_PRIZE_RAW_DATA environment
# variable so the notebook is not tied to one machine; the default is unchanged.
# os.path.join(..., '') guarantees a trailing separator, which the rest of the
# notebook relies on when it concatenates file names to this path.
PATH_RAW_DATA = os.path.join(
    os.environ.get('FEEDBACK_PRIZE_RAW_DATA')
    or '/Users/arthurcollard/code/arthurcol/feedback_prize/raw_data/',
    '')

# Load data

In [195]:
# Read the training annotations; `nrows=SAMPLE_MODE` limits the rows read (None reads all).
train_csv_path = f'{PATH_RAW_DATA}train.csv'
df = pd.read_csv(train_csv_path, nrows=SAMPLE_MODE)

## Preparation

In [196]:
## Define a function that labels each word of a discourse

def labelizer(label,len_,flag):
    """Build the space-separated B/I tag sequence for one discourse.

    The first word is tagged ``B-<label>`` when ``flag`` is 0 and ``I-<label>``
    otherwise; every following word is tagged ``I-<label>``.

    Args:
        label (str): NER label of the discourse.
        len_ (int): Length of the discourse (number of words).
        flag (int): 1 if the discourse follows a discourse with the same label. 0 otherwise.

    Returns:
        str: ``len_`` tags separated by single spaces. An empty string when
        ``len_`` is 0 or negative, so the number of tags always equals the
        number of words.
    """
    n_words = int(len_)
    if n_words <= 0:
        # No words means no tags; without this guard one stray tag would be emitted.
        return ''

    first_tag = f'B-{label}' if flag==0 else f'I-{label}'
    return ' '.join([first_tag] + [f'I-{label}']*(n_words-1))

In [197]:
## Creating features for labeling needs :

# Flag if the discourse has the same type as the previous one *within the same essay*.
# Comparing the type alone also matches the last row of the previous essay, which
# would tag the first word of an essay as I- instead of B-.
same_type_as_previous = df['discourse_type'].shift(1)==df['discourse_type']
same_essay_as_previous = df['id'].shift(1)==df['id']
df['previous_discourse_flag']=np.where(same_type_as_previous & same_essay_as_previous,1,0)

# Get length of predictionstring (number of whitespace-separated word indices)
df['predictionstring_len'] = df['predictionstring'].apply(lambda txt:len(txt.split()))

# Remove spaces in labels. regex=False makes this a literal replacement regardless
# of the pandas version's default for `regex`.
df['discourse_type']=df['discourse_type'].str.replace('Concluding Statement','Concluding_Statement',regex=False)

In [198]:
# Wrap `labelizer` with np.vectorize so it is called once per row across the three columns
labelizer_vect = np.vectorize(labelizer)
word_labels = labelizer_vect(
    df['discourse_type'],
    df['predictionstring_len'],
    df['previous_discourse_flag'],
)
df['label'] = word_labels
df.head(1)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,previous_discourse_flag,predictionstring_len,label
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,0,44,B-Lead I-Lead I-Lead I-Lead I-Lead I-Lead I-Le...


In [199]:
## One row per essay: concatenate the word indices and the word labels of all its discourses

per_essay = df.groupby('id').agg({'predictionstring':' '.join,'label':' '.join})

## Turn the two concatenated strings into lists (split on whitespace)

per_essay['label'] = per_essay['label'].map(str.split)
per_essay['predictionstring'] = per_essay['predictionstring'].map(str.split)

# the helper columns added to the original df are no longer needed
df.drop(columns=['previous_discourse_flag','predictionstring_len','label'],inplace=True)

# move `id` from the index back to a regular column
df_essays = per_essay.reset_index()

display(df.head(3),df_essays.head(3))

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75


Unnamed: 0,id,predictionstring,label
0,0000D23A521A,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,"[2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ...","[B-Position, I-Position, I-Position, I-Positio..."


## Retrieve full text properly

In [200]:
def get_essay(id_,mode='train'):
    """Read the full text of an essay from its .txt file.

    The file is looked up at ``PATH_RAW_DATA/<mode>/<id_>.txt``.

    Args:
        id_ (str): id of the essay
        mode (str, optional): sub-folder to read from, i.e. whether to access
            *train* or *test* texts. Defaults to 'train'.

    Returns:
        str: The full text of the essay with leading/trailing whitespace removed.

    Raises:
        FileNotFoundError: If no .txt file exists for this id in the chosen folder.
    """
    essay_path = os.path.join(PATH_RAW_DATA,mode,f'{id_}.txt')
    # Explicit encoding: open() otherwise falls back to the platform's locale
    # encoding, which makes the result machine-dependent.
    # NOTE(review): assumes the essay files are UTF-8 — confirm against the dataset.
    with open(essay_path,'r',encoding='utf-8') as file:
        return file.read().strip()

## Tokenizer

In [201]:
def tokenize_labelize(essay,tokenizer,predictionstring=None,labels=None,max_len=SEQ_LEN):
    """Tokenize an essay and match each token with the corresponding label.

    Args:
        essay (str): Text to tokenize.
        tokenizer (tokenizer): Tokenizer from HF. Its output must provide ``word_ids()``.
        predictionstring (sequence, optional): Word indices of the labelled words
            of the essay. Must be provided together with ``labels``. Defaults to None.
        labels (sequence of str, optional): B-/I- label of each word listed in
            ``predictionstring`` (same order, same length). Defaults to None.
        max_len (int): Maximum sequence length for padding/truncating.

    Returns:
        dict: Dictionary with ``input_ids`` and ``attention_mask`` and, when labels
        are passed, ``labels``: one integer per token where

        * 0-13 is the class of the word (see ``labels_mapping``),
        * 14 marks a word that is not listed in ``predictionstring``,
        * -100 marks tokens without a word (special/padding tokens) and every
          token of a word after its first one.

    Raises:
        ValueError: If ``labels`` is given without ``predictionstring``.
        KeyError: If a label is not one of the keys of ``labels_mapping``.
    """

    tokens = tokenizer(essay,
                       return_attention_mask = True,
                       return_token_type_ids = False,
                       padding = 'max_length',
                       max_length = max_len,  # honour the argument instead of the global SEQ_LEN
                       truncation = True,
                       return_tensors='np'
                      )

    encoded = {
        'input_ids' : tokens['input_ids'][0],
        'attention_mask' : tokens['attention_mask'][0],
        }

    # `labels is None` rather than truthiness: arrays/Series have no unambiguous
    # truth value. An empty sequence still means "no labels", as before.
    if labels is None or len(labels)==0:
        return encoded

    if predictionstring is None:
        raise ValueError('predictionstring must be provided together with labels.')

    labels_mapping = {'B-Lead' : 0,
                  'B-Position' : 1,
                  'B-Evidence' : 2,
                  'B-Claim' : 3,
                  'B-Concluding_Statement' : 4,
                  'B-Counterclaim' : 5,
                  'B-Rebuttal' : 6,
                  'I-Lead' : 7,
                  'I-Position' : 8,
                  'I-Evidence' : 9,
                  'I-Claim' : 10,
                  'I-Concluding_Statement' : 11,
                  'I-Counterclaim' : 12,
                  'I-Rebuttal': 13}

    # Keys are normalised to str because the lookup below uses str(word_id);
    # this way integer word indices work as well as string ones.
    word_to_class = {str(p):labels_mapping[l] for p,l in zip(predictionstring,labels)}

    # NOTE(review): word_ids() follows the tokenizer's own word segmentation; this
    # assumes it lines up with the word indices used in predictionstring — TODO confirm.
    word_ids = tokens.word_ids()

    labels_matched = []
    previous_word = None  # nothing precedes the first token (no wrap-around to the last one)
    for w in word_ids:
        if w is None or w==previous_word:
            labels_matched.append(-100)
        else:
            labels_matched.append(word_to_class.get(str(w),14))
        previous_word = w

    encoded['labels'] = np.array(labels_matched)
    return encoded

## Batching

In [202]:
## Load the pretrained tokenizer from the Hugging Face hub
PRETRAINED_CHECKPOINT = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

In [203]:
## vectorize the function
# The tokenizer (positional argument 1, also reachable by keyword) and `max_len`
# are the same for every essay, so they are excluded from vectorization and passed
# to the function untouched. The previous excluded=['SEQ_LEN'] matched no parameter
# of tokenize_labelize and therefore excluded nothing.
tokenize_labelize_vect = np.vectorize(tokenize_labelize,
                                      excluded={1,'tokenizer','max_len'},
                                      otypes=['object'])

In [209]:
# Fetch essays text, per batch, save it in a csv file

batch_size = 50
# One start offset per batch; the last batch may be shorter. Iterating over these
# offsets avoids the empty trailing batches of the previous `range(nbatch+1)`.
batch_starts = range(0,len(df_essays),batch_size)
nbatch = len(batch_starts)

fieldnames = ['id','predictionstring','label','essays']
preprocessed_path = PATH_RAW_DATA+'preprocessed.csv'

# Start from a fresh file that only contains the header; batches are appended below.
# newline='' is what the csv module expects for files it writes to.
with open(preprocessed_path,'w',newline='',encoding='utf-8') as file :
    writer = csv.DictWriter(file,fieldnames = fieldnames)
    writer.writeheader()

# Collect the per-batch results and concatenate once at the end: np.append inside
# the loop copies the whole array at every iteration.
token_batches = []

for start in tqdm(batch_starts):
    # Positional slicing: does not depend on df_essays having a 0..n-1 index.
    df_ = df_essays.iloc[start:start+batch_size].copy()
    df_['essays'] = df_['id'].apply(get_essay)
    token_batches.append(tokenize_labelize_vect(df_.essays,tokenizer,
                                                df_.predictionstring, df_.label ,max_len=SEQ_LEN))
    # index=False + explicit columns: each row has exactly the fields named in the header.
    df_.to_csv(preprocessed_path,mode='a',header=False,index=False,columns=fieldnames,encoding='utf-8')

# 1-D object array holding one dict per essay
tokens = np.concatenate(token_batches) if token_batches else np.array([],dtype=object)


  0%|          | 0/313 [00:00<?, ?it/s]

In [210]:
# Read back the csv file written batch by batch above
preprocessed_csv = f'{PATH_RAW_DATA}preprocessed.csv'
ha = pd.read_csv(preprocessed_csv)

In [211]:
# Display the re-loaded frame to inspect what was written to disk
ha

Unnamed: 0,id,predictionstring,label,essays
0,0000D23A521A,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Position', 'I-Position', 'I-Position', 'I-...","Some people belive that the so called ""face"" o..."
1,00066EA9880D,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Le...",Driverless cars are exaclty what you would exp...
2,000E6DE9E817,"['2', '3', '4', '5', '6', '7', '8', '10', '11'...","['B-Position', 'I-Position', 'I-Position', 'I-...",Dear: Principal\n\nI am arguing against the po...
3,001552828BD0,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Le...",Would you be able to give your car up? Having ...
4,0016926B079C,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Position', 'I-Position', 'I-Position', 'I-...",I think that students would benefit from learn...
...,...,...,...,...
15589,FFF1442D6698,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Le...","Every student looks forward to summer break, i..."
15590,FFF1ED4F8544,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Le...",Many citizens argue that the Electoral college...
15591,FFF868E06176,"['0', '1', '2', '3', '4', '5', '6', '7', '8', ...","['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Le...","Every summer break, students are given project..."
15592,FFFD0AF13501,"['44', '45', '46', '47', '48', '49', '50', '51...","['B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', '...","In the article ""A Cowboy Who Rode the Waves"" L..."


In [214]:
## sanity check: as many tokenized essays in memory as rows read back from the csv
n_tokenized = len(tokens)
n_saved = len(ha)
assert n_tokenized == n_saved

## Creating dataset

In [218]:
def _collect_split(split_tokens,with_labels):
    """Stack the per-essay token dicts of one split into arrays.

    Returns a ``(features, labels)`` pair: ``features`` is a dict with the stacked
    ``input_ids`` and ``attention_mask`` arrays, ``labels`` is the list of per-essay
    label arrays (empty when ``with_labels`` is False).
    """
    input_ids, attention_mask, labels = [], [], []
    for t in tqdm(split_tokens):
        input_ids.append(t['input_ids'])
        attention_mask.append(t['attention_mask'])
        if with_labels:
            labels.append(t['labels'])

    features = {'input_ids':np.array(input_ids),
                'attention_mask':np.array(attention_mask)}
    return features, labels


def _one_hot_labels(labels,seq_len,n_classes=16,ignored_class=15):
    """One-hot encode per-token integer labels into shape (n_essays, seq_len, n_classes).

    Labels equal to -100 are mapped to ``ignored_class`` first (see WORKAROUND below).
    """
    if len(labels)==0:
        # Empty split: return an empty array of the right rank instead of failing.
        return np.zeros((0,seq_len,n_classes))

    labels = np.asarray(labels)
    ## WORKAROUND for now regarding -100 tokens to be excluded from the loss
    labels = np.where(labels==-100,ignored_class,labels)
    # Row k of the identity matrix is the one-hot vector of class k.
    return np.eye(n_classes)[labels]


def dataset_creator(tokens,val_split=VAL_SPLIT,test_split=TEST_SPLIT):
    """Turn the per-essay token dicts into model-ready arrays.

    Args:
        tokens (sequence of dict): One dict per essay with ``input_ids``,
            ``attention_mask`` and optionally ``labels``. Whether labels are
            present is decided from the first element.
        val_split (float): Fraction of the essays at which the validation set
            starts; essays before it form the training set.
        test_split (float): Fraction of the essays at which the test set starts;
            essays between ``val_split`` and ``test_split`` form the validation set.

    Returns:
        With labels: ``(train_dict, train_labels), (val_dict, val_labels),
        (test_dict, test_labels)`` where each dict holds the stacked ``input_ids``
        and ``attention_mask`` arrays and each labels array is one-hot encoded
        with 16 classes (class 15 replaces the -100 markers).
        Without labels: a single dict with the stacked ``input_ids`` and
        ``attention_mask`` arrays.

    Raises:
        ValueError: If ``tokens`` is empty or the split fractions are not ordered
            as ``0 <= val_split <= test_split <= 1``.
    """
    if len(tokens)==0:
        raise ValueError('tokens is empty: there is nothing to build a dataset from.')

    if 'labels' not in tokens[0].keys():
        print('Creating new dataset...')
        data_dict, _ = _collect_split(tokens,with_labels=False)
        return data_dict

    if not 0 <= val_split <= test_split <= 1:
        raise ValueError('Expected 0 <= val_split <= test_split <= 1, '
                         f'got val_split={val_split}, test_split={test_split}.')

    # Use the arguments, not the module-level constants, so callers can change the split.
    idx_val = int(len(tokens)*val_split)
    idx_test = int(len(tokens)*test_split)

    # Sequence length comes from the data rather than from the global SEQ_LEN.
    seq_len = len(tokens[0]['labels'])

    splits = []
    for message, split_tokens in (('Creating training set...',tokens[:idx_val]),
                                  ('Creating validation set...',tokens[idx_val:idx_test]),
                                  ('Creating testing set...',tokens[idx_test:])):
        print(message)
        features, labels = _collect_split(split_tokens,with_labels=True)
        splits.append((features,_one_hot_labels(labels,seq_len)))

    return tuple(splits)

In [219]:
# Split the tokenized essays into train / validation / test using the default split proportions
train,val,test = dataset_creator(tokens)

Creating training set...


  0%|          | 0/12475 [00:00<?, ?it/s]

Creating validation set...


  0%|          | 0/1559 [00:00<?, ?it/s]

Creating testing set...


  0%|          | 0/1560 [00:00<?, ?it/s]

# Save datasets

In [221]:
import pickle

In [225]:
# Write each split to its own pickle file
for split, target in ((train,'../raw_data/train.pickle'),
                      (val,'../raw_data/val.pickle'),
                      (test,'../raw_data/test.pickle')):
    with open(target,'wb') as file :
        pickle.dump(split,file)

In [233]:
### the end ###