In [68]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [3]:
from torch.utils.data.dataset import Dataset

In [133]:
#create the dataframe
dataframe = {}
dataframe["code"] = []
dataframe["code"].append(['abcd','efgh','ijkl','SEP','mnop','qrst','SEP','abcd','efgh','ijkl'])
dataframe["code"].append(['abcd','efgh','ijkl','SEP','mnop','qrst','SEP','abcd','efgh','ijkl','mnop','fsfe','wefe'])
dataframe["code"].append(['abcd','efgh','ijkl','SEP','mnop','qrst','SEP','abcd','efgh','ijkl'])

In [179]:
token2idx = {}
token2idx["abcd"] = 1
token2idx["efgh"] = 2
token2idx["ijkl"] = 3
token2idx["mnop"] = 4
token2idx["qrst"] = 5
token2idx["fsfe"] = 6
token2idx["wefe"] = 7
token2idx["SEP"] = 99
token2idx["CLS"] = 100
token2idx["MASK"] = 101
token2idx["UNK"] = 102
token2idx["PAD"] = 103

In [180]:
def random_mask(tokens, token2idx):
    output_label = []
    output_token = []
    for i, token in enumerate(tokens):
        prob = random.random()
        # mask token with 15% probability
        if prob < 0.15:
            prob /= 0.15

            # 80% randomly change token to mask token
            if prob < 0.8:
                output_token.append(token2idx["MASK"])

            # 10% randomly change token to random token
            elif prob < 0.9:
                output_token.append(random.choice(list(token2idx.values())))

            # -> rest 10% randomly keep current token

            # append current token to output (we will predict these later
            output_label.append(token2idx.get(token, token2idx['UNK']))
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-1)
            output_token.append(token2idx.get(token, token2idx['UNK']))

    return tokens, output_token, output_label


In [240]:
def seq_padding(tokens, max_len, token2idx=None, symbol=None, unkown=True):
    if symbol is None:
        symbol = 'PAD'

    seq = []
    token_len = len(tokens)
    for i in range(max_len):
        if token2idx is None:
            if i < token_len:
                seq.append(tokens[i])
            else:
                seq.append(symbol)
        else:
            if i < token_len:
                # 1 indicate UNK
                if unkown:
                    seq.append(token2idx.get(tokens[i], token2idx['UNK']))
                else:
                    seq.append(token2idx.get(tokens[i]))
            else:
                seq.append(token2idx.get(symbol))
    return seq

In [241]:
def position_idx(tokens, symbol='SEP'):
    pos = []
    flag = 0

    for token in tokens:
        if token == symbol:
            pos.append(flag)
            flag += 1
        else:
            pos.append(flag)
    return pos

In [264]:
class MLMLoader(Dataset):
    def __init__(self, dataframe, token2idx, max_len, code='code'):
        self.vocab = token2idx
        self.max_len = max_len
        self.code = dataframe[code]
        #self.age = dataframe[age]
        #self.age2idx = age2idx

    def __getitem__(self, index):
        """
        return: #age, code, position, segmentation, mask, label
        """

        # extract data
        #age = self.age[index][(-self.max_len+1):]
        code = self.code[index][(-self.max_len+1):]

        # avoid data cut with first element to be 'SEP'
        if code[0] != 'SEP':
            code = np.append(np.array(['CLS']), code)
            #age = np.append(np.array(age[0]), age)
        else:
            code[0] = 'CLS'

        #this might be attention mask    
        # mask 0:len(code) to 1, padding to be 0
        mask = np.ones(self.max_len)
        mask[len(code):] = 0

        # pad age sequence and code sequence
        #age = seq_padding(age, self.max_len, token2idx=self.age2idx)
        print(code)
        tokens, code, label = random_mask(code, self.vocab) #code is now a list of token2idx
                                                            #label is what we want to predict
        print(code)

        # get position code
        tokens = seq_padding(tokens, self.max_len)
        position = position_idx(tokens)
        #segment = index_seg(tokens)

        # pad code and label
        code = seq_padding(code, self.max_len, symbol=self.vocab['PAD'])
        label = seq_padding(label, self.max_len, symbol=-1)

        return torch.LongTensor(code), torch.LongTensor(position), \
               torch.LongTensor(mask), torch.LongTensor(label)

    def __len__(self):
        return len(self.code)

In [286]:
dataloader = MLMLoader(dataframe,token2idx,15)

In [287]:
dataframe['code'][0]

['abcd', 'efgh', 'ijkl', 'SEP', 'mnop', 'qrst', 'SEP', 'abcd', 'efgh', 'ijkl']

In [288]:
dataloader[0]

['CLS' 'abcd' 'efgh' 'ijkl' 'SEP' 'mnop' 'qrst' 'SEP' 'abcd' 'efgh' 'ijkl']
[100, 101, 2, 3, 99, 4, 5, 99, 1, 101, 3]
['CLS' 'abcd' 'efgh' 'ijkl' 'SEP' 'mnop' 'qrst' 'SEP' 'abcd' 'efgh' 'ijkl']


(tensor([100, 101,   2,   3,  99,   4,   5,  99,   1, 101,   3, 103, 103, 103,
         103]),
 tensor([0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
 tensor([-1,  1, -1, -1, -1, -1, -1, -1, -1,  2, -1, -1, -1, -1, -1]))

In [207]:
dataframe['code'][1]

['abcd',
 'efgh',
 'ijkl',
 'SEP',
 'mnop',
 'qrst',
 'SEP',
 'abcd',
 'efgh',
 'ijkl',
 'mnop',
 'fsfe',
 'wefe']

In [208]:
dataloader[1]

(tensor([100, 101,   2,   3, 101,   6, 101, 103]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 0]),
 tensor([-1,  1, -1, -1,  4, -1,  7, -1]))

In [209]:
dataframe['code'][2]

['abcd', 'efgh', 'ijkl', 'SEP', 'mnop', 'qrst', 'SEP', 'abcd', 'efgh', 'ijkl']

In [210]:
dataloader[2]

(tensor([100,   4,   5,  99,   1,   2,   3, 103]),
 tensor([0, 0, 0, 0, 1, 1, 1, 1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 0]),
 tensor([-1, -1, -1, -1, -1, -1, -1, -1]))