In [1]:
import torch
import os, sys
from transformers import AutoModel, AutoTokenizer
from pathlib import Path

HOME = os.getcwd()
current = HOME 
while 'src' not in os.listdir(current):
    current = Path(current).parent

PARENT_DIR = str(current)
sys.path.append(str(current))
sys.path.append(os.path.join(str(current), 'data_analysis'))
sys.path.append(os.path.join(str(current), 'evaluation'))
sys.path.append(os.path.join(str(current), 'text_processing'))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_FOLDER = os.path.join(PARENT_DIR, 'src', 'data')
NOTEBOOK_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT = 'distilbert-base-uncased'  # let's keep it simple as for the first iteration
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)

In [3]:
# let's focus
from datasets import load_dataset
data = load_dataset('csv', data_files=os.path.join(DATA_FOLDER, 'all_data_processed.csv'), split='train')
data = data.filter(function=lambda b: (isinstance(b['source'], str) and isinstance(b['target'], str)))
data.to_csv(os.path.join(DATA_FOLDER, 'all_data_processed.csv'), index=False)

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 17549.39it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1808.67it/s]
Generating train split: 597519 examples [00:00, 708018.22 examples/s]
Filter: 100%|██████████| 597519/597519 [00:01<00:00, 536018.70 examples/s]
Creating CSV from Arrow format: 100%|██████████| 598/598 [00:02<00:00, 244.45ba/s]


66239054

In [4]:
# split the data \
import src.data_preparation.prepare_data as pdr
train_data, val_data, test_data = pdr.data_split(all_data=data.select(range(2000)))

In [5]:
train_data, val_data, test_data

(Dataset({
     features: ['source', 'target'],
     num_rows: 1920
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 40
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 39
 }))

In [6]:
def prepare_labeled_data(batch):
    model_inputs = TOKENIZER(batch['source'], truncation=True, max_length=1024)
    labels = TOKENIZER(batch['target'],truncation=True, max_length=1024)
    model_inputs['labels'] = labels['input_ids']
    # data['labels_attention_mask'] = labels['attention_mask']
    return model_inputs

train_data = train_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
val_data = val_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
test_data = test_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map: 100%|██████████| 1920/1920 [00:00<00:00, 10745.51 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 7319.90 examples/s]
Map: 100%|██████████| 39/39 [00:00<00:00, 7803.54 examples/s]


In [20]:
train_data[:2]

{'input_ids': [[101, 2396, 1010, 2115, 2269, 1010, 102],
  [101, 26450, 2068, 2035, 999, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]],
 'labels': [[101, 5506, 2332, 1010, 2115, 2269, 1012, 102],
  [101, 5495, 2068, 2035, 2039, 999, 102]]}

In [21]:
def custom_padding_data_collator(batch):
    # let's pad and see how it goes
    max_len = len(max([i['input_ids'] for i in batch], key=len))
    max_target_len = len(max([i['labels'] for i in batch], key=len))


    for e in batch:
        e['input_ids'] = e['input_ids'] + [TOKENIZER.pad_token_id] * (max_len - len(e['input_ids']))
        e['attention_mask'] = e['attention_mask'] + [0] * (max_len - len(e['input_ids']))
        e['labels'] = e['labels'] + [TOKENIZER.pad_token_id] * (max_target_len - len(e ['labels']))    
    # padd the target
    return batch

In [22]:
# apparently, I have to write my
# def custom_padding_data_collator(batch):
#     # find the longest sentence in the batch
#     max_len = len(max(batch['input_ids'], key=len))
#     # make sure to add padding to all sentences
#     batch['input_ids'] = [(ids + TOKENIZER.pad_token_id * max_len - len(ids)) for ids in batch['input_ids']]
    
#     # add '0's to the attention masks
#     batch['attention_mask'] = [(mask + 0 * max_len - len(mask)) for mask in batch['attention_mask']]

#     # pad the labels
#     max_target = len(max(batch['labels'], key=len)) 
#     batch['labels'] = [(ids + TOKENIZER.pad_token_id * max_target - len(ids)) for ids in batch['labels']]

#     return batch

In [23]:
# we are now ready to create the dataloader
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq
# dc = DataCollatorWithPadding(tokenizer=TOKENIZER)
dc = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=AutoModel.from_pretrained(CHECKPOINT))
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=custom_padding_data_collator)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=custom_padding_data_collator)

In [24]:
next(iter(train_dl))        

[{'input_ids': [101,
   2115,
   3611,
   1005,
   1055,
   1037,
   19101,
   1010,
   4845,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [101,
   2115,
   6643,
   1010,
   2002,
   1005,
   1055,
   2288,
   1037,
   2502,
   2677,
   1012,
   2879,
   1012,
   102]},
 {'input_ids': [101,
   2045,
   1005,
   1055,
   1037,
   2843,
   1997,
   16034,
   1999,
   2023,
   2388,
   11263,
   9102,
   1012,
   102,
   0,
   0,
   0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [101,
   2045,
   1005,
   1055,
   1037,
   2843,
   1997,
   16034,
   1999,
   2032,
   1012,
   102,
   0,
   0,
   0]},
 {'input_ids': [101,
   1045,
   2228,
   1037,
   10515,
   2003,
   1037,
   9951,
   5949,
   1997,
   2051,
   1012,
   102,
   0,
   0,
   0,
   0,
   0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [101,
   1045,
   2228,
   8720,
   2003,


In [25]:
from torch.optim import Adam
import src.models.customSeq2Seq.seq2seq_model as s2s
import src.models.customSeq2Seq.train as ts2s
import importlib
importlib.reload(s2s)
importlib.reload(ts2s)
# the output size should be that of the vocabulary size
TOKENIZER.vocab_size

30522

In [26]:
# let's build the classifier for the Token prediction
from torch import nn
from src.models.customSeq2Seq.classification_head import ExponentialClassifier
classifier = ExponentialClassifier(num_classes=TOKENIZER.vocab_size, in_features=100, num_layers=5)
encoder = s2s.BertBasedEncoder(hidden_dim=100, num_layers=2)
# the output of the encoder
decoder = s2s.DecoderRNN(token_classifier=classifier)


e_opt = Adam(encoder.parameters(), lr=0.01)
d_opt = Adam(decoder.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

from src.models.customSeq2Seq.train import train_model
train_model(encoder=encoder, decoder=decoder, train_dataloader=train_dl, )

TypeError: DecoderRNN.__init__() missing 2 required positional arguments: 'emb_dim' and 'output_size'