In [None]:
import os, sys
from pathlib import Path
HOME = os.getcwd()

current = HOME 
while 'src' not in os.listdir(current):
    current = Path(current).parent

PARENT_DIR = str(current)
DATA_FOLDER = os.path.join(PARENT_DIR, 'src','data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

sys.path.append(str(current))
sys.path.append(os.path.join(str(current), 'data_analysis'))
sys.path.append(os.path.join(str(current), 'evaluation'))
sys.path.append(os.path.join(str(current), 'text_processing')) 

In [None]:
from transformers import BartForConditionalGeneration, AutoTokenizer, AutoModelForSequenceClassification
checkpoint = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint, num_labels=2)

In [4]:
DATA_FOLDER = os.path.join(os.getcwd(), '..', 'data')
DATASET_FILE = os.path.join(DATA_FOLDER, 'raw', 'filtered.tsv')
MODEL_FOLDER = os.path.join(os.getcwd(), '..', 'models')
MODEL_PREFIX = os.path.join(MODEL_FOLDER, 'tokenizer')
VOCAB_SIZE = 10000

In [5]:
pd_data = pd.read_csv(os.path.join(DATA_FOLDER, 'raw', 'filtered.tsv'), sep='\t')

In [6]:
# Prepare data for training
source = pd_data['translation'].tolist()
target = pd_data['reference'].tolist()

In [7]:
import torch
import numpy as np

torch.manual_seed(705)
np.random.seed(705)

# Split data
from sklearn.model_selection import train_test_split

source_val_train, source_test, target_val_train, target_test = train_test_split(source, target, test_size=0.2)
source_train, source_val, target_train, target_val = train_test_split(source_val_train, target_val_train, test_size=0.2)

In [8]:
def tokenize_function(examples):
    inputs = tokenizer.batch_encode_plus(
        examples['translation'], 
        max_length=512, 
        padding='max_length',
        return_tensors='pt'
    )
    
    outputs = tokenizer.batch_encode_plus(
        examples['reference'], 
        max_length=512, 
        pad_to_max_length=True, 
        padding='max_length',
        return_tensors='pt'
    )
    
    batch = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': outputs['input_ids'],
        'decoder_input_ids': outputs['input_ids'],
        'decoder_attention_mask': outputs['attention_mask']
    }
    
    return batch

In [9]:
from datasets import Dataset

train_dataset = Dataset.from_dict({'translation': source_train, 'reference': target_train})
val_dataset = Dataset.from_dict({'translation': source_val, 'reference': target_val})
test_dataset = Dataset.from_dict({'translation': source_test, 'reference': target_test})

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=512, num_proc=6)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=512, num_proc=6)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=512, num_proc=6)

Map (num_proc=6):   0%|          | 0/369776 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=6):   0%|          | 0/92445 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=6):   0%|          | 0/115556 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [52]:
# Train model

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

batch_size = 30
num_epochs = 1
learning_rate = 5e-5
warmup_steps = 500
weight_decay = 0.01


training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_FOLDER,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=1000,
    save_steps=1000,
    eval_steps=1000,
    overwrite_output_dir=True,
    warmup_steps=warmup_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    report_to="none"
)

In [15]:
# Sanity check
sc_train_data = train_dataset.select(range(100))
sc_dataset = sc_train_data.map(tokenize_function, batched=True, batch_size=512, num_proc=6)

Map (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [43]:
# Train model   

sc_training_args = Seq2SeqTrainingArguments(
    output_dir='/tmp',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=100000,
    eval_steps=10,
    overwrite_output_dir=True,
    warmup_steps=warmup_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    fp16=True,
    report_to="none",
    num_train_epochs=100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=sc_training_args,
    train_dataset=sc_train_data,
    eval_dataset=val_dataset,
)

In [28]:
trainer.train()

Step,Training Loss
100,6.6106
200,1.4389
300,0.2476
400,0.1531


TrainOutput(global_step=400, training_loss=2.1125432896614074, metrics={'train_runtime': 148.1723, 'train_samples_per_second': 67.489, 'train_steps_per_second': 2.7, 'total_flos': 1353418014720000.0, 'train_loss': 2.1125432896614074, 'epoch': 100.0})

In [32]:
train_sample = train_dataset[4]
train_sample['translation'], train_sample['reference']

("I know you hate me, but I don't make a cow out of myself, and I don't lecture you.",
 "I know you hate me. But this isn't me being some overbearing bitch.")

In [39]:
# Test model

for i in range(len(val_dataset)):

    input_ids = train_sample['input_ids']
    attention_mask = train_sample['attention_mask']

    outputs = model.generate(
        input_ids=torch.tensor(input_ids).unsqueeze(0).to('cuda'),
        attention_mask=torch.tensor(attention_mask).unsqueeze(0).to('cuda'),
        max_length=512,
        num_beams=5,
        early_stopping=True
    )

    tokenizer.decode(outputs[0])

"<pad><pad> I hate hate me, I know you hate hate hate hate hate hate hate hate me hate hate hate hate hate hate hate hate me me hate hate hate hate hate hate hate hate hate hate me, but I don''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''</s>"

In [53]:
end_trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [55]:
end_trainer.train()

Step,Training Loss
1000,1.4894
2000,0.0256
3000,0.0068
4000,0.0027
5000,0.0014
6000,0.0009
7000,0.0006
8000,0.0004
9000,0.0003
10000,0.0002


TrainOutput(global_step=12326, training_loss=0.12402545101608178, metrics={'train_runtime': 5384.0679, 'train_samples_per_second': 68.68, 'train_steps_per_second': 2.289, 'total_flos': 5.009081277559603e+16, 'train_loss': 0.12402545101608178, 'epoch': 1.0})