In [144]:
%load_ext autoreload
%autoreload 2

from dataset import CausalLMDataset
# from torch.utils.data import DataLoader, Dataset

import torch
import os

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    GPT2Tokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

from transformers import GPT2LMHeadModel
from torch.optim import AdamW

import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [145]:
# read data
df = pd.read_csv('data/cleaned_data_labeled.csv', index_col=0)

# train-test split
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

train_df.head()

Unnamed: 0,review_text,rating,sentiment,gendered,has_gender,modified text,outputs,masked_output
897,I researched many breast pumps on line before ...,5.0,positive,"['daughter', 'she', 'husband', 'her', 'daughter']",True,i researched many breast pumps on line before ...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...
262,This was a subject of much conversation when I...,5.0,positive,['man'],True,this was a subject of much conversation when i...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...
151,I bought this 3 in 1 for my fiance' because he...,5.0,positive,['he'],True,i bought this 3 in 1 for my fiance 'because [M...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...
865,Loved our Marathon until our 18 month old reac...,1.0,negative,['he'],True,loved our marathon until our 18 month old reac...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...,[NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM] [NIM...
174,My husband is a BMW mechanic and he drives car...,5.0,positive,"['husband', 'he', 'him']",True,my [MASK] is a bmw mechanic and [MASK] drives ...,[NIM] husband [NIM] [NIM] [NIM] [NIM] [NIM] he...,[NIM] [MAL] [NIM] [NIM] [NIM] [NIM] [NIM] [MAL...


In [146]:
import pandas as pd

# read data
df = pd.read_csv('data/cleaned_data_labeled.csv', index_col=0)

# train-test split
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)


# tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(
    'distilgpt2',
    pad_token = '<|endoftext|>'  # original tokenizer does not have pad token
)

# datasets
train_dataset = CausalLMDataset(
    df = train_df,
    text_col = 'review_text'
)
val_dataset = CausalLMDataset(
    df = val_df,
    text_col = 'review_text'
)
test_dataset = CausalLMDataset(
    df = test_df,
    text_col = 'review_text'
)
print(train_dataset[0])

{'text': "i was blown away by the wonderful acting of juliet lewis and giovanni ribisi who played the retarded young adults. this was a very compassionate film about the realities of living with a disability. disabled people desire and deserve to have the same opportunities as everyone else. it was wonderful to see the love carla 's siblings and parents had for her. there were 2 reasons i didn't give this movie 5 stars. one was that diane keaton's character of the overly controlling mother was annoying. obviously she was supposed to be annoying, but i think she overacted a bit and she needed better lines, she seemed to say the same stuff over and over again. another reason i didn't love this film was because hollywood once again compels right wing nut jobs to write reviews that portray themselves as normal and good. sorry, hollywood, but there will always be those of us who still believe god's word to be true. being a right wing nut job isn't considered normal or ok in the bible. god n

In [142]:
class CausalLanguageModelingCollate:
    
    '''
    collate_fn in dataloader is used for post processing on a single batch. Like __getitem__ in dataset class
    is used on single example
    '''
    
    def __init__(
        self, 
        tokenizer,
        _tok_return_tensors = 'pt',
        _tok_max_length = None,
        _tok_truncation = False,
        _tok_padding = False
    ):
        
        self.tokenizer = tokenizer
        self._tok_return_tensors = _tok_return_tensors
        self._tok_max_length = _tok_max_length
        self._tok_truncation = _tok_truncation
        self._tok_padding = _tok_padding
        
    
    def __call__(self, batch):
        '''
        __call__: a default method
        First the obj is created using MyCollate(pad_idx) in data loader
        Then if obj(batch) is called -> __call__ runs by default
        
        Overwrites `input_ids` element of each input in `batch` with a partially-masked version.
        '''
        
        # grab text
        batch_texts = [example['text'] for example in batch]

        # tokenize texts with the tokenizer: ['quick', 'brown', 'fox'] -> [12, 2, 9, 0]
        # ['jumps' 'over', 'the'] -> [9, 12, 45, 0]
        # ['the' lazy', 'dog' '.'] -> [123,456,789,098]
        tokenized_batch = self.tokenizer(
            batch_texts,
            return_tensors = self._tok_return_tensors, 
            max_length = self._tok_max_length, 
            truncation = self._tok_truncation, 
            padding = self._tok_padding
        )

        # duplicate inputs as targets
        tokenized_batch['labels'] = tokenized_batch.input_ids.detach().clone()

        return tokenized_batch

    

In [5]:
from torch.utils.data import DataLoader
from collate_fns import DialogCollate

# collate object
collate_fn = CausalLanguageModelingCollate(
    tokenizer = tokenizer,
    _tok_return_tensors = 'pt',
    _tok_max_length = 512,
    _tok_truncation = True,
    _tok_padding = 'longest'
)

# train dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size = 2,
    shuffle = True,
    collate_fn = collate_fn
)
val_loader = DataLoader(
    val_dataset,
    batch_size = 2,
    shuffle = False,
    collate_fn = collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size = 2,
    shuffle = False,
    collate_fn = collate_fn
)

In [6]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|endoftext|>'}

In [7]:
# example usage
for batch in val_loader:
    break

example_id = 0
print(f'input_ids of example {example_id} in batch:\n',       batch['input_ids'][example_id])
print(f'target_ids of example {example_id} in batch:\n',      batch['labels'][example_id])
print(f'attention_masks of example {example_id} in batch:\n', batch['attention_mask'][example_id])

tokenizer.decode(batch['input_ids'][example_id])

input_ids of example 0 in batch:
 tensor([   72,   655,   836,   470,   760,   810,   284,   923,   764,   612,
          655,  1595,   470,  1283,   284,   307,   257,   835,   284,  6901,
          262,  4467,   764,   345,   481,   655,   423,   284,  1949,   606,
          319,   284,  1975,   340,   764,   616,  4780,   837,  2802,   837,
         5229,   290,  1312,   477, 21192,   416,   428,  1720,   764,  2453,
          645, 21436,  3508,   764, 10966,  2644,   645,   837,   475,   262,
         4467,   481,  6611,   345,  1497, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 5

"i just don't know where to start. there just doesn't seem to be a way to describe the comfort. you will just have to try them on to believe it. my neighbor, mother, husband and i all swear by this product. accept no substitutions. attractive... no, but the comfort will blow you away<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><

Explanation: For GPT-2 models, the `sos`, `bos`, `unk`, and `pad` are all represetned by the same token: `'<|endoftext|>'`.

In [8]:
# infinite positive sample loader

class InfiniteDataGen:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.epoch_count = 0
    
    def generate(self): 
        while True:
            for batch in self.dataloader:
                yield batch
            self.epoch_count += 1
            

def create_json(target_dir, filename):
    json_as_string = json.dumps({}, indent=4, sort_keys=False)
    
    if not os.path.exists(target_dir): 
        os.makedirs(target_dir)
    
    with open(os.path.join(target_dir, filename), "w") as outfile:
        outfile.write(json_as_string)
            

def update_json(target_dir, filename, dict_to_save={}):
    
    f = open(os.path.join(target_dir, filename), "r")
    old_dict = json.loads(f.read())
    old_dict.update(dict_to_save)

    json_as_string = json.dumps(old_dict, indent=4, sort_keys=False)
    with open(os.path.join(target_dir, filename), "w") as outfile:
        outfile.write(json_as_string)
    
    f.close()

In [9]:
# create_json(
#     target_dir = CONFIG['LOGGING_DIR'],
#     filename = CONFIG['EXPERIMENT_NAME'] + '.json'
# )

In [10]:
# update_json(
#     target_dir = CONFIG['LOGGING_DIR'],
#     filename = CONFIG['EXPERIMENT_NAME'] + '.json',
#     dict_to_save = {0: {'pos_loss': 0.1, 'neg_loss': 0.5}}
# )

In [11]:
# infinite_positive_data_obj = InfiniteDataGen(notok_loader)
# infinite_positive_data_gen = infinite_positive_data_obj.generate()

In [12]:
# num_batches_processed = 0

# for _ in range(5000):
    
#     batch = next(infinite_positive_data_gen)
    
#     if num_batches_processed % 10 == 0:
#         print(f'\rNumber of batches processed: {num_batches_processed}, epoch {infinite_positive_data_obj.epoch_count}', end='', flush=True)
        
        
#     num_batches_processed += 1
    

In [13]:
CONFIG = {
    'EXPERIMENT_NAME': 'initial',
    
    'START_ITER': 0,
    'TRAIN_ITERS': 20,
    
    'LOGGING_DIR': './logs/',
    'MODEL_SAVE_DIR': './checkpoints/',
    
    'HUGGINGFACE_MODEL_NAME': 'distilgpt2',
    'POS_LR': 0.001,
    'NEG_LR': 0.001,
    'UPDATES_PER_BATCH': 20,
    
    'EXAMPLE_WEIGHT_MODE': 'decay',
    'EXAMPLE_WEIGHT_CARE_MODE': 'sample_avg',
    'EXAMPLE_WEIGHT_REJECTION_THRESHOLD': -7.0,
    
}

In [14]:
# from transformers import GPT2LMHeadModel
# from torch.optim import AdamW

# torch.manual_seed(0)
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# MODEL_SAVE_DIR = 'checkpoints/test/'

# if not os.path.exists(MODEL_SAVE_DIR):
#     os.makedirs(MODEL_SAVE_DIR)
    
# model = GPT2LMHeadModel.from_pretrained("microsoft/DialoGPT-medium") 
# model.to(DEVICE)
# model.train()
# print('Number of model parameters:', count_parameters(model))

# # optimizer
# from torch.optim import AdamW
# optimizer = AdamW(model.parameters(), lr=5e-5)

In [120]:
from transformers import GPT2LMHeadModel
from torch.optim import AdamW
from torch.autograd import Variable


class CausalLMWrapper:

    
    def __init__(self, model_name, opt_lr, tokenizer, device):
        self.model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
        self.optimizer = AdamW(self.model.parameters(), lr=opt_lr)
        self.opt_lr = opt_lr
        self.tokenizer = tokenizer
        self.device = device

        
    def reset_optimizers(self, opt_lr=None):
        self.optimizer = AdamW(self.model.parameters(), lr=opt_lr if opt_lr is not None else self.opt_lr)
    
    
    def maximum_likelihood_step(self, batch):
        '''
        Performs a standard (min. negative log likelihood / max. log likelihood)
        gradient update on `self.model` using `batch`.
        '''

        self.optimizer.zero_grad()

        # max sequence length
        seq_len = batch['input_ids'].shape[1]
        
        # each tensor to device
        input_ids = batch['input_ids'].to(self.device)
        labels    = batch['labels'].to(self.device)
        # print(input_ids.shape, position_ids.shape, token_type_ids.shape, target_ids.shape, attention_mask.shape)

        # get model outputs
        outputs = self.model(
            input_ids = input_ids,
            labels = labels
        )

        # calculate the loss
        loss = outputs.loss
        loss.backward()
        
        # consider gradient clipping here before updating
        
        self.optimizer.step()
        
        return  {
            'loss': loss
        }
    
    
    def train_maximum_likelihood_one_epoch(train_loader):
        '''
        Performs one epoch of maximum likelihood training.
        '''
        batch_wise_losses = []
        for batch_id, batch in enumerate(train_loader):

            print(f'\rProcessing neg batch {batch_id} of {len(train_loader)}', end='', flush=True)

            self..optimizer.zero_grad()
            results = self..maximum_likelihood_step(batch)
            batch_wise_losses.append(results['loss'].item())

        return {
            'batch_wise_losses': batch_wise_losses,
            'average_loss': np.array(batch_wise_losses).mean(),
        }
    
    
    def forward_logprobs_of_batch(self, batch):
        '''
        Returns the forward pass results of `batch` in their (masked) log-prob form.
        Masking is performed on the basis of `ignore_index` (TODO: change 
        `ignore_index` from a magic number to a proper parameter); all tokens with
        value `ignore_index` will contribute a log probability of zero.
        
        Returns a tensor of shape (batch_size, seq_len)
        '''

        # max sequence length
        seq_len = batch['input_ids'].shape[1]
        
        # each tensor to device
        input_ids = batch['input_ids'].to(self.device)
        labels    = batch['labels'].to(self.device)
        # print(input_ids.shape, position_ids.shape, token_type_ids.shape, target_ids.shape, attention_mask.shape)

        # get model outputs
        outputs = self.model(
            input_ids = input_ids,
            labels = labels
        )
        
        # calculate the masked logprobs
        masked_logprobs_of_true_labels, ignore_index_mask = \
        CausalLMWrapper._calculate_masked_logprobs(
            logits = outputs.logits,  # (batch_size, seq_len)
            labels = labels,  # (batch_size, seq_len, vocab_size)
            ignore_index = -100,  # TODO: remove this magic number
        )
        
        return  {
            'masked_logprobs': masked_logprobs_of_true_labels,  # (batch_size, seq_len)
            'ignore_index_mask': ignore_index_mask  # (batch_size, seq_len)
        }
    
    
    def nll_on_dataset(self, dataloader):
        '''
        Returns the average negative log-likelihood on the examples in a `dataloader`
        '''
        batch_wise_losses = []
        len_loader = len(dataloader)
        for batch_id, batch in enumerate(dataloader):

            if batch_id % 10 == 0:
                print(f'\rEvaluating batch: {batch_id} of {len_loader}', end='', flush=True)

            # calculate negative log likelihood loss
            logprobs_result = self.forward_logprobs_of_batch(batch)
            example_wise_loss = torch.sum(logprobs_result['masked_logprobs'], dim = 1)
            loss = -example_wise_loss.sum() / logprobs_result['ignore_index_mask'].sum()
            batch_wise_losses.append(loss.item())
            
        return np.array(batch_wise_losses).mean()
    
    
    def batch_generate_samples_unconditional(self, num_samples, out_file, batch_size=8, max_new_tokens=100):
        '''
        Sample `num_samples` samples from the model, `batch_size` samples at a time.
        Sampling is done unconditionally (no conditioning prompt) and independently of each other.
        Saves the samples into `out_file`.
        '''
        
        self.model.eval()
        
        with open(out_file, 'w') as fout:
            
            num_generated_samples = 0
            while num_generated_samples < num_samples:

                print(f'\rGenerated {num_generated_samples} of {num_samples}', end='', flush=True)

                outputs = self.model.generate(
                    max_new_tokens = max_new_tokens,
                    pad_token_id = self.tokenizer.eos_token_id,
                    do_sample = True,  # do_sample = True; otherwise all questions will be identical
                    top_p = 0.95,      # nucleus sampling
                    top_k = 0,         # deactivate top-k words sampling
                    num_return_sequences = batch_size
                ).cpu().tolist()
                num_generated_samples += batch_size
                
                outputs = [
                    [str(token) for token in token_seq if token != self.tokenizer.eos_token_id]
                    for token_seq in outputs
                ]
                
                outputs_tokens_joined = [' '.join(token_seq) for token_seq in outputs]
                
                
                # write the tokens to the output file
                fout.write('\n'.join(outputs_tokens_joined))
        
        
    @staticmethod
    def _onehot_maskgen(sz, idx):
        msk = torch.BoolTensor(sz)
        msk.fill_(False)
        msk[torch.LongTensor(range(sz[0])), idx.cpu()] = True
        if idx.is_cuda == True:
            msk = msk.cuda()
        return Variable(msk)
        
    
    @staticmethod
    def _calculate_masked_logprobs(logits, labels, ignore_index):
        batch_size = logits.shape[0]
        seq_len    = logits.shape[1]
        vocab_size = logits.shape[-1]

        # shift labels to align with logits
        shift_labels = labels[..., 1:].contiguous()

        # and truncate logits to make shapes the same
        trunc_logits = logits[..., :-1, :].contiguous()

        # flatten logits and labels
        flat_logits = trunc_logits.view(-1, trunc_logits.shape[-1])  # (batch_size*seq_len, vocab_size)
        flat_labels = shift_labels.view(-1)  # (batch_size*seq_len)

        # calculate the log probabilities
        flat_logprobs = torch.nn.functional.log_softmax(flat_logits, dim = -1)
        
        # for each example, grab the log probability corresponding to the true label
        onehot_labels = CausalLMWrapper._onehot_maskgen(flat_logprobs.size(), flat_labels.data).to(torch.bool)
        logprobs_of_true_labels = torch.masked_select(flat_logprobs, onehot_labels).view(batch_size, -1)  # (batch_size, seq_len)

        # mask out locations with value `ignore_index`
        ignore_index_mask = ~(shift_labels == ignore_index)  # 0 when value == ignore_index, 1 otherwise
        return logprobs_of_true_labels * ignore_index_mask, ignore_index_mask

    
    @staticmethod
    def _calculate_positive_phase_loss(logits, labels, ignore_index):
        '''
        Returns the (unweighted) negative log likelihood of a batch.
        '''
        
        # get masked logprobs
        masked_logprobs_of_true_labels, ignore_index_mask = \
        CausalLMWrapper._calculate_masked_logprobs(
            logits, labels, ignore_index
        )
        
        # loss
        example_wise_loss = torch.sum(masked_logprobs_of_true_labels, dim = 1)
        loss = - example_wise_loss.sum() / ignore_index_mask.sum()
        return loss

In [121]:
# training

# preparing logging
create_json(
    target_dir = CONFIG['LOGGING_DIR'],
    filename = CONFIG['EXPERIMENT_NAME'] + '.json'
)
    
# preparing model checkpointing
if not os.path.exists(CONFIG['MODEL_SAVE_DIR']): 
    os.makedirs(CONFIG['MODEL_SAVE_DIR'])

# model
model_wrapper = CausalLMWrapper(
    model_name = CONFIG['HUGGINGFACE_MODEL_NAME'],
    opt_lr = 0.001,
    tokenizer = tokenizer,
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

# collate object
collate_fn = CausalLanguageModelingCollate(
    tokenizer = tokenizer,
    _tok_return_tensors = 'pt',
    _tok_max_length = 512,
    _tok_truncation = True,
    _tok_padding = 'longest'
)

# train dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size = 2,
    shuffle = True,
    collate_fn = collate_fn
)
val_loader = DataLoader(
    val_dataset,
    batch_size = 2,
    shuffle = False,
    collate_fn = collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size = 2,
    shuffle = False,
    collate_fn = collate_fn
)

In [124]:
# example of generating text
samples = model_wrapper.batch_generate_samples_unconditional(
    num_samples = 32, 
    out_file = 'gen_data.csv',
    batch_size = 8, 
    max_new_tokens = 100
)

Generated 24 of 32

In [150]:
# datasets and dataloaders from the generated data
generated_df = pd.read_csv('gen_data.csv', header=None)
token_list = generated_df[0]

# dataset from tokens
generated_dataset = CausalLMDataset(
    token_list = token_list,
    tokenizer = tokenizer,
)
print(generated_dataset[0])

# collate object
generated_collate_fn = CausalLanguageModelingCollate(
    tokenizer = tokenizer,
    _tok_return_tensors = 'pt',
    _tok_max_length = 512,
    _tok_truncation = True,
    _tok_padding = 'longest'
)

# train dataloaders
generated_loader = DataLoader(
    generated_dataset,
    batch_size = 2,
    shuffle = True,
    collate_fn = generated_collate_fn
)

# example usage
for batch in generated_loader:
    break

example_id = 0
print(f'input_ids of example {example_id} in batch:\n',       batch['input_ids'][example_id])
print(f'target_ids of example {example_id} in batch:\n',      batch['labels'][example_id])
print(f'attention_masks of example {example_id} in batch:\n', batch['attention_mask'][example_id])

tokenizer.decode(batch['input_ids'][example_id])

{'text': "BloCK makes some interesting predictions that are not insurmountable. As such, after wondering what's going on on, you can really just drop it straight away.\n\n\nThe Mednecine Perspective\nA lot of these figures might be wondering if the fact that we now hear Akali was just a holdout and here they come from.\nRemember that Akali works pretty well in r/s (the girl like that). We already see her doing strong substandard things like"}
input_ids of example 0 in batch:
 tensor([   44,  1025,   722,   290, 30481, 15595, 32329,   357,  6144,    72,
            8,  6822,   503,   674, 11808,   319,   788,    12, 35352,  5436,
         5882,  3841,   543,  6774,   345,  2279,   345,   761,   284,   307,
         3910,   286,    13, 18067, 31026, 17056,   870, 36109,   656, 18892,
        12044,  1496,  4217,  4128, 17267,   625,   422,   257, 12744,  4217,
         1471, 15595,  4217,  5972,  2667,  8554, 21252,    82,   770, 18749,
          481,  1037,   284,  4545,   262, 46264,  

'Moothing and Creating Virtual Models (NNi) Check out our tutorial on then-editor Max Longford which brings you everything you need to be aware of. Getting Started Configuring Components into Bootstrapped PC Direct Import over from a Target PC Or Virtual PC Logging Using SSDs This FAQ will help to teach the quickest way to get started on a PC, without going all the way to a Target PC Nominator SMB machine. It is especially helpful if the already-in centelosi is in a business camp with Deutsche Bank—an alleged violation of the Missouri Constitution—but Pope Benedict repeatedly admitted that he\'s not trying to make a "lazy money," a stance that helped stop a series of controversial banking scandals.\n\n\n\n\n\nPelosi says that he didn\'t even think of Pope Francis before he left the state of Missouri in 1998, during which he has used Pope Benedict XVI as a "marketer" to make a charge that gays and'

In [18]:
# evaluate before training
mean_nll = model_wrapper.nll_on_dataset(train_loader)
print('\nNLL of model on train dataset:', mean_nll)

Evaluating batch: 3860 of 3863
NLL of model on train dataset: 6.045377352371631


In [110]:
batch_wise_losses = []
epoch_wise_losses = []
for it in range(CONFIG['START_ITER'], CONFIG['START_ITER'] + CONFIG['TRAIN_ITERS']):  

    # restart the optimizer at each pos-neg iteration
    model_wrapper.reset_optimizers()
    
    # train on one epoch
    res = model_wrapper.train_maximum_likelihood_one_epoch(
        train_loader      = train_loader
    )
    
    epoch_wise_losses.append(res['average_loss'])
    batch_wise_losses.extend(res['batch_wise_losses'])
    
    # evaluate validation performance
    # loss_test = evaluate(...)
    
# perform final validation performance
# loss_test_final = evaluate(...)

Processing neg batch 36 of 3863

KeyboardInterrupt: 

# Main

In [None]:
%load_ext autoreload
%autoreload 2

from dataset import CausalLMDataset
# from torch.utils.data import DataLoader, Dataset

import torch
import os

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    GPT2Tokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

from transformers import GPT2LMHeadModel
from torch.optim import AdamW

import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(
    'distilgpt2',
    pad_token = '<|endoftext|>'  # original tokenizer does not have pad token
)

####################################################################################################
# define networks
generator = CausalLMWrapper(
    model_name = CONFIG['HUGGINGFACE_MODEL_NAME'],
    opt_lr = 0.001,
    tokenizer = tokenizer,
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
# discriminator = ...

####################################################################################################
# get downstream corpus as "true data" distribution
df = pd.read_csv('data/cleaned_data_labeled.csv', index_col=0)

# true data dataset
positive_dataset = CausalLMDataset(
    df = df,
    text_col = 'review_text'
)

# collate object
positive_collate_fn = CausalLanguageModelingCollate(
    tokenizer = tokenizer,
    _tok_return_tensors = 'pt',
    _tok_max_length = 512,
    _tok_truncation = True,
    _tok_padding = 'longest'
)

# positive data dataloaders
positive_loader = DataLoader(
    positive_dataset,
    batch_size = 2,
    shuffle = True,
    collate_fn = positive_collate_fn
)

####################################################################################################
# pretrain generator using MLE
batch_wise_losses = []
epoch_wise_losses = []
for it in range(CONFIG['START_ITER'], CONFIG['START_ITER'] + CONFIG['TRAIN_ITERS']):  
    generator.reset_optimizers()
    res = generator.train_maximum_likelihood_one_epoch(train_loader = train_loader)
    # mean_nll = generator.nll_on_dataset(train_loader)
    epoch_wise_losses.append(res['average_loss'])
    batch_wise_losses.extend(res['batch_wise_losses'])
    
####################################################################################################
# pretrain discriminator
batch_wise_losses = []
epoch_wise_losses = []
for it in range(CONFIG['START_ITER'], CONFIG['START_ITER'] + CONFIG['TRAIN_ITERS']):  
    discriminator.reset_optimizers()
    res = discriminator.train_maximum_likelihood_one_epoch(train_loader = train_loader)
    # mean_nll = discriminator.nll_on_dataset(train_loader)
    epoch_wise_losses.append(res['average_loss'])
    batch_wise_losses.extend(res['batch_wise_losses'])
    
####################################################################################################
# adversarial training

rollout = Rollout(generator, 0.8)
# disc_loss_obj1 = Disc1()
# disc_loss_obj2 = Disc2()
# disc_loss_obj3 = Disc3()

print('#####################################################')
print('Start Adeversarial Training...\n')