In [1]:
import pandas as pd
df = pd.read_json('train.json', lines = True)
df_test = pd.read_json('test.json', lines = True)

In [2]:
len(df)
df.head()

Unnamed: 0,context,response,rots,safety_label,safety_annotations,safety_annotation_reasons,source,etc,dialogue_id,response_id,episode_done
0,I tried a bit of shadowboxing today. Got one g...,"This isn't a joke about black people, right? I...",[It's not right to think black people deserve ...,__needs_caution__,"[needs caution, needs caution, needs caution]",[It's rude to make jokes about the color of pe...,sbic,,0,0,False
1,"You're right, it isn't funny. Finding enjoymen...",I am glad that you agree. Joking about abusing...,[It's not right to think black people deserve ...,__casual__,"[casual, casual, casual]",[The speaker admits they spoke inappropriately...,sbic,,0,1,True
2,Did I tell you how I used to poke holes in con...,That's a pretty low down thing to do.,[It's immoral to poke holes in condoms.],__needs_intervention__,"[needs caution, needs intervention, needs inte...",[Speaker admits to committing a seriously wron...,socialchemistry,reddit/confessions/bsajnv,1,0,False
3,I wanted to prank others like that,You should not poke holes in condoms. You coul...,[It's harmful to tamper with other peoples con...,__needs_intervention__,"[needs caution, needs intervention, needs inte...","[Speaker explains why they did the act., They ...",socialchemistry,reddit/confessions/bsajnv,1,1,False
4,It's harmless. I doesn't hurt anyone. I just w...,You're not thinking. How do you conclude expos...,[It's normal for people to be alarmed by an un...,__needs_intervention__,"[needs caution, needs intervention, needs inte...",[Speaker doesn't see why what they did was wro...,socialchemistry,reddit/confessions/bsajnv,1,2,True


In [3]:
!nvidia-smi

In [4]:
df['text'] = df['context'].astype('str')+' '+df['response'].astype('str')
df.iloc[0].text

"I tried a bit of shadowboxing today. Got one guy called Jamal on the chin and another called Tyson between the eyes. This isn't a joke about black people, right? If it is, it isn't funny."

In [5]:
#split train set to train and validation
from sklearn.model_selection import train_test_split
train_input, val_input , train_labels, val_labels =  train_test_split (df, df['safety_label'] , test_size = .1, random_state = 123456789)

In [6]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install accelerate

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Auto Tokenizer from transformers

In [8]:
#tokenize the input:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
max_length = 128
# def tokenize(element, max_length = 128):
#   outputs = tokenizer(element['text'], truncation= True, max_length = max_length,
#                       return_overflowing_tokens = True,return_length = True,)

#   input_batch = []
#   for length, input_ids in zip(outputs['length'], outputs['input_ids']):
#     if length == max_length:
#       input_batch.append(input_ids)
#   return {"input_ids": input_batch}

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
        return_length=True,
        padding="max_length",
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == 128:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [9]:
# pip install datasets

In [10]:
from datasets import DatasetDict, Dataset
raw_datasets = DatasetDict({
    "train":Dataset.from_pandas(train_input),
    "valid": Dataset.from_pandas(val_input),
})

In [11]:
#tokenize the train and validation sets:
tokenized_dataset = raw_datasets.map(tokenize, batched= True, remove_columns=raw_datasets["train"].column_names+['__index_level_0__'])

  0%|          | 0/109 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [12]:
len(raw_datasets['train']), len(train_input), len(tokenized_dataset['train']), len(tokenized_dataset)

(108212, 108212, 108278, 2)

In [13]:
tokenized_dataset['train']

Dataset({
    features: ['input_ids'],
    num_rows: 108278
})

## Build a model based on pre-trained text-generation. E.g., GPT2LMHeadModel, to compare the results against.

In [14]:
from transformers import GPT2LMHeadModel, AutoConfig
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=max_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [15]:
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))
model.parameters()

<generator object Module.parameters at 0x7f3712101a50>

In [16]:
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [17]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)

In [18]:
#create datacollator for train set and validation set
batch_size = 16
from torch.utils.data import DataLoader, RandomSampler
train_dataloader = DataLoader(tokenized_dataset['train'], sampler = RandomSampler(tokenized_dataset['train']), batch_size = batch_size,collate_fn = data_collator)
validation_dataloader = DataLoader(tokenized_dataset['valid'], sampler = RandomSampler(tokenized_dataset['valid']), batch_size = batch_size,collate_fn = data_collator)

In [19]:
import torch

In [20]:
import random
seed_val = 12345678
random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

## Fine-tune the model based on current dataset

In [19]:
#fine-tuning the model
import logging
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from accelerate import Accelerator

#use CPU if the GPU is not avaiable
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if not torch.cuda.is_available():
  logging.warning('GPY is not available. The modeing is being trained on CPU!')


'''max of epochs. Change this responsibily. Higher values for epochs can cause overfitting.
'''
MAX_EPOCH = 2

warmup_steps = 1e2
total_steps = len(train_dataloader)*MAX_EPOCH

#preapre the model for training
model.to(device)
model.train()

BSIZE =batch_size
#set the optimizer
optim = AdamW(model.parameters(), lr=5e-5,eps = 1e-8)


accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optim, train_dataloader, validation_dataloader)

#set the criterion to compute error
criterion = torch.nn.CrossEntropyLoss()
#print the performance metrics over the training phase every iteration_reset=100
iteration_reset = 100

# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

#main loop for epochs
for epoch in range(MAX_EPOCH):
    train_loss = 0.0
    val_loss = 0.0
    loss_sum = 0.0

    #batch training loop
    for i,batch in enumerate(train_dataloader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        model.zero_grad() #?????
        outputs = model(input_ids, labels = labels, attention_mask=attention_mask, token_type_ids=None)
        # loss = outputs[0]
        # loss = criterion(outputs.view(-1, tokenizer.vocab_size),labels.view(-1))
        loss  = outputs[0]
        loss_sum += loss.item()
        train_loss += loss
        loss.backward()
        optim.step()
        scheduler.step()

        #printing loss
        if i% iteration_reset == 0 and i>0:
            print('Epoch {0}/{1}, batch {2}/{3}, Loss: {4}'.format(epoch,MAX_EPOCH-1,i,len(train_dataloader),
                                                                   loss_sum/(iteration_reset*BSIZE)))
            loss_sum = 0
    #compute model performance during training, and after each epoch:
    model.eval() #put the model to the val phase; in other words, turn off the training phase for this model
    model.eval()
    with torch.no_grad():
      for i, batch in enumerate(validation_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, labels = labels, attention_mask = attention_mask)
        loss = outputs[0].item()
        val_loss +=loss
    print('Epoch {0} finished with train loss of {1} and validation loss of {2}'.format(epoch, train_loss/(BSIZE*len(train_dataloader)), val_loss/(BSIZE*len(validation_dataloader))))
    model.train() #put the model to the train phase..
    model.train()


model.eval()
model.eval() #finally, we set off the training and put the model into validation for prediction use only

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0/1, batch 100/6768, Loss: 0.24126736402511598
Epoch 0/1, batch 200/6768, Loss: 0.1499951833486557
Epoch 0/1, batch 300/6768, Loss: 0.12903139099478722
Epoch 0/1, batch 400/6768, Loss: 0.12330042116343976
Epoch 0/1, batch 500/6768, Loss: 0.11765031956136227
Epoch 0/1, batch 600/6768, Loss: 0.11545532591640949
Epoch 0/1, batch 700/6768, Loss: 0.11313444435596466
Epoch 0/1, batch 800/6768, Loss: 0.11177447743713856
Epoch 0/1, batch 900/6768, Loss: 0.1089200534671545
Epoch 0/1, batch 1000/6768, Loss: 0.10869546681642532
Epoch 0/1, batch 1100/6768, Loss: 0.10828962601721287
Epoch 0/1, batch 1200/6768, Loss: 0.1059887908399105
Epoch 0/1, batch 1300/6768, Loss: 0.10550639398396015
Epoch 0/1, batch 1400/6768, Loss: 0.10557013876736164
Epoch 0/1, batch 1500/6768, Loss: 0.10401114657521247
Epoch 0/1, batch 1600/6768, Loss: 0.10481816731393337
Epoch 0/1, batch 1700/6768, Loss: 0.10374127559363842
Epoch 0/1, batch 1800/6768, Loss: 0.10372428111732006
Epoch 0/1, batch 1900/6768, Loss: 0.0994

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [21]:
# model.save_pretrained('gpt2-fine_tuned_26',from_pt=True)

In [22]:
# torch.save(model.state_dict(), 'gpt2-fine_tuned_26_2')

In [23]:
torch.cuda.empty_cache()

### Load the model

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
model = GPT2LMHeadModel(config)
model.load_state_dict(torch.load('gpt2-fine_tuned_26_2'))
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

## Check the model performance based on the test set: (Blue metrics and prexibility)

In [12]:
import pandas as pd
df_test = pd.read_json('test.json', lines = True)
df_test['text'] = df_test['context'].astype('str')+' '+df_test['response'].astype('str')

In [13]:
test_sample = df_test.sample(frac = .005, random_state=123456, replace = False)
len(test_sample)

125

In [14]:
#tokenize the test set:
from datasets import DatasetDict, Dataset
test_datasets = DatasetDict({
    "test":Dataset.from_pandas(test_sample),
})
#tokenize the train and validation sets:
tokenized_test_dataset = test_datasets.map(tokenize, batched= True, remove_columns=test_datasets["test"].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
from torch.utils.data import DataLoader, RandomSampler
batch_size = 16
test_dataloader = DataLoader(tokenized_test_dataset['test'], sampler = RandomSampler(tokenized_test_dataset['test']), batch_size = batch_size,collate_fn = data_collator)

In [16]:
# first, create the output for the test set:
import numpy as np
model.eval()
test_loss =0.00
perp = []
for i , batch in enumerate(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    model.zero_grad()
    outputs = model(input_ids, labels = labels, attention_mask = attention_mask, token_type_ids = None)
    loss = outputs[0]
    test_loss+=loss
    perp.append(loss.detach().to('cpu').tolist())
print('total test loss is : {0}'.format(test_loss/len(tokenized_test_dataset['test'])))
print(np.exp(np.mean(perp)))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


total test loss is : 0.24054135382175446
42.88226538304938


In [17]:
df_test['text'] = df_test['context']
df_test['labels'] = df_test['response']
df_test_sample = df_test.sample(frac = .005, random_state=123456, replace = False)
len(df_test_sample)

125

In [30]:
#tokenize the test set:
from datasets import DatasetDict, Dataset
test_datasets = DatasetDict({
    "test":Dataset.from_pandas(df_test_sample),
})
#tokenize the train and validation sets:
tokenized_test_sample = test_datasets.map(tokenize, batched= True, remove_columns=test_datasets["test"].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
tokenized_test_sample

DatasetDict({
    test: Dataset({
        features: ['input_ids'],
        num_rows: 25
    })
})

In [26]:
# from torch.utils.data import DataLoader, RandomSampler
# batch_size = 16
# test_sample_dataloader = DataLoader(tokenized_test_dataset['test'], sampler = RandomSampler(tokenized_test_sample['test']), batch_size = batch_size,collate_fn = data_collator)

In [18]:
pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
device

device(type='cuda')

### Get the Bleu and Perplexity metrics on the test set

In [19]:
from torchmetrics.text import BLEUScore
import evaluate

bleu = evaluate.load('bleu')
model.to(device)
# bleu = BLEUScore()
model.eval()
res = 0.00
ins = [];outs=[]
for _,row in df_test_sample.iterrows():
    output = model(tokenizer.encode(row.context,  return_tensors='pt').to(device))
    out = tokenizer.decode(torch.argmax(output.logits, axis = 2)[0])
    ins.append([row.context])
    outs.append(out)
#     res =bleu.compute(predictions=[out], references= [row[1].response],tokenizer = tokenizer)['bleu']

# print('total bleu is  : {0}'.format(res))
bleu.compute(predictions=outs, references= ins)

  warn(f"Failed to load image Python extension: {e}")


{'bleu': 0.030849477664881848,
 'precisions': [0.31100963977676305,
  0.05796316359696641,
  0.01336432306798373,
  0.0037593984962406013],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0035641547861507,
 'translation_length': 1971,
 'reference_length': 1964}

In [16]:
bleu.compute(predictions=outs, references= ins)

{'bleu': 0.0,
 'precisions': [0.005574912891986063, 0.0, 0.0, 0.0],
 'brevity_penalty': 0.6916735972033735,
 'length_ratio': 0.7306517311608961,
 'translation_length': 1435,
 'reference_length': 1964}

In [20]:
# from torchmetrics.text import Perplexity
# import evaluate

# prep = Perplexity
# # model.to(device)
# model.eval()
# res = 0.00
# ins, outs = [], [];
# for _,row in df_test_sample.iterrows():
#     output_tensor = model(tokenizer.encode(row.context,  return_tensors='pt').to(device))
#     out = tokenizer.decode(torch.argmax(output.logits, axis = 2)[0])
#     in_tensor = tokenizer.encode(row.context,  return_tensors='pt').to(device)
#     ins.append([in_tensor])
#     outs.append(out)
# prep.update(ins, outs)

In [21]:
# from torchmetrics.text import Perplexity
# prep = Perplexity()
# in_tensor = tokenizer.encode(row.context,  return_tensors='pt').to(device)
# out_tensor = model(tokenizer.encode(row.context,  return_tensors='pt').to(device)).logits[0]
# perp.update(in_tensor, out_tensor)

In [22]:
# type(tokenizer.encode(row.context,  return_tensors='pt').to(device))
type(model(tokenizer.encode(row.context,  return_tensors='pt').to(device)).logits[0])

torch.Tensor

In [51]:
tokenizer.encode(row[1].response, return_tensors='pt').to(device).shape, output.logits.shape
#PAS 12 va 17 chi hastan??

(torch.Size([1, 51]), torch.Size([1, 15, 50258]))

In [21]:
#metrics using perprelxity :

from torchmetrics.text import Perplexity
perp = Perplexity(ignore_index=-100)
res = 0.00
for row in df_test_sample.iterrows():

    output = model(tokenizer.encode(row[1].context,  return_tensors='pt').to(device))
    
    res +=perp(output.logits, tokenizer.encode(row[1].response, return_tensors='pt').to(device))

print('total perp is  : {0}'.format(res))

In [22]:
# [out], [[row[1].response]]

In [23]:
# getting output for one single item 
input_id_sample = tokenizer.encode('this is a very nice and warm day outside.', return_tensors = 'pt')
outputs = model(input_id_sample.to(device))
output_sample = tokenizer.decode(torch.argmax(outputs.logits, axis = 2)[0])
output_sample

In [None]:
!pip3 install torchmetrics

In [None]:
!pip install bert-score

In [52]:
# from torchmetrics import BERTScore
from torchmetrics.text import BLEUScore
bleu = BLEUScore()
preds = ['the cat is on the mat']
target = [['a cat is on the mat']]

bleu(preds, target)
bleu = BLEUScore()
bleu(['it is nice'], [['it is very nice']])

tensor(0.)

In [41]:
input_id_sample, output_sample, labels

(tensor([[5661,  318,  257,  845, 3621,  290, 5814, 1110, 2354,   13]]),
 ' is a joke good thing I.. of I',
 tensor([[   40,   745, 19458,  ...,  -100,  -100,  -100],
         [ 3347,  1297,   502,  ...,  -100,  -100,  -100],
         [ 1544,   338,   616,  ...,  -100,  -100,  -100],
         ...,
         [11633,   314,  1683,  ...,  -100,  -100,  -100],
         [   40,   836,   470,  ...,  -100,  -100,  -100],
         [ 1026,   318,  1165,  ...,  -100,  -100,  -100]], device='cuda:0'))

In [None]:
import torchmetrics
import bert_score

### Generate output based on the tutorial 

In [37]:
model.eval()
prompt = "<|startoftext|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
print(generated)

tensor([[  27,   91, 9688, 1659, 5239,   91,   29]])
