In [None]:
!pip install -q transformers==4.27.3
!pip install -q accelerate==0.19.0
!pip install -q datasets==2.12.0
!pip install -q rouge_score
!pip install -q sacrebleu
!pip install -q wandb
!pip install -q sentencepiece
!pip install -q huggingface_hub

In [3]:
import torch
import transformers
import datasets
from transformers import AutoModel,AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
from itertools import chain
from transformers import Trainer,TrainingArguments,DataCollatorWithPadding,AutoModelForCausalLM
import pandas as pd
import re
from itertools import chain
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from datasets import load_metric
from tqdm.autonotebook import tqdm

from huggingface_hub import login
login()

In [None]:
class Config:
    checkpoint="EleutherAI/gpt-neo-125m"
    device='cuda' if torch.cuda.is_available() else 'cpu'

gpt_neo_125=AutoModelForCausalLM.from_pretrained(Config.checkpoint).to(Config.device)
tokenizer=AutoTokenizer.from_pretrained(Config.checkpoint)
special_tokens={"bos_token":"<|startoftext|>",
                "eos_token":"<|endoftext|>",
                "mask_token":"[mask]",
                "pad_token":"[pad]",}

other_tokens=["<|speaker-1|>","<|speaker-2|>"]
tokenizer.add_special_tokens(special_tokens)
tokenizer.add_tokens(other_tokens)

vocab=tokenizer.get_vocab()
gpt_neo_125.resize_token_embeddings(len(vocab))


## Datasets -preprocessing

In [None]:
# empethetic dialogue dataset from hf

emp_data=load_dataset("empathetic_dialogues")

def emp_in_id(data):
    input_ids=[]
    token_type_ids=[]
    labels=[]
    for idx in  tqdm(range(len(data['prompt']))):
        sp_1=["<|startoftext|>","<|speaker-1|>"]+tokenizer.tokenize(data['prompt'][idx])
        sp_2=["<|speaker-2|>"]+tokenizer.tokenize(data['utterance'][idx])+["<|endoftext|>"]
        input_ids.append(tokenizer.convert_tokens_to_ids(sp_1+sp_2))
        token_type_ids.append(tokenizer.convert_tokens_to_ids(["<|speaker-1|>"]*len(sp_1)+["<|speaker-2|>"]*len(sp_2)))
        labels.append(tokenizer.convert_tokens_to_ids(["[mask]"]*len(sp_1)+sp_2))
    return input_ids,token_type_ids,labels

emp_input_ids,emp_token_type_ids,emp_labels=emp_in_id(emp_data['test'])   

download={'input_ids':emp_input_ids,
 'token_type_ids':emp_token_type_ids,
 'labels':emp_labels}
import pickle
pickle.dump(download,open('emp_test.p','wb'))

In [30]:
# cnversation chat data  from kaggle

data = pd.read_csv("/kaggle/input/human-conversation-training-data/human_chat.txt", delimiter="\t",header=None)
data.columns=['chats']
chats=data['chats'].tolist()

pattern = r"(?=.*\bhi\b)(?=.*\bHuman 1:).*"
conv_ids=[idx for idx,turn in enumerate(chats) if re.match(pattern, turn, re.IGNORECASE)!=None]
dialogues=[chats[conv_ids[idx]:conv_ids[idx+1]] for idx in range(len(conv_ids)-1)]
dialogues=[['<|startoftext|>']+dialog+['<|endoftext|>'] for dialog in dialogues]
dialogues=[list(map(lambda x: x.replace('Human 1:','<|speaker-1|>').replace('Human 2:','<|speaker-2|>'),turn)) for turn in dialogues]



def input_ids_map(dialog):
    return list(map(lambda x:tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)),dialog))

def token_type_ids_map(input_id):
    token_type_id=[[tokenizer.vocab['<|speaker-1|>']]*len(turn) if id%2==0 else [tokenizer.vocab['<|speaker-2|>']]*len(turn) for id,turn in enumerate(input_id[1:-1])]
    token_type_id.insert(0,[tokenizer.vocab['<|speaker-1|>']])
    token_type_id.append([tokenizer.vocab['<|speaker-2|>']])
    return token_type_id

def labels_map(input_id):
    label=[[tokenizer.vocab['[mask]']]*len(turn) if id%2==0 else turn for id,turn in enumerate(input_id[1:-1])]
    label.insert(0,[tokenizer.vocab['[mask]']])
    label.append([tokenizer.vocab['<|endoftext|>']])
    return label
    
input_ids=list(map(input_ids_map,tqdm(dialogues)))
token_type_ids=list(map(token_type_ids_map,tqdm(input_ids)))
labels=list(map(labels_map,tqdm(input_ids)))

input_ids=[list(chain(*input_id)) for input_id in input_ids]
token_type_ids=[list(chain(*token_type_id)) for token_type_id in token_type_ids]
labels=[list(chain(*label)) for label in labels]



train=int(np.round(len(input_ids)*(0.8)))
val=int(np.round(len(input_ids)*(0.1)))+train


train_dataset = Dataset.from_dict({"input_ids":input_ids[:train],
                        'token_type_ids':token_type_ids[:train],
                        'labels':labels[:train]})

validation_dataset = Dataset.from_dict({"input_ids":input_ids[train:val],
                               'token_type_ids':token_type_ids[train:val],
                               'labels':labels[train:val]})

test_dataset = Dataset.from_dict({"input_ids":input_ids[val:],
                       'token_type_ids':token_type_ids[val:],
                       'labels':labels[val:]})

chat_dataset_whatsapp = DatasetDict({'train': train_dataset,
                        'validation': validation_dataset,
                        'test': test_dataset})

100%|██████████| 93/93 [00:00<00:00, 523.75it/s]
100%|██████████| 93/93 [00:44<00:00,  2.08it/s]
100%|██████████| 93/93 [00:25<00:00,  3.65it/s]


In [None]:
# daily dialogue data from hugging face 
dialog=load_dataset('daily_dialog')

# adding_special_torkens
bos_id = tokenizer.vocab['<|startoftext|>']
eos_id = tokenizer.vocab['<|endoftext|>']
speaker_1_id = tokenizer.vocab['<|speaker-1|>']
speaker_2_id = tokenizer.vocab['<|speaker-2|>']
mask = tokenizer.vocab['[mask]']


# preprocessing

def speaker_dialogs(data):
    token_ids = [] 
    for dialogue in tqdm(data):
        dialogue_ids = []
        for ids, utterance in enumerate(dialogue):
            tokens = tokenizer.tokenize(utterance)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            dialogue_ids.append(ids)
        token_ids.append(dialogue_ids)

    dialogues_with_speaker_ids = []
    for dialogue in tqdm(token_ids):
        utterances_with_speaker_ids = []
        for i, utterance in enumerate(dialogue):
            if i % 2 == 0:
                utterances_with_speaker_ids.append([speaker_1_id] + utterance)
            else:
                utterances_with_speaker_ids.append([speaker_2_id] + utterance)
        dialogues_with_speaker_ids.append(utterances_with_speaker_ids)
    return dialogues_with_speaker_ids



def all_id(all_text):
    input_ids=[]
    for conv in all_text:
        conv_id=[]
        for ids,turn in enumerate(conv):
            if ids==0:
                conv_id.append([bos_id]+turn)
            elif ids==len(conv)-1:
                conv_id.append(turn+[eos_id])
            else:
                conv_id.append(turn)
                
        input_ids.append(conv_id)
                
    
    token_type_ids=[]
    for conv in input_ids:
        conv_id=[]
        for ids,turn in enumerate(conv):
            if ids%2==0:
                conv_id.append([speaker_1_id for _ in range(len(turn))])
            else:
                conv_id.append([speaker_2_id for _ in range(len(turn))])
        token_type_ids.append(conv_id)
    
    labels=[]
    for conv in input_ids:
        conv_id=[]
        for ids,turn in enumerate(conv):
            if ids%2==0:
                conv_id.append([mask for _ in range(len(turn))])
            else:
                conv_id.append(turn)
                
        labels.append(conv_id)
    return input_ids,token_type_ids,labels


def text_to_tokens(data):
    dialogs_with_sp=speaker_dialogs(data)
    input_ids,token_type_ids,labels=all_id(dialogs_with_sp)                    
    
    input_ids=[list(chain(*input_ids[i])) for i in range(len(input_ids))]
    token_type_ids=[list(chain(*token_type_ids[i])) for i in range(len(token_type_ids))]
    labels=[list(chain(*labels[i])) for i in range(len(labels))]
    
    return input_ids,token_type_ids,labels

    
train_data=dialog['train']['dialog']
val_data=dialog['validation']['dialog']
test_data=dialog['test']['dialog']

train_input_ids,train_token_type_ids,train_labels=text_to_tokens(train_data) 
val_input_ids,val_token_type_ids,val_labels=text_to_tokens(val_data) 
test_input_ids,test_token_type_ids,test_labels=text_to_tokens(test_data) 

# preparing data to datasetdict format

train_dataset = Dataset.from_dict({"input_ids":train_input_ids,
                        'token_type_ids':train_token_type_ids,
                        'labels':train_labels})

validation_dataset = Dataset.from_dict({"input_ids":val_input_ids,
                               'token_type_ids':val_token_type_ids,
                               'labels':val_labels})

test_dataset = Dataset.from_dict({"input_ids":test_input_ids,
                       'token_type_ids':test_token_type_ids,
                       'labels':test_labels})

chat_dataset_daily_dialogue = DatasetDict({'train': train_dataset,
                        'validation': validation_dataset,
                        'test': test_dataset})

## concat

In [None]:
train=int(np.round(len(input_ids)*(0.8)))
val=int(np.round(len(input_ids)*(0.1)))+train

import pickle
emp_train=pickle.load(open("/kaggle/input/empathetic-data/emp_train.p",'rb'))
emp_val=pickle.load(open("/kaggle/input/empathetic-data/emp_val.p",'rb'))
emp_test=pickle.load(open("/kaggle/input/empathetic-data/emp_test.p",'rb'))



emp_train_dataset = Dataset.from_dict({"input_ids":emp_train['input_ids'][:train],
                        'token_type_ids':emp_train['token_type_ids'][:train],
                        'labels':emp_train['labels'][:train]})

emp_validation_dataset = Dataset.from_dict({"input_ids": emp_val['input_ids'][train:val],
                               'token_type_ids':emp_val['token_type_ids'][train:val],
                               'labels':emp_val['labels'][train:val]})

emp_test_dataset = Dataset.from_dict({"input_ids":emp_test['input_ids'][val:],
                       'token_type_ids':emp_test['token_type_ids'][va:],
                       'labels':emp_test["labels"][val:]})

emp_dataset = DatasetDict({'train': emp_train_dataset,
                        'validation': emp_validation_dataset,
                        'test': emp_test_dataset
                          })

# chat_dataset_whatsapp
# chat_dataset_daily_dialogue
# emp_dataset

from datasets import concatenate_datasets
all_data=concatenate_datasets([chat_dataset_whatsapp["train"], chat_dataset_whatsapp["validation"], chat_dataset_whatsapp["test"],
                               chat_dataset_daily_dialogue["train"], chat_dataset_daily_dialogue["validation"], chat_dataset_daily_dialogue["test"],
                              emp_dataset["train"], emp_dataset["validation"], emp_dataset["test"]
                              ])

all_data_shuffled=all_data.shuffle(seed=5)
all_chat_dataset = all_data.train_test_split(test_size=0.1)

In [None]:
# collator function to normatize the input_size
class MyCollator:
    def __init__(self, pad_token_id, max_length):
        self.pad_token_id = pad_token_id
        self.max_length = max_length

    def __call__(self, batch):

        input_ids_batch = [torch.tensor(item['input_ids'],dtype=torch.long) for item in batch]
        token_type_ids_batch = [torch.tensor(item['token_type_ids'],dtype=torch.long) for item in batch]
        labels_batch = [torch.tensor(item['labels'],dtype=torch.long) for item in batch]


        input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids_batch, batch_first=True, padding_value=self.pad_token_id)
        input_ids_padded = input_ids_padded[:, :self.max_length]  


        token_type_ids_padded = torch.nn.utils.rnn.pad_sequence(token_type_ids_batch, batch_first=True, padding_value=self.pad_token_id)
        token_type_ids_padded = token_type_ids_padded[:, :self.max_length]  


        labels_padded = torch.nn.utils.rnn.pad_sequence(labels_batch, batch_first=True, padding_value=self.pad_token_id)
        labels_padded = labels_padded[:, :self.max_length]  

        return {
            'input_ids': input_ids_padded,
            'token_type_ids': token_type_ids_padded,
            'labels': labels_padded
        }


# Train 

In [None]:
general_training_args_concat={'output_dir':'/kaggle/working/model',
                        'overwrite_output_dir':True, 
                        'push_to_hub':True,
                                
                        'num_train_epochs':10,
                        'warmup_steps':10,
                        'weight_decay':0.1, 
                        'logging_steps':10,
                        'evaluation_strategy':'steps',
                        'eval_steps':0.1,
                        'save_steps':0.1,
                       
                        'optim':'adamw_torch', 
                        'save_strategy':'steps', 
                        'save_total_limit':1, 
                        'logging_dir':'/kagle/working/logs' ,
                        'report_to':'wandb',  

                        'per_device_train_batch_size':1, 
                        'per_device_eval_batch_size':1,
                        'gradient_accumulation_steps':8}

collator=MyCollator(tokenizer.vocab['[pad]'],tokenizer.model_max_length)
gpt_neo_concat=TrainingArguments(push_to_hub_model_id="gpt_neo_extended_retrain",** general_training_args_concat)
neo_concat_trainer=Trainer(
    model=gpt_neo_125,
    args=gpt_neo_concat,
    tokenizer=tokenizer,
    data_collator= collator,
    train_dataset=all_chat_dataset['train'],
    eval_dataset =all_chat_dataset['test'])

# trainins and pushes model to hf hub
neo_concat_trainer.train()

In [None]:
# chat configuration
class InfConfig:
    top_p = 0.8
    max_length = 8
    min_length = 2
    max_turns = 10
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Bot

In [None]:
bos_id = tokenizer.vocab['<|startoftext|>']
eos_id = tokenizer.vocab['<|endoftext|>']
speaker_1_id = tokenizer.vocab['<|speaker-1|>']
speaker_2_id = tokenizer.vocab['<|speaker-2|>']
mask = tokenizer.vocab['[mask]']

def chat(model):
    query_history = []
    while True:
        utterance = input('You: ')
        if utterance == "close_chat":
            break
        
        input_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(utterance))
        input_ids = [speaker_1_id] + input_ids
        query_history.append(input_ids)
        
        if len(query_history) > InfConfig.max_turns:
            num_exceeded = len(query_history) -  InfConfig.max_turns + 1
            query_history = query_history[num_exceeded:]
            

        input_ids = [bos_id] + list(chain.from_iterable(query_history)) + [speaker_2_id]
        start_sp_id = query_history[0][0]
        next_sp_id = speaker_1_id if start_sp_id == speaker_2_id else speaker_2_id
        token_type_ids = [[start_sp_id] * len(turn) if h % 2 == 0 else [next_sp_id] * len(turn) for h, turn in enumerate(query_history)]
        token_type_ids = [start_sp_id] + list(chain.from_iterable(token_type_ids)) + [speaker_2_id]
        input_len = len(input_ids)
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(InfConfig.device)
        token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(InfConfig.device)
        
        output_ids = model.generate(input_ids=input_ids, 
                                    token_type_ids=token_type_ids, 
                                    pad_token_id=tokenizer.vocab["[pad]"], 
                                    do_sample=True, 
#                                     top_p=InfConfig.top_p, 
                                    max_new_tokens=InfConfig.max_length, 
                                    min_new_tokens=InfConfig.min_length,
                                    output_hidden_states=True, 
                                    output_scores=True, 
                                    return_dict_in_generate=True).sequences
        
        output_ids = output_ids[0].tolist()[input_len:]
        response = tokenizer.decode(output_ids, skip_special_tokens=True)
        response.replace("<|speaker-1|>",'').replace("<|speaker-2|>",'')
        print(f'Bot: {response}')
        query_history.append([speaker_2_id] + tokenizer.encode(response))    

inf_checkpoint="theothertom/gpt_neo_extended_retrain"
model=AutoModelForCausalLM.from_pretrained(inf_checkpoint).to(InfConfig.device)
model=neo_concat_trainer.model
chat(model)

In [None]:
# save model manually if needed
model=neo_concat_trainer.model
model.save_pretrained("model.bin")

In [None]:
# utility dunctions for evaluation
def unk(input_id,token):
    store=[]
    for i,j in enumerate(input_id):
        if j==token:
            store.append(i)
    return store

def map_sp(input_ids):
    speaker_1=[]
    speaker_2=[]
    for input_id in tqdm(input_ids,desc="speaker_index_extracting"):
        speaker_1.append(unk(input_id,tokenizer.vocab['<|speaker-1|>']))
        speaker_2.append(unk(input_id,tokenizer.vocab['<|speaker-2|>']))
    return speaker_1,speaker_2
        

input_ids=chat_dataset_daily_dialogue['test']['input_ids']
sp_1,sp_2=map_sp(input_ids)

## evaluate

In [None]:
rouge_metric = load_metric("rouge")
input_ids=chat_dataset_daily_dialogue['test']['input_ids']
token_type_ids=chat_dataset_daily_dialogue['test']['token_type_ids']
labels=chat_dataset_daily_dialogue['test']['labels']
device='cpu'

def get_metrics(input_ids,token_type_ids,model_name,device,metric_name):
    """
    metric_name:
    "rouge"-for rouge_metric
    "bleau"-for bleau_metric
    
    """
    eval_metric = load_metric(metric_name)
    model=AutoModelForCausalLM.from_pretrained(model_name).to(device)
    for ids,input_id in tqdm(enumerate(input_ids[:3]),desc='running',total=len(input_ids)):
        sp2=sp_2[ids]
        sp1=sp_1[ids][:len(sp2)]
        sp1.append(-1)
        label_batch=[]
        pred_batch=[]
        for step in range(len(sp2)):
            current_input=torch.LongTensor(input_id[:sp2[step]+1]).unsqueeze(0).to(device)
            current_token_type_id=torch.LongTensor(token_type_ids[ids][:sp2[step]+1]).unsqueeze(0).to(device)
            label_batch.append([tokenizer.decode(input_id[sp2[step]+1:sp1[step+1]])])

            with torch.no_grad():
                output= model.generate(input_ids=current_input,
                                        token_type_ids=current_token_type_id,
                                        pad_token_id=eos_id, 
                                        do_sample=True,
                                        top_p=InfConfig.top_p, 
                                        max_new_tokens=InfConfig.max_length, 
                                        min_new_tokens=InfConfig.min_length,
                                        output_hidden_states=True, 
                                        output_scores=True, 
                                        return_dict_in_generate=True).sequences
                
            output=output[0][len(current_input[0]):]
            output=tokenizer.decode(output,skip_special_tokens=True)
            output=output.replace('<|speaker-1|>','').replace('<|speaker-2|>','')
            pred_batch.append([output])    
        eval_metric.add_batch(predictions=pred_batch,references=label_batch)

    if metric_name=="rouge":
        score=eval_metric.compute()
        score={i:score[i].mid.fmeasure for i in score.keys()}
        return pd.DataFrame.from_dict(score, orient="index", columns=["Value"])
    elif  metric_name=='sacrebleu':
        score=eval_metric.compute(smooth_method="floor", smooth_value=0)
        score["precisions"] = [np.round(p, 2) for p in score["precisions"]]
        return pd.DataFrame.from_dict(score, orient="index", columns=["Value"])
            
            
rouge_score_neo_daily=get_metrics(input_ids,token_type_ids,"theothertom/gpt_neo-daily_dialog",'cpu','rouge')                    
bleu_score_neo_daily=get_metrics(input_ids,token_type_ids,"theothertom/gpt_neo-daily_dialog",'cpu','sacrebleu')                    