# Model Fine-tuning

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFGPT2LMHeadModel
import pandas as pd
import tqdm
import urllib.request
import numpy as np

# Specify your filename path here
filename = 'YOUR_FILE_PATH.xlsx'

# Load your dataset
df = pd.read_excel(filename)

# Each row holds a question plus up to three candidate answers in the columns
# at positions 1..3. The range is clipped so sheets with fewer answer columns
# do not raise an IndexError.
answer_positions = range(1, min(4, df.shape[1]))

# Collect the (question, answer) pairs in a plain list and build the frame
# once: calling pd.concat inside the loop re-copies the whole frame on every
# row, which is quadratic in the number of pairs.
pairs = []
for row_pos, question in enumerate(df["Question"]):
    if pd.isna(question):
        continue
    for col_pos in answer_positions:
        answer = df.iat[row_pos, col_pos]
        if not pd.isna(answer):
            pairs.append({"Question": question, "Answer": answer})

train_data = pd.DataFrame(pairs, columns=['Question', 'Answer'])

train_data = train_data.dropna()
train_data.head()

In [None]:
# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small', padding_side="left")

# Make sure eos, pad and sep tokens are all defined.
# The eos token is resolved FIRST because the pad and sep fallbacks below copy
# it; checking it last would let them copy a missing value. The fallback is
# the GPT-2 family end-of-text marker rather than an empty string, since an
# empty eos token would mean no terminator is ever appended to training text.
if not tokenizer.eos_token:
    tokenizer.eos_token = '<|endoftext|>'
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token
if not tokenizer.sep_token:
    tokenizer.sep_token = tokenizer.eos_token

# Load the pre-trained model
model = TFGPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

# Ensure that the model and tokenizer have the same number of tokens
model.resize_token_embeddings(len(tokenizer))

# Number of sequences per training batch
batch_size = 8

def get_chat_data():
    """Yield one list of token ids per (question, answer) row of ``train_data``.

    Each sample is the text ``question + sep_token + answer + eos_token``
    encoded with the module-level ``tokenizer``. Rows where either field is
    ``None`` are skipped.
    """
    questions = train_data.Question.to_list()
    answers = train_data.Answer.to_list()
    for question, answer in zip(questions, answers):
        if question is None or answer is None:
            continue
        text = question + tokenizer.sep_token + answer + tokenizer.eos_token
        yield tokenizer.encode(text, add_special_tokens=True)

# Stream the encoded samples from the generator and group them into batches,
# padding every sequence in a batch to the longest one with the pad token id.
dataset = tf.data.Dataset.from_generator(
    get_chat_data,
    output_types=tf.int32,
).padded_batch(
    batch_size=batch_size,
    padded_shapes=(None,),
    padding_values=tokenizer.pad_token_id,
)

In [None]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
# Ceiling division: padded_batch keeps the final partial batch, so this is the
# number of batches per epoch. (`len // batch_size + 1` over-counts by one
# whenever the row count is an exact multiple of batch_size.)
steps = -(-len(train_data) // batch_size)
EPOCHS = 300

# Early-stopping state: best epoch loss seen so far and the number of
# consecutive epochs without improvement.
min_epoch_loss = np.inf
patience = 10
wait = 0

for epoch in range(EPOCHS):
    loss_sum = 0.0
    batches_seen = 0

    for batch in tqdm.tqdm(dataset, total=steps):
        with tf.GradientTape() as tape:
            # The batch doubles as its own labels (language-model objective).
            # NOTE(review): padded positions are included in the labels, so
            # they presumably contribute to the loss -- consider masking them.
            result = model(batch, labels=batch)
            loss = result[0]
            batch_loss = tf.reduce_mean(loss)

        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))
        loss_sum += float(batch_loss)
        batches_seen += 1

    # Average over the batches that were actually processed, so the reported
    # loss does not depend on the `steps` estimate.
    epoch_loss = loss_sum / max(batches_seen, 1)

    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

    if epoch_loss < min_epoch_loss:
        min_epoch_loss = epoch_loss
        wait = 0

        # Checkpoint only when the epoch loss improves
        tokenizer.save_pretrained("model_checkpoint/dialoGPT_ENG")
        model.save_pretrained("model_checkpoint/dialoGPT_ENG")
        print("Saved model & tokenizer at epoch", epoch + 1)
    else:
        wait += 1

    if wait >= patience:
        print("Early stopping at epoch", epoch + 1)
        break

In [None]:
def return_answer_by_chatbot(user_text):
    """Generate a sampled reply to ``user_text`` with the fine-tuned model.

    The prompt is ``user_text`` followed by the EOS token, matching the
    question/answer separator used when the training samples were built.

    Args:
        user_text: The user's message.

    Returns:
        The generated reply with special tokens removed and surrounding
        whitespace stripped.
    """
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='tf', add_special_tokens=True)

    # A single unpadded prompt needs an all-ones mask. Deriving the mask from
    # `input_ids != pad_token_id` is wrong here: the pad token may be the same
    # as the EOS token, which would hide the trailing EOS separator.
    attention_mask = tf.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=100,
        do_sample=True,
        top_p=0.9,
        top_k=20,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id
        attention_mask=attention_mask,
    )

    # The output starts with the prompt tokens. Drop them by token count
    # rather than by character count, because decoded text is not guaranteed
    # to reproduce `user_text` character for character.
    prompt_length = input_ids.shape[-1]
    reply_ids = output[0][prompt_length:].numpy().tolist()
    chatbot_response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return chatbot_response.strip()

In [None]:
# Quick sanity check: ask the fine-tuned model for a reply to a sample prompt
return_answer_by_chatbot('I am hungry')

# Convert Model to ONNX Format

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFGPT2LMHeadModel
import pandas as pd
import tqdm
import urllib.request
import numpy as np
import onnxruntime

# Raise the default ONNX Runtime log threshold to severity level 3
# (presumably "error" -- confirm against the onnxruntime docs).
onnxruntime.set_default_logger_severity(3)

# Load the exported decoder graph and the tokenizer stored next to it
onnx_session = onnxruntime.InferenceSession('onnx_model/dialoGPT_ENG/decoder_model.onnx')
tokenizer = AutoTokenizer.from_pretrained('onnx_model/dialoGPT_ENG')

# List the names of the outputs the graph exposes
output_names = []
for output in onnx_session.get_outputs():
    output_names.append(output.name)
print("Output names:", output_names)

### (1) Greedy Search Method

In [None]:
def greedy_decode(input_ids, attention_mask, max_length=50):
    """Extend ``input_ids`` one token at a time, always taking the arg-max token.

    Every step feeds the whole sequence to the module-level ``onnx_session``
    and reads the logits at the last position. Decoding stops when the EOS
    token is predicted (it is not appended) or when the sequence reaches
    ``max_length`` tokens.

    Args:
        input_ids: Array of prompt token ids; axis 1 is the sequence axis.
        attention_mask: Mask array matching ``input_ids``.
        max_length: Upper bound on the total sequence length, prompt included.

    Returns:
        ``input_ids`` with the generated tokens concatenated along axis 1.
    """
    for _ in range(input_ids.shape[1], max_length):
        step_outputs = onnx_session.run(
            output_names=['logits'],
            input_feed={"input_ids": input_ids, "attention_mask": attention_mask},
        )
        next_token = np.argmax(step_outputs[0], axis=-1)[0, -1]

        # Stop before appending once the model predicts end-of-sequence
        if next_token == tokenizer.eos_token_id:
            break

        # Grow the sequence and its mask by one position
        input_ids = np.concatenate((input_ids, np.array([[next_token]])), axis=1)
        attention_mask = np.concatenate((attention_mask, np.array([[1]])), axis=1)

    return input_ids

In [None]:
import time

# Interactive chat loop using greedy decoding (interrupt the kernel to stop)
while True:
    user_text = input("Question: ")
    start_time = time.time()

    # Note that we use the user text directly, not wrapped in <usr> or <sys> tags,
    # as GPT2 doesn't use them. The eos_token is used to mark the end of the question.
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='np').astype(np.int64)

    # Create attention mask
    attention_mask = np.ones_like(input_ids, dtype=np.int64)

    decoded_input_ids = greedy_decode(input_ids, attention_mask)

    # Keep only the tokens generated after the prompt. Slicing by token count
    # is exact, whereas slicing the decoded string by len(user_text) breaks
    # whenever decoding does not reproduce the prompt character for character.
    prompt_length = input_ids.shape[1]
    decoded_text = tokenizer.decode(decoded_input_ids[0][prompt_length:], skip_special_tokens=True)

    print(f"Answer: {decoded_text}")

    finish_time = time.time() - start_time
    print(f"runtime: {finish_time}")
    print()

### (2) Top-K Sampling Method

In this method, only the k most probable next tokens are kept and the probability mass is redistributed among them. This implies that the number of candidate tokens is always the same, regardless of how the probabilities are distributed. For instance, even if a single token has a 95% probability of being the next token, top-k will still consider 99 other possibilities (if k is set to 100), making the sampling process more random.

In [None]:
def softmax(logits):
    """Return the softmax of ``logits`` taken over the last axis.

    The maximum along the last axis is subtracted before exponentiating so
    large logits cannot overflow. Normalising over the last axis gives the
    same result as before for 1-D input and additionally makes every row of
    a batched (2-D or higher) input sum to one, matching the other softmax
    helper in this notebook.

    Args:
        logits: Array-like of scores; the last axis indexes the classes.

    Returns:
        Array of the same shape whose entries along the last axis sum to 1.
    """
    logits = np.asarray(logits)
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

def top_k_sampling(logits, k=10):
    """Sample a token id from the ``k`` highest-scoring entries of ``logits``.

    Entries below the k-th largest logit get zero probability (ties with the
    k-th largest are kept); the rest are renormalised with a softmax and one
    index is drawn with ``np.random.choice``.

    The input array is left untouched: filtering happens on a float64 copy.
    The copy also keeps the probabilities summing to one within
    ``np.random.choice``'s tolerance.

    Args:
        logits: 1-D array of unnormalised scores, one per token id.
        k: Number of top candidates to keep. Values ``<= 0`` or larger than
            ``len(logits)`` disable the filtering instead of raising.

    Returns:
        The sampled token id (an index into ``logits``).
    """
    scores = np.array(logits, dtype=np.float64)  # copy; never mutate the caller's array

    if 0 < k < scores.size:
        threshold = np.sort(scores)[-k]
        scores[scores < threshold] = -np.inf

    # Numerically stable softmax over the surviving candidates
    weights = np.exp(scores - scores.max())
    probs = weights / weights.sum()

    token_id = np.random.choice(len(scores), size=1, p=probs)[0]
    return token_id

def top_k_sampling_decode(input_ids, attention_mask, max_length=50, k=10):
    """Extend ``input_ids`` token by token using top-k sampling.

    Every step feeds the whole sequence to the module-level ``onnx_session``,
    takes the logits at the last position and draws the next token with
    ``top_k_sampling``. Decoding stops when the EOS token is drawn (it is not
    appended) or when the sequence reaches ``max_length`` tokens.

    Args:
        input_ids: Array of prompt token ids; axis 1 is the sequence axis.
        attention_mask: Mask array matching ``input_ids``.
        max_length: Upper bound on the total sequence length, prompt included.
        k: Number of candidates passed to ``top_k_sampling``.

    Returns:
        ``input_ids`` (int64) with the sampled tokens concatenated along axis 1.
    """
    for _ in range(input_ids.shape[1], max_length):
        step_outputs = onnx_session.run(
            output_names=['logits'],
            input_feed={"input_ids": input_ids, "attention_mask": attention_mask},
        )
        last_logits = step_outputs[0][0, -1, :]

        next_token = top_k_sampling(last_logits, k=k)

        # Stop before appending once end-of-sequence is drawn
        if next_token == tokenizer.eos_token_id:
            break

        # Grow the sequence and its mask by one position, keeping int64 inputs
        extended_ids = np.concatenate((input_ids, np.array([[next_token]])), axis=1)
        extended_mask = np.concatenate((attention_mask, np.array([[1]])), axis=1)
        input_ids = extended_ids.astype(np.int64)
        attention_mask = extended_mask.astype(np.int64)

    return input_ids

In [None]:
import time

# Interactive chat loop using top-k sampling (interrupt the kernel to stop)
while True:
    user_text = input("Question: ")
    start_time = time.time()

    # Note that we use the user text directly, not wrapped in <usr> or <sys> tags,
    # as GPT2 doesn't use them. The eos_token is used to mark the end of the question.
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='np').astype(np.int64)

    # Create attention mask
    attention_mask = np.ones_like(input_ids, dtype=np.int64)

    decoded_input_ids = top_k_sampling_decode(input_ids, attention_mask)

    # Keep only the tokens generated after the prompt. Slicing by token count
    # is exact, whereas slicing the decoded string by len(user_text) breaks
    # whenever decoding does not reproduce the prompt character for character.
    prompt_length = input_ids.shape[1]
    decoded_text = tokenizer.decode(decoded_input_ids[0][prompt_length:], skip_special_tokens=True)

    print(f"Answer: {decoded_text}")

    finish_time = time.time() - start_time
    print(f"runtime: {finish_time}")
    print()

### (3) Top-P Sampling Method

Top-p (nucleus) sampling chooses the smallest set of tokens whose cumulative probability exceeds p. The selected tokens are then renormalized to sum to one. In contrast to top-k, the number of tokens considered for the next step can change dynamically based on the output of the model. For example, if the model is very certain about its next token (i.e., one token has a probability of 95% to be next), only that single token is kept whenever p is set below 0.95. So, top-p can be seen as a dynamic version of top-k.

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalised over the last axis.

    The global maximum of ``x`` is subtracted before exponentiating so large
    values cannot overflow.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=-1, keepdims=True)
    return exponentials / normaliser

def top_p_sampling(logits, p=0.9):
    """Sample a token id from the nucleus (top-p) of the distribution.

    Tokens are ranked by probability and kept up to and including the one
    whose cumulative probability first exceeds ``p``; the most probable token
    is always kept. The kept probabilities are renormalised and one index is
    drawn with ``np.random.choice``.

    The input array is left untouched: all work happens on float64 copies.
    The probabilities are computed once and masked, rather than writing
    ``-inf`` into the caller's logits and applying softmax a second time.

    Args:
        logits: 1-D array of unnormalised scores, one per token id.
        p: Cumulative-probability threshold of the nucleus.

    Returns:
        The sampled token id (an index into ``logits``).
    """
    scores = np.asarray(logits, dtype=np.float64)

    # Numerically stable softmax (np.exp allocates a new array, so the input
    # is never modified)
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()

    # Token ids from most to least probable, with their running total
    order = np.argsort(scores)[::-1]
    cumulative = np.cumsum(probs[order])

    # A token is dropped when the total *before* it already exceeds p; this
    # keeps the token that crosses the threshold and always keeps the first.
    drop = np.zeros(cumulative.shape, dtype=bool)
    drop[1:] = cumulative[:-1] > p

    probs[order[drop]] = 0.0
    probs /= probs.sum()

    token_id = np.random.choice(len(scores), size=1, p=probs)[0]

    return token_id

def top_p_sampling_decode(input_ids, attention_mask, max_length=50, p=0.9):
    """Extend ``input_ids`` token by token using top-p (nucleus) sampling.

    Every step feeds the whole sequence to the module-level ``onnx_session``,
    takes the logits at the last position and draws the next token with
    ``top_p_sampling``. Decoding stops when the EOS token is drawn (it is not
    appended) or when the sequence reaches ``max_length`` tokens.

    Args:
        input_ids: Array of prompt token ids; axis 1 is the sequence axis.
        attention_mask: Mask array matching ``input_ids``.
        max_length: Upper bound on the total sequence length, prompt included.
        p: Cumulative-probability threshold passed to ``top_p_sampling``.

    Returns:
        ``input_ids`` (int64) with the sampled tokens concatenated along axis 1.
    """
    for _ in range(input_ids.shape[1], max_length):
        step_outputs = onnx_session.run(
            output_names=['logits'],
            input_feed={"input_ids": input_ids, "attention_mask": attention_mask},
        )
        last_logits = step_outputs[0][0, -1, :]

        next_token = top_p_sampling(last_logits, p=p)

        # Stop before appending once end-of-sequence is drawn
        if next_token == tokenizer.eos_token_id:
            break

        # Grow the sequence and its mask by one position, keeping int64 inputs
        extended_ids = np.concatenate((input_ids, np.array([[next_token]])), axis=1)
        extended_mask = np.concatenate((attention_mask, np.array([[1]])), axis=1)
        input_ids = extended_ids.astype(np.int64)
        attention_mask = extended_mask.astype(np.int64)

    return input_ids

In [None]:
import time

# Interactive chat loop using top-p sampling (interrupt the kernel to stop)
while True:
    user_text = input("Question: ")
    start_time = time.time()

    # Note that we use the user text directly, not wrapped in <usr> or <sys> tags,
    # as GPT2 doesn't use them. The eos_token is used to mark the end of the question.
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='np').astype(np.int64)

    # Create attention mask
    attention_mask = np.ones_like(input_ids, dtype=np.int64)

    decoded_input_ids = top_p_sampling_decode(input_ids, attention_mask)

    # Keep only the tokens generated after the prompt. Slicing by token count
    # is exact, whereas slicing the decoded string by len(user_text) breaks
    # whenever decoding does not reproduce the prompt character for character.
    prompt_length = input_ids.shape[1]
    decoded_text = tokenizer.decode(decoded_input_ids[0][prompt_length:], skip_special_tokens=True)

    print(f"Answer: {decoded_text}")

    finish_time = time.time() - start_time
    print(f"runtime: {finish_time}")
    print()

### (4) Beam Search

In [None]:
def beam_search(input_ids, attention_mask, beam_size=2, max_length=50):
    """Search for a high-probability continuation of the prompt.

    The search is best-first: the highest-scoring unfinished hypothesis is
    expanded with its ``beam_size`` most likely next tokens until
    ``beam_size`` hypotheses have finished. A hypothesis finishes when it
    emits the EOS token or reaches ``max_length`` tokens.

    Hypotheses are scored by the sum of token log-probabilities. Raw logits
    are not normalised, so summing them does not give scores that can be
    compared across steps or across hypotheses of different length.

    Args:
        input_ids: Array of prompt token ids with a leading batch axis; only
            the first row is used.
        attention_mask: Mask array matching ``input_ids``.
        beam_size: Number of candidate tokens expanded per step, and number
            of finished hypotheses to collect before stopping.
        max_length: Upper bound on the total sequence length, prompt included.

    Returns:
        The token ids (a list, prompt included) of the best finished
        hypothesis. It ends with the EOS token when EOS ended the hypothesis.

    Raises:
        ValueError: If ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1")

    prompt_ids = input_ids.tolist()[0]
    prompt_mask = attention_mask.tolist()[0]

    finished_beams = []
    running_beam = [(0.0, prompt_ids, prompt_mask)]

    while len(finished_beams) < beam_size and running_beam:
        # running_beam is kept sorted, so index 0 is the best open hypothesis
        beam_score, beam_ids, beam_mask = running_beam.pop(0)

        outputs = onnx_session.run(
            output_names=['logits'],
            input_feed={
                "input_ids": np.array([beam_ids], dtype=np.int64),
                "attention_mask": np.array([beam_mask], dtype=np.int64),
            },
        )
        logits = outputs[0][0, -1, :].astype(np.float64)

        # Log-softmax via the log-sum-exp trick
        peak = logits.max()
        log_probs = logits - (peak + np.log(np.exp(logits - peak).sum()))

        # Expand the most likely next tokens (never more than the vocabulary)
        width = min(beam_size, log_probs.shape[0])
        top_tokens = np.argsort(log_probs)[-width:]

        for token in top_tokens:
            token = int(token)
            candidate = (
                beam_score + float(log_probs[token]),
                beam_ids + [token],
                beam_mask + [1],
            )

            # `>=` rather than `==`: a prompt that is already at or beyond
            # max_length must still terminate on the length limit.
            if token == tokenizer.eos_token_id or len(candidate[1]) >= max_length:
                finished_beams.append(candidate)
            else:
                running_beam.append(candidate)

        # Sort the running beams by score, best first
        running_beam.sort(key=lambda beam: beam[0], reverse=True)

    # Return the highest scoring finished beam
    return max(finished_beams, key=lambda beam: beam[0])[1]

In [None]:
import time

# Interactive chat loop using beam search (interrupt the kernel to stop)
while True:
    user_text = input("Question: ")
    start_time = time.time()

    # Note that we use the user text directly, not wrapped in <usr> or <sys> tags,
    # as GPT2 doesn't use them. The eos_token is used to mark the end of the question.
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='np').astype(np.int64)

    # Create attention mask
    attention_mask = np.ones_like(input_ids, dtype=np.int64)

    decoded_input_ids = beam_search(input_ids, attention_mask)

    # Keep only the tokens generated after the prompt and let the tokenizer
    # drop a trailing EOS if there is one. Always cutting len(eos_token)
    # characters off the end removes real text when the beam stopped on the
    # length limit instead of on EOS.
    prompt_length = input_ids.shape[1]
    decoded_text = tokenizer.decode(decoded_input_ids[prompt_length:], skip_special_tokens=True)

    print(f"Answer: {decoded_text}")

    finish_time = time.time() - start_time
    print(f"runtime: {finish_time}")
    print()