In [None]:
!pip install tensorflow==2.10.0 #tensorflow 2.10.0 <-- 2.12.0 (버전 다운그레이드 필요)

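# How many epochs are needed to get the best result from the DialoGPT-large model
# when fine-tuning it on a new dataset of one million words?
# NOTE(review): the chatbot answer quoted below is unverified; confirm the epoch count
# against a held-out validation loss before relying on it.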
'''
[ChatGPT & BingChat] Based on the experiences shared in the GitHub repository of the Hugging-Face team,
fine-tuning the Microsoft DialoGPT-large model with one million words requires at least several hundred epochs,
typically around 2000 epochs using a batch size of 2, because the model has a large number of parameters (774 million),
or until the validation loss plateaus. For example:

train_dataset = ... # create or load your training dataset
batch_size = 2
model.compile(loss=model.compute_loss, optimizer="adam")
model.fit(train_dataset.batch(batch_size), 
          validation_data=train_dataset.batch(batch_size),
          batch_size=batch_size, 
          epochs=2000)
'''

In [None]:
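# microsoft/DialoGPT-large (checkpoint size ≈ 3 GB). Its byte-level BPE tokenizer can encode
# text in any language, but the model itself was pre-trained on English Reddit conversations.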

# Print a label followed by the current local date and time in the format:
# "YYYY-MM-DDTHH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    """Print ``text`` followed by the current local timestamp.

    The printed line has the form ``<text> @ CDT(<YYYY-MM-DDTHH:MM:SS.ffffff>)``.
    The ``CDT`` label is a fixed string; the timestamp itself comes from
    ``datetime.datetime.now()`` and carries no timezone information.

    Args:
        text: Label printed in front of the timestamp (defaults to an empty string).
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"
    line_template = "{} @ CDT({})"
    right_now = datetime.datetime.now()
    print(line_template.format(text, right_now.strftime(timestamp_format)))

from transformers import AutoTokenizer, TFAutoModelForCausalLM
import tensorflow as tf
from timeit import default_timer

# --- Configuration -----------------------------------------------------------
model_path = "DialoGPT-large-finetuned-by-Microsoft-Keras"
batch_size = 2
epochs = 2000                 # upper bound only; early stopping normally ends training sooner
block_size = 512              # tokens per training example; must fit the model's context window
validation_fraction = 0.1     # share of the blocks held out for validation
early_stopping_patience = 3   # epochs without val_loss improvement before stopping

print_current_datetime()

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

# GPT-2 style models only have `n_positions` position embeddings, so a training
# example may never be longer than that.
if block_size > model.config.n_positions:
    raise ValueError(
        "block_size ({}) exceeds the model context window ({})".format(
            block_size, model.config.n_positions))

# Load the text data
with open('한글성경(마침표제거)_정제후말뭉치_백일만단어.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print_current_datetime("Tokenizing the text")

# Calling the tokenizer (rather than tokenizer.encode) returns a mapping with an
# 'input_ids' entry; encode() returns a bare tensor that cannot be indexed by key.
text_tokenized = tokenizer(text, return_tensors='tf')
token_ids = tf.reshape(text_tokenized['input_ids'], [-1])

print_current_datetime("Preparing the dataset")

# Cut the single long token stream into fixed-length blocks; the trailing
# partial block is dropped so that every example has the same shape.
num_blocks = int(token_ids.shape[0]) // block_size
if num_blocks == 0:
    raise ValueError(
        "The corpus is shorter than one block of {} tokens".format(block_size))
token_blocks = tf.reshape(token_ids[:num_blocks * block_size],
                          (num_blocks, block_size))

# Hold out the last blocks for validation so val_loss measures generalisation
# instead of re-scoring the training data.
num_val_blocks = max(1, int(num_blocks * validation_fraction)) if num_blocks > 1 else 0
num_train_blocks = num_blocks - num_val_blocks
train_blocks = token_blocks[:num_train_blocks]
val_blocks = token_blocks[num_train_blocks:]

# For causal language modelling the labels are the inputs themselves; the model
# shifts them internally when computing the loss.
dataset = tf.data.Dataset.from_tensor_slices(
    {"input_ids": train_blocks, "labels": train_blocks})
dataset = dataset.shuffle(min(num_train_blocks, 10000)).batch(batch_size)

if num_val_blocks:
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        {"input_ids": val_blocks, "labels": val_blocks}).batch(batch_size)
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=early_stopping_patience,
                                                  restore_best_weights=True)]
else:
    validation_dataset = None
    callbacks = []

# No `loss` argument: the Transformers Keras model then uses its built-in
# language-modelling loss computed from the "labels" entry of each batch.
# NOTE(review): this relies on a Transformers release that supports the default
# internal loss; older releases need `loss=model.compute_loss` — confirm the version.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))

# The datasets are already batched, so `batch_size` must not be passed to fit().
model.fit(dataset,
          validation_data=validation_dataset,
          epochs=epochs,
          callbacks=callbacks)

print_current_datetime("Saving Fine-tuned Microsoft DialoGPT Model")

tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)

In [None]:
# Print a label followed by the current local date and time in the format:
# "YYYY-MM-DDTHH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    """Print ``text`` followed by the current local timestamp.

    The printed line has the form ``<text> @ CDT(<YYYY-MM-DDTHH:MM:SS.ffffff>)``.
    The ``CDT`` label is a fixed string; the timestamp itself comes from
    ``datetime.datetime.now()`` and carries no timezone information.

    Args:
        text: Label printed in front of the timestamp (defaults to an empty string).
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"
    line_template = "{} @ CDT({})"
    right_now = datetime.datetime.now()
    print(line_template.format(text, right_now.strftime(timestamp_format)))

from transformers import AutoTokenizer, TFAutoModelForCausalLM
from timeit import default_timer

# Directory the tokenizer and model are loaded from.
model_path = "DialoGPT-large-finetuned-by-Microsoft-Keras"

# Log a timestamp before and after loading so the load time can be read off the output.
print_current_datetime("Loading Fine-tuned Microsoft DialoGPT Model")

# Tokenizer first, then the model weights (tuple elements are evaluated left to right).
tokenizer_loaded, model_loaded = (
    AutoTokenizer.from_pretrained(model_path),
    TFAutoModelForCausalLM.from_pretrained(model_path),
)

print_current_datetime()

def generate_ouput(prompt=""):
    """Sample five continuations of ``prompt``, print them all and return the first.

    Uses the module-level ``tokenizer_loaded`` and ``model_loaded``. Every
    sampled sequence is printed as ``Response <n>: ...`` together with the
    elapsed wall-clock time.

    Args:
        prompt: Text the model should continue. An empty prompt yields no
            tokens to continue from, so it short-circuits to ``""``.

    Returns:
        The decoded text of the first sampled sequence (it still starts with
        the prompt tokens that were fed to the model), or ``""`` for an empty
        prompt.
    """
    if not prompt:
        return ""

    start = default_timer()

    # GPT-2 style models have a fixed number of position embeddings; asking
    # generate() for more tokens than that indexes past the embedding table.
    context_limit = getattr(model_loaded.config, "n_positions", 1024)

    input_ids = tokenizer_loaded.encode(prompt, return_tensors="tf")
    if input_ids.shape[1] >= context_limit:
        # Keep only the most recent tokens so there is room left to generate.
        input_ids = input_ids[:, -(context_limit // 2):]

    # max_length counts prompt + generated tokens, so cap it at the context window.
    output_ids = model_loaded.generate(input_ids=input_ids,
                                       max_length=min(1024 + input_ids.shape[1], context_limit),
                                       temperature=0.7,
                                       top_p=0.9,
                                       do_sample=True,
                                       num_return_sequences=5, # The model will generate five different responses to the prompt.
                                       pad_token_id=tokenizer_loaded.eos_token_id)

    # Decode every returned sequence once; the first one is the return value.
    responses = [tokenizer_loaded.decode(sequence, skip_special_tokens=True)
                 for sequence in output_ids]

    end = default_timer()

    for number, response in enumerate(responses, start=1):
        print(f'Response {number}: {response}')

    print("Time duration(in seconds):", end - start)
    return responses[0]

# Let's chat for up to 10 turns (type "bye" to stop early)

def _extract_reply(generated_text, prompt):
    """Strip the echoed prompt and any unfinished trailing sentence from a reply.

    Args:
        generated_text: Full decoded model output, usually starting with the prompt.
        prompt: The text the user typed.

    Returns:
        The text after the first occurrence of ``prompt`` (or the whole text if
        the prompt is not found), cut after its last period. Text without any
        period is returned unchanged instead of being erased.
    """
    if prompt:
        # partition() splits on the FIRST occurrence only, so a reply that
        # repeats the prompt later on is not truncated at the repetition.
        _, separator, remainder = generated_text.partition(prompt)
        if separator:
            generated_text = remainder
    # Trim the sentences after the last period(.)
    last_period = generated_text.rfind('.')
    if last_period != -1:
        generated_text = generated_text[:last_period + 1]
    return generated_text


for step in range(10):
    prompt = input(">> User:")
    if prompt.strip().lower() == "bye": break
    if not prompt.strip():
        # Nothing to answer; an empty prompt cannot be tokenized into a usable input.
        continue

    generated_text = _extract_reply(generate_ouput(prompt), prompt)

    print(">> GPT: {}".format( generated_text ))
    print_current_datetime()