In [None]:
!pip install tensorflow==2.10.0 #tensorflow 2.10.0 <-- 2.12.0 (버전 다운그레이드 필요)

# How many epochs needs to be interated to get the best result on DialoGPT-large model
# when finetuning with one million words of new dataset?
'''
[ChatGPT & BingChat] Based on the experiences shared in the GitHub repository of the Hugging-Face team,
fine-tuning the Microsoft DialoGPT-large model with one million words requires at least several hundred epochs,
typically around 2000 epochs using a batch size of 2, because the model has a large number of parameters (774 million),
or until the validation loss plateaus. For example:

train_dataset = ... # create or load your training dataset
batch_size = 2
model.compile(loss=model.compute_loss, optimizer="adam")
model.fit(train_dataset.batch(batch_size), 
          validation_data=train_dataset.batch(batch_size),
          batch_size=batch_size, 
          epochs=2000)
'''

In [9]:
# Microsoft/DialoGPT-large(file size = 3GB) supports multiple languages

# Print the current date and time in the format:
# "YYYY-MM-DD HH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

from transformers import AutoTokenizer, TFAutoModelForCausalLM
import tensorflow as tf
from timeit import default_timer

model_path = "DialoGPT-large-finetuned-by-Microsoft-Keras"
learning_rate = 1e-5
batch_size = 2
epochs = 2000

print_current_datetime()

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

# Load the text data
with open('영어성경(NKJV+NIV+NLT)_이백만단어.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Tokenize the text data
tokenized_text = tokenizer(text, return_tensors='tf')

print_current_datetime("Finetune the DialoGPT-large-by-Microsoft-Keras")

# Define the training function
@tf.function
def train_step(input_ids):
    # Truncate input sequence
    max_seq_length = 1024 # "1024" for DialoGPT-large
    input_ids = input_ids[:, :max_seq_length]
    with tf.GradientTape() as tape:
        outputs = model(input_ids, training=True)
        logits = outputs.logits[:, :-1, :]
        labels = input_ids[:, 1:]
        loss_value = loss(labels, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

# Create a TensorSliceDataset from the tokenized text
dataset = tf.data.Dataset.from_tensor_slices(tokenized_text['input_ids'])
dataset = dataset.batch(batch_size)

# Define the loss function and optimizer
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Fine-tune the model
for epoch in range(epochs):
    start = default_timer()
    print(f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(dataset):
        loss_value = train_step(batch)
        if step % 50 == 0:
            print(f'Step {step} Loss {loss_value}')
    end = default_timer()
    print("Time duration(in seconds):", end - start)

print_current_datetime("Saving Fine-tuned Microsoft DialoGPT Model")

tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)

 @ CDT(2023-04-26T18:14:25.248695)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (8581266 > 1024). Running this sequence through the model will result in indexing errors


Finetune the DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-26T18:14:49.050494)
Epoch 1/2000


ResourceExhaustedError: Graph execution error:

Detected at node 'tfgpt2lm_head_model_8/transformer/h_._33/attn/dropout_972/dropout/random_uniform/RandomUniform' defined at (most recent call last):
    File "c:\ProgramData\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\ProgramData\Anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "c:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "c:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "c:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\ProgramData\Anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "c:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Windows\Temp\ipykernel_11040\3663138582.py", line 61, in <cell line: 57>
      loss_value = train_step(batch)
    File "C:\Windows\Temp\ipykernel_11040\3663138582.py", line 40, in train_step
      outputs = model(input_ids, training=True)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 727, in call
      transformer_outputs = self.transformer(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 373, in call
      for i, (block, layer_past) in enumerate(zip(self.h, inputs["past"])):
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 377, in call
      outputs = block(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 199, in call
      output_attn = self.attn(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 160, in call
      attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\transformers\models\gpt2\modeling_tf_gpt2.py", line 120, in _attn
      w = self.attn_dropout(w, training=training)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\layers\regularization\dropout.py", line 116, in call
      output = control_flow_util.smart_cond(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\utils\control_flow_util.py", line 108, in smart_cond
      return tf.__internal__.smart_cond.smart_cond(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\layers\regularization\dropout.py", line 112, in dropped_inputs
      return self._random_generator.dropout(
    File "C:\Users\A\AppData\Roaming\Python\Python39\site-packages\keras\backend.py", line 2162, in dropout
      return tf.nn.dropout(
Node: 'tfgpt2lm_head_model_8/transformer/h_._33/attn/dropout_972/dropout/random_uniform/RandomUniform'
OOM when allocating tensor with shape[1,20,1024,1024] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node tfgpt2lm_head_model_8/transformer/h_._33/attn/dropout_972/dropout/random_uniform/RandomUniform}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_step_202259]

In [None]:
# Print the current date and time in the format:
# "YYYY-MM-DD HH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

from transformers import AutoTokenizer, TFAutoModelForCausalLM
from timeit import default_timer

model_path = "DialoGPT-large-finetuned-by-Microsoft-Keras"

print_current_datetime("Loading Fine-tuned Microsoft DialoGPT Model")

tokenizer_loaded = AutoTokenizer.from_pretrained(model_path)
model_loaded = TFAutoModelForCausalLM.from_pretrained(model_path)

print_current_datetime()

def generate_ouput(prompt=""):

    start = default_timer()

    input_ids = tokenizer_loaded.encode(prompt, return_tensors="tf")
    output_ids = model_loaded.generate(input_ids=input_ids,
                                       max_length=1024+input_ids.shape[1],
                                       temperature=0.7,
                                       top_p=0.9,
                                       do_sample=True,
                                       num_return_sequences=5, # The model will generate five different responses to the prompt.
                                       pad_token_id=tokenizer_loaded.eos_token_id)
    generated_text = tokenizer_loaded.decode(output_ids[0], skip_special_tokens=True)

    end = default_timer()

    # num_return_sequences=5, which means the model will generate 5 different responses to the prompt.
    # The below code loops through the generated responses and print them out with a response number.
    for i, return_sequence in enumerate(output_ids):
        print(f'Response {i+1}: {tokenizer_loaded.decode(return_sequence, skip_special_tokens=True)}')

    print("Time duration(in seconds):", end - start)
    return generated_text

# Let's chat for 10 lines
for step in range(10):
    prompt = input(">> User:")
    if prompt.lower() == "bye": break

    generated_text = generate_ouput(prompt)
    split_generated_text = generated_text.split(prompt)
    if len(split_generated_text) > 1:
        generated_text = split_generated_text[1]
    # Trim the sentences after the last period(.)
    text_to_remove = generated_text.split('.')[-1]
    generated_text = generated_text.replace(text_to_remove,'')

    print(">> GPT: {}".format( generated_text ))
    print_current_datetime()

In [None]:
# Microsoft/DialoGPT-large(file size = 3GB) supports multiple languages
from transformers import AutoTokenizer, TFAutoModelForCausalLM
import tensorflow as tf

# Print the current date and time in the format:
# "YYYY-MM-DD HH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

print_current_datetime("Loading pre-trained microsoft/DialoGPT-large")

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

tokenizer.save_pretrained("DialoGPT-large-by-Microsoft-Keras")
model.save_pretrained("DialoGPT-large-by-Microsoft-Keras")

print_current_datetime("Loading saved DialoGPT-large-by-Microsoft-Keras")

loaded_tokenizer = AutoTokenizer.from_pretrained("DialoGPT-large-by-Microsoft-Keras")
loaded_model = TFAutoModelForCausalLM.from_pretrained("DialoGPT-large-by-Microsoft-Keras")

prompt = "Hi"

input_ids = loaded_tokenizer.encode(prompt, return_tensors='tf')
output_ids = loaded_model.generate(input_ids,
    max_length=1024,
    temperature=0.7, # This parameter controls the "creativity" of the generated text, "1" results unpredictable text
    top_p=0.7, # Cumulative probability distribution of the next word, "top_k": limits the number of candidate words 
    do_sample=True, # Random sampling of the next token instead of selecting the token with the highest probability 
    num_return_sequences=5, # The model will generate five different responses to the prompt.
    pad_token_id=loaded_tokenizer.eos_token_id)
print(loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))

# num_return_sequences=5, which means the model will generate 5 different responses to the prompt.
# The below code loops through the generated responses and print them out with a response number.
for i, output_sequence in enumerate(output_ids):
    print(f'Response {i+1}: {loaded_tokenizer.decode(output_sequence, skip_special_tokens=True)}')

Loading pre-trained microsoft/DialoGPT-large @ CDT(2023-04-03T20:46:16.762737)


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading saved DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-03T20:51:42.969294)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DialoGPT-large-by-Microsoft-Keras.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Hi and I'm a big fan of your videos
Response 1: Hi and I'm a big fan of your videos
Response 2: Hi
Response 3: Hi
Response 4: Hi
Response 5: Hi


In [1]:
# Microsoft/DialoGPT-large(file size = 3GB) supports multiple languages
from transformers import AutoTokenizer, TFAutoModelForCausalLM
import tensorflow as tf

# Print the current date and time in the format:
# "YYYY-MM-DD HH:MM:SS.microseconds"
import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

print_current_datetime("Loading saved DialoGPT-large-by-Microsoft-Keras")

loaded_tokenizer = AutoTokenizer.from_pretrained("DialoGPT-large-by-Microsoft-Keras")
loaded_model = TFAutoModelForCausalLM.from_pretrained("DialoGPT-large-by-Microsoft-Keras")

prompt = "Hi"

input_ids = loaded_tokenizer.encode(prompt, return_tensors='tf')
output_ids = loaded_model.generate(input_ids=input_ids,
                                    max_length=1024+input_ids.shape[1],
                                    temperature=0.9,
                                    top_p=0.9,
                                    do_sample=True,
                                    num_return_sequences=5,
                                    pad_token_id=loaded_tokenizer.eos_token_id)
generated_text = loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True)
split_generated_text = generated_text.split(prompt)
if len(split_generated_text) > 1:
    generated_text = split_generated_text[1]

print("Generated-text:",generated_text)

for i, output_sequence in enumerate(output_ids):
    print(f'Response {i+1}: {loaded_tokenizer.decode(output_sequence, skip_special_tokens=True)}')

2023-04-04 07:30:15.742516: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-04 07:30:15.742551: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Loading saved DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-04T07:30:17.126496)


2023-04-04 07:30:17.926118: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-04-04 07:30:17.926146: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-04-04 07:30:17.926165: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-255-120-161): /proc/driver/nvidia/version does not exist
2023-04-04 07:30:17.926362: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 07:30:17.938886: W tensorflow/python/util/util.cc:368] Sets are not currently consider

Generated-text:  you, welcome to the club!
Response 1: Hi you, welcome to the club!
Response 2: Hi
Response 3: Hi
Response 4: Hi.
Response 5: Hi :D


In [2]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM
import tensorflow as tf

import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

print_current_datetime("Finetune(Before) the DialoGPT-large-by-Microsoft-Keras")

loaded_tokenizer = AutoTokenizer.from_pretrained("DialoGPT-large-by-Microsoft-Keras")
loaded_model = TFAutoModelForCausalLM.from_pretrained("DialoGPT-large-by-Microsoft-Keras")

prompt = "Jesus"

input_ids = loaded_tokenizer.encode(prompt, return_tensors='tf')
output_ids = loaded_model.generate(input_ids=input_ids,
                                    max_length=1024+input_ids.shape[1],
                                    temperature=0.9,
                                    top_p=0.9,
                                    do_sample=True,
                                    num_return_sequences=5,
                                    pad_token_id=loaded_tokenizer.eos_token_id)
generated_text = loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print_current_datetime(generated_text)

for i, output_sequence in enumerate(output_ids):
    print(f'Response {i+1}: {loaded_tokenizer.decode(output_sequence, skip_special_tokens=True)}')

Finetune(Before) the DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-04T07:51:44.185421)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DialoGPT-large-by-Microsoft-Keras.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Jesus @ CDT(2023-04-04T07:52:15.653434)
Response 1: Jesus
Response 2: Jesus... the title was a little unclear.
Response 3: Jesus
Response 4: Jesus a... it's a trap.
Response 5: Jesus, is there any chance you could turn off the game from your home computer. It is your PC's fault, you can't make a computer turn itself on.


In [1]:
import datetime
def print_current_datetime(text=""):
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
    print("{} @ CDT({})".format(text,datetime_string))

import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer
from timeit import default_timer

# Load the text data
with open('영어성경(NKJV+NIV+NLT)_이백만단어.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Instantiate the model
model = TFAutoModelForCausalLM.from_pretrained('DialoGPT-large-by-Microsoft-Keras')

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('DialoGPT-large-by-Microsoft-Keras')

# Tokenize the text data
tokenized_text = tokenizer(text, return_tensors='tf')
print(tokenized_text)

print_current_datetime("Finetune the DialoGPT-large-by-Microsoft-Keras")

# Define the training parameters
model_path = "DialoGPT-large-by-Microsoft-Keras-finetuned"
learning_rate = 1e-5
batch_size = 6
epochs = 6

# Define the training function
@tf.function
def train_step(input_ids):
    # Truncate input sequence
    max_seq_length = 1024 # "1024" for DialoGPT-large
    input_ids = input_ids[:, :max_seq_length]
    with tf.GradientTape() as tape:
        outputs = model(input_ids, training=True)
        logits = outputs.logits[:, :-1, :]
        labels = input_ids[:, 1:]
        loss_value = loss(labels, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

# Create a TensorSliceDataset from the tokenized text
dataset = tf.data.Dataset.from_tensor_slices(tokenized_text['input_ids'])
dataset = dataset.batch(batch_size)

# Define the loss function and optimizer
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Fine-tune the model
for epoch in range(epochs):
    start = default_timer()
    print(f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(dataset):
        loss_value = train_step(batch)
        if step % 50 == 0:
            print(f'Step {step} Loss {loss_value}')
    end = default_timer()
    print("Time duration(in seconds):", end - start)

# Save the fine-tuned model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print_current_datetime()

print_current_datetime("Finetune(After) the DialoGPT-large-by-Microsoft-Keras")

loaded_tokenizer = AutoTokenizer.from_pretrained("DialoGPT-large-by-Microsoft-Keras-finetuned")
loaded_model = TFAutoModelForCausalLM.from_pretrained("DialoGPT-large-by-Microsoft-Keras-finetuned")

prompt = "Jesus"

input_ids = loaded_tokenizer.encode(prompt, return_tensors='tf')
output_ids = loaded_model.generate(input_ids=input_ids,
                                    max_length=1024+input_ids.shape[1],
                                    temperature=0.9,
                                    top_p=0.9,
                                    do_sample=True,
                                    num_return_sequences=5,
                                    pad_token_id=loaded_tokenizer.eos_token_id)
generated_text = loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print_current_datetime(generated_text)

for i, output_sequence in enumerate(output_ids):
    print(f'Response {i+1}: {loaded_tokenizer.decode(output_sequence, skip_special_tokens=True)}')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DialoGPT-large-by-Microsoft-Keras.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (3054759 > 1024). Running this sequence through the model will result in indexing errors


{'input_ids': <tf.Tensor: shape=(1, 3054759), dtype=int32, numpy=array([[ 818,  262, 3726, ...,  345,  477,   13]])>, 'attention_mask': <tf.Tensor: shape=(1, 3054759), dtype=int32, numpy=array([[1, 1, 1, ..., 1, 1, 1]])>}
Finetune the DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-04T17:47:04.236880)
Epoch 1/6
Step 0 Loss 12.522553443908691
Time duration(in seconds): 74.71662520000001
Epoch 2/6
Step 0 Loss 10.721572875976562
Time duration(in seconds): 30.79476849999999
Epoch 3/6
Step 0 Loss 10.05924129486084
Time duration(in seconds): 31.29861169999998
Epoch 4/6
Step 0 Loss 9.5293607711792
Time duration(in seconds): 31.979575799999992
Epoch 5/6
Step 0 Loss 9.15305233001709
Time duration(in seconds): 31.681460799999996
Epoch 6/6
Step 0 Loss 8.982477188110352
Time duration(in seconds): 31.39858380000001
 @ CDT(2023-04-04T17:52:13.947517)
Finetune(After) the DialoGPT-large-by-Microsoft-Keras @ CDT(2023-04-04T17:52:13.948516)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DialoGPT-large-by-Microsoft-Keras-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Jesus @ CDT(2023-04-04T17:52:32.831162)
Response 1: Jesus
Response 2: Jesus the that's not how it works.
Response 3: Jesus on dude
Response 4: Jesus!, i want to learn Spanish
Response 5: Jesus
