In [None]:
# GPTv2 모델 파인튜닝 & 저장 : 약 3시간 걸림 (8 CPU & 0 GPU)

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the text data
with open('한글성경들(마침표제거)_정제후말뭉치_약백만단어.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Instantiate the GPT-2 model
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

# Tokenize the text data
tokenized_text = tokenizer(text, return_tensors='tf')
print(tokenized_text)

# Define the training parameters
model_path = "./output/gpt2-finetuned-epoch-66"
learning_rate = 3e-5
batch_size = 16
epochs = 66

# Define the training function
@tf.function
def train_step(input_ids):
    # Truncate input sequence
    max_seq_length = 1024 # "1024" for GPT-2
    input_ids = input_ids[:, :max_seq_length]
    with tf.GradientTape() as tape:
        outputs = model(input_ids, training=True)
        logits = outputs.logits[:, :-1, :]
        labels = input_ids[:, 1:]
        loss_value = loss(labels, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

# Create a TensorSliceDataset from the tokenized text
dataset = tf.data.Dataset.from_tensor_slices(tokenized_text['input_ids'])
dataset = dataset.batch(batch_size)

# Define the loss function and optimizer
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Fine-tune the model
for epoch in range(epochs):
    start = default_timer()
    print(f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(dataset):
        loss_value = train_step(batch)
        if step % 50 == 0:
            print(f'Step {step} Loss {loss_value}')
    end = default_timer()
    print("Time duration(in seconds):", end - start)

# Save the fine-tuned model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

2023-03-09 01:46:15.052943: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-09 01:46:15.052967: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-09 01:46:19.551355: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-09 01:46:19.551389: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-09 01:46:19.551414: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-255-120-161): /proc/driver/nvidia/version does not exist
2023-03-09 01:46:19.551676: I tensorflow/core/platform/cpu_

In [1]:
# GPT 모델 활용 : 문장 생성

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-66"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

start = default_timer()
# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
generated_ids = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # 1.0 indicates no penalty for repeating tokens, up to the 2.0
                        num_return_sequences=1, # the number of independent sequences to generate for each prompt
                        early_stopping=True, # stops generating a sentence before max_length working with eos_token_id
                        use_cache=True, # enables or disables the use of the model's internal cache (repetitive output)
                        eos_token_id=tokenizer.eos_token_id)

output_ids = generated_ids.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

2023-03-09 04:47:02.218296: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-09 04:47:02.218325: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-09 04:47:04.467570: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-09 04:47:04.467593: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-09 04:47:04.467610: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-255-120-161): /proc/driver/nvidia/version does not exist
2023-03-09 04:47:04.467818: I tensorflow/core/platform/cpu_

tf.Tensor([[17835  7177]], shape=(1, 2), dtype=int32)
[17835, 7177, 23916, 24117, 8702, 7816, 16913, 7182, 9051, 6947, 7399, 7220, 9022, 20085, 9782, 9582, 28608, 9080, 16082, 9320, 13801, 11920, 9402, 10433, 31825, 23678, 9414, 8006, 25856, 28732, 8146, 11404, 24487, 20776, 18406, 35420, 36861, 11792, 7372, 16256, 19300, 9157, 9612, 16157, 6889, 14618, 9960, 50339, 10645, 9926, 9651, 10010, 39514, 8137, 32389, 17339, 9018, 20767, 14684, 9673, 9661, 14782, 11698, 19495, 44111, 9169, 13721, 10536, 9466, 9142, 7810, 7788, 11947, 28797, 10781, 21511, 17969, 6903, 34935, 28056, 13183, 8143, 18636, 10106, 8159, 11403, 11091, 24752, 16107, 9036, 14247, 9079, 10280, 739, 7567, 8135, 10551, 9375, 9078, 35180, 10554, 9481, 18961, 22163, 23567, 9710, 387, 12082, 9585, 7807, 8286, 41878, 9108, 9306, 33136, 15312, 16691, 23354, 9034, 7489, 13927, 37202, 9176, 10070, 14143, 19749, 9564, 11936]
예수님께서 말씀하셨습니다 
그런데 그 빛이 지금처럼 짜임새 있는 모습이 아니었고 생물 하나 없이 텅 비어 있었어요. 어둠이 깊은 바다를 덮고 있었고 빛과 그림자가 생겨라 그러자 빛의 형성이 

In [2]:
# GPT 모델 활용 : 문장 생성

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-66"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님을 누구라고 생각하니?'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

start = default_timer()
# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
generated_ids = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # 1.0 indicates no penalty for repeating tokens, up to the 2.0
                        num_return_sequences=1, # the number of independent sequences to generate for each prompt
                        early_stopping=True, # stops generating a sentence before max_length working with eos_token_id
                        use_cache=True, # enables or disables the use of the model's internal cache (repetitive output)
                        eos_token_id=tokenizer.eos_token_id)

output_ids = generated_ids.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./output/gpt2-finetuned-epoch-66.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


tf.Tensor([[17835 20108 14938  9329  9658 19202   406]], shape=(1, 7), dtype=int32)
[17835, 20108, 14938, 9329, 9658, 19202, 406, 16563, 6947, 7399, 7220, 9022, 11146, 9782, 9582, 28608, 9080, 16082, 9320, 13801, 11920, 9402, 10433, 31825, 23678, 9414, 16913, 7182, 28732, 8146, 11404, 24487, 20776, 18406, 9169, 13721, 16235, 41701, 9564, 17828, 10010, 47637, 9835, 9177, 9036, 14247, 9079, 9466, 739, 7567, 8135, 10551, 9375, 9078, 35180, 10554, 9481, 16691, 23354, 9133, 9018, 20767, 14684, 9673, 9661, 18961, 11698, 7177, 23916, 24117, 8702, 7816, 6889, 9108, 9135, 8718, 8017, 8006, 25856, 9051, 7261, 8286, 41878, 40340, 8149, 24917, 17339, 6903, 17969, 8137, 10106, 7530, 12859, 8263, 10171, 14782, 39514, 9052, 12503, 34677, 9142, 7810, 7788, 10578, 9049, 7888, 7251, 13675, 7807, 43143, 11091, 9793, 12611, 24186, 11792, 11947, 28797, 10781, 7372, 16256, 14927, 39417, 34693, 24389, 23295, 10027, 11608, 24860]
예수님을 누구라고 생각하니?"
그런데 그 사람은 지금처럼 짜임새 있는 모습이 아니었고 생물 하나 없이 텅 비어 있었습니다 어둠이 깊은 바다를 덮

In [3]:
# Print the current date and time in the format:
# "YYYY-MM-DD HH:MM:SS.microseconds"
import datetime
datetime_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
print("Current date and time: ", datetime_string)

Current date and time:  2023-03-09 04:48:39.625730
