In [2]:
# GPTv2 모델 파인튜닝 & 저장 : 약 1시간 걸림

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the text data
with open('bible_korean_easy.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Instantiate the GPT-2 model
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

# Tokenize the text data
tokenized_text = tokenizer(text, return_tensors='tf')
print(tokenized_text)

# Define the training parameters
model_path = "./output/gpt2-finetuned-epoch-66"
max_seq_length = 1024 # Usually 1024 for GPT-2
learning_rate = 3e-5
batch_size = 16
epochs = 66

# Define the training function
@tf.function
def train_step(input_ids):
    # Truncate input sequence
    max_seq_length = max_seq_length
    input_ids = input_ids[:, :max_seq_length]
    with tf.GradientTape() as tape:
        outputs = model(input_ids, training=True)
        logits = outputs.logits[:, :-1, :]
        labels = input_ids[:, 1:]
        loss_value = loss(labels, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

# Create a TensorSliceDataset from the tokenized text
dataset = tf.data.Dataset.from_tensor_slices(tokenized_text['input_ids'])
dataset = dataset.batch(batch_size)

# Define the loss function and optimizer
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Fine-tune the model
for epoch in range(epochs):
    start = default_timer()
    print(f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(dataset):
        loss_value = train_step(batch)
        if step % 50 == 0:
            print(f'Step {step} Loss {loss_value}')
    end = default_timer()
    print("Time duration(in seconds):", end - start)
    
    # Save the fine-tuned model
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

2023-03-07 22:47:07.988091: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-07 22:47:07.988115: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-07 22:47:07.988132: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-255-120-161): /proc/driver/nvidia/version does not exist
2023-03-07 22:47:07.988364: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-07 22:47:08.001102: W tensorflow/python/util/util.cc:368] Sets are not currently consider

{'input_ids': <tf.Tensor: shape=(1, 1042061), dtype=int32, numpy=array([[ 9342,   392, 20252, ...,  9050,  7523,   389]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 1042061), dtype=int32, numpy=array([[1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}
Epoch 1/66
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Step 0 Loss 2.855166435241699
Time duration(in seconds): 15.320759003981948
Epoch 2/66
Step 0 Loss 2.426194906234741
Time duration(in seconds): 4.289433935016859
Epoch 3/66
Step 0 L

In [3]:
# GPT 모델 활용 : 문장 생성

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-66"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

start = default_timer()
# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
generated_ids = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # 1.0 indicates no penalty for repeating tokens, up to the 2.0
                        num_return_sequences=1, # the number of independent sequences to generate for each prompt
                        early_stopping=True, # stops generating a sentence before max_length working with eos_token_id
                        use_cache=True, # enables or disables the use of the model's internal cache (repetitive output)
                        eos_token_id=tokenizer.eos_token_id)

output_ids = generated_ids.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./output/gpt2-finetuned-epoch-66.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


tf.Tensor([[17835  7177]], shape=(1, 2), dtype=int32)
[17835, 7177, 23916, 24117, 8702, 7816, 16691, 9640, 16478, 11792, 7372, 50075, 16256, 20085, 16157, 10010, 8344, 392, 47714, 10910, 9022, 10503, 9136, 9402, 19495, 15172, 9506, 12219, 8744, 16913, 7182, 36500, 34265, 9020, 401, 394, 11993, 9049, 7888, 11597, 10554, 9481, 6921, 10078, 26415, 9045, 20252, 9724, 24381, 9347, 405, 9298, 13966, 29335, 22366, 9659, 8210, 8017, 18961, 49084, 9342, 393, 35110, 9108, 9306, 12315, 387, 35420, 28732, 8146, 13926, 6866, 9078, 35180, 6969, 26764, 18911, 9937, 11257, 9085, 40708, 10917, 10106, 7172, 21598, 33835, 9106, 14858, 7532, 9018, 20767, 14684, 9673, 9661, 13083, 9135, 10542, 22294, 12997, 12351, 9130, 9394, 9143, 6824, 9835, 21154, 8, 12199, 8711, 10033, 13805, 21734, 9563, 20056, 35739, 7244, 9933, 31569, 9023, 13363, 9172, 10470, 10802, 9457, 18910, 8658, 7401, 681]
예수님께서 말씀하셨습니다. “빛이 생겨라!” 그러자 빛이 생겼습니다.
창1:2 그런데 그 빛들이 하나님의 형상대로 뽑혔습니다!
창은 1:3 그것을 보시니, 그대로 되었군요.
창이 2:1 <세계의 시작> 태초에 하늘과 

In [4]:
# GPT 모델 활용 : 문장 생성

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-66"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님을 누구라고 생각하니?'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

start = default_timer()
# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
generated_ids = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # 1.0 indicates no penalty for repeating tokens, up to the 2.0
                        num_return_sequences=1, # the number of independent sequences to generate for each prompt
                        early_stopping=True, # stops generating a sentence before max_length working with eos_token_id
                        use_cache=True, # enables or disables the use of the model's internal cache (repetitive output)
                        eos_token_id=tokenizer.eos_token_id)

output_ids = generated_ids.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./output/gpt2-finetuned-epoch-66.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


tf.Tensor([[17835 20108 14938  9329  9658 19202   406]], shape=(1, 7), dtype=int32)
[17835, 20108, 14938, 9329, 9658, 19202, 406, 16563, 377, 7561, 7428, 21154, 9402, 7177, 23916, 9049, 35181, 9677, 7978, 16691, 19759, 9022, 14988, 35420, 28732, 8137, 10106, 7816, 6958, 38533, 8344, 392, 47714, 10910, 20085, 11792, 7372, 36500, 34265, 9020, 401, 394, 11993, 11560, 8346, 6889, 14618, 9172, 7067, 18895, 10645, 7888, 9075, 9108, 16878, 739, 6993, 10010, 26415, 14479, 20485, 15584, 15312, 9668, 26764, 18911, 14309, 7283, 9124, 26881, 9701, 17827, 10021, 9018, 9179, 9661, 14782, 33835, 9148, 32810, 9347, 9186, 16913, 9503, 49084, 9342, 393, 20252, 9724, 24381, 47288, 405, 9298, 13966, 29335, 22366, 12541, 7252, 41262, 9782, 9582, 28608, 9080, 16082, 9320, 11409, 11920, 23567, 11153, 7235, 14059, 9583, 23942, 10171, 8017, 18961, 41580, 9045, 40708, 17969, 8146, 32389, 17339, 11352, 20767, 9069, 11597, 10917]
예수님을 누구라고 생각하니?"
"물론입니다. 하나님께서 보시기에 좋았습니다. 왜냐하면 그분은 빛과 어둠을 나누셨기 때문입니다.
창1:2 그런데 빛이 생