In [22]:
# GPTv2 모델 사용
# TFGPT2LMHeadModel.from_pretrained('GPT 모델 이름') : 두개의 문장이 이어지는 문장인지를 판단하는 GPT 모델 로드
# AutoTokenizer.from_pretrained('GPT 모델 이름') : 위 로드된 모델이 학습되었을 당시에 사용된 토크나이저를 로드

import numpy as np
import random
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
output = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # avoiding repeated tokens (higher value means more diverse output)
                        use_cache=True) # enables or disables the use of the model's internal cache (repetitive output)

output_ids = output.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
tokenizer.decode(output_ids)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.6.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'lm_head.weight', 'transformer.h.11.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.9.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

tf.Tensor([[17835  7177]], shape=(1, 2), dtype=int32)
[17835, 7177, 6995, 16098, 7281, 9885, 13358, 10010, 6947, 7399, 7220, 9022, 6866, 19588, 9077, 18337, 17955, 16913, 13485, 8146, 8196, 9265, 7162, 9018, 7895, 10936, 9034, 8325, 9148, 45887, 9402, 19495, 24117, 8137, 12904, 10590, 11698, 32937, 9351, 7470, 19325, 8702, 11768, 9129, 10542, 19561, 7788, 15709, 9782, 11649, 13023, 9337, 15092, 8092, 9620, 22375, 9076, 9038, 9863, 10578, 15605, 8263, 35453, 11718, 21319, 7532, 15378, 10401, 50997, 9277, 19635, 8075, 11594, 9199, 9929, 6824, 13675, 30903, 11114, 9355, 12517, 43242, 13203, 9134, 18607, 9362, 39376, 43056, 13768, 28569, 24488, 406, 9316, 32010, 23753, 7991, 15525, 37767, 10070, 7235, 10917, 24454, 11387, 35187, 20337, 31994, 9046, 7890, 25226, 9272, 46588, 14485, 9172, 7587, 13486, 9723, 681, 9661, 16691, 8, 12199, 8711, 10033, 13805, 21734, 9563, 19367, 13386]


'예수님께 기도드리고 싶습니다.\n그런데 그게 무슨 소용이 있겠습니까?\n이제 저는 이십 년 전쯤부터 제가 하나님의 말씀을 듣고 있습니다.\n하나님은 우리를 구원하시는 분입니다.\n그래서 우리가 지금 어떻게 해야 할까요?\n우리가 무엇을 원하고 어떤 일을 원하는지 알아야 합니다.\n그러면 우리는 왜 살아야 하는가?\n왜 사는 것이 필요한가요?\n우리는 무엇 때문에 살고 있는가?\n우리에게 주어진 것은 무엇인가?\n그리고 우리의 삶은 무엇인가?라는 질문을 던져야 됩니다.\n저는 오늘도 이렇게 질문합니다.\n예수님이 우리에게 주신 메시지는 무엇일까요?\n바로 ‘믿음의 힘’이었습니다.</d> 지난해 12월 31일 오후 서울 종로구 세종'

In [None]:
# 입력문장의 다음 단어로 가장 확률이 높은 단어 Top 5 예측
output = model(input_ids)
top5 = tf.math.top_k(output.logits[0, -1], k=5)
tokenizer.convert_ids_to_tokens(top5.indices.numpy())

In [None]:
# Upgrade transformers and datasets to latest versions
'''
pip install tensorflow==2.7.0
pip install transformers==4.21.0
'''

In [1]:
# GPTv2 모델 파인튜닝 & 저장

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the text data
with open('bible_john_corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Instantiate the GPT-2 model
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

# Tokenize the text data
tokenized_text = tokenizer(text, return_tensors='tf')
print(tokenized_text)

# Define the training function
@tf.function
def train_step(input_ids):
    # Truncate input sequence
    max_seq_length = 1024 # Usually 1024 for GPT-2
    input_ids = input_ids[:, :max_seq_length]
    with tf.GradientTape() as tape:
        outputs = model(input_ids, training=True)
        logits = outputs.logits[:, :-1, :]
        labels = input_ids[:, 1:]
        loss_value = loss(labels, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

# Define the training parameters
batch_size = 16
learning_rate = 3e-5
epochs = 1

# Create a TensorSliceDataset from the tokenized text
dataset = tf.data.Dataset.from_tensor_slices(tokenized_text['input_ids'])
dataset = dataset.batch(batch_size)

# Define the loss function and optimizer
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Fine-tune the model
for epoch in range(epochs):
    start = default_timer()
    print(f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(dataset):
        loss_value = train_step(batch)
        if step % 50 == 0:
            print(f'Step {step} Loss {loss_value}')
    end = default_timer()
    print("Time duration(in seconds):", end - start)
    
    # Save the fine-tuned model
    model.save_pretrained(f'./output/gpt2-finetuned-epoch-{epoch+1}')
    tokenizer.save_pretrained(f'./output/gpt2-finetuned-epoch-{epoch+1}')

2023-03-07 22:02:30.539018: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-07 22:02:30.539044: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-07 22:02:34.136619: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-07 22:02:34.136642: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-07 22:02:34.136657: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-255-120-161): /proc/driver/nvidia/version does not exist
2023-03-07 22:02:34.136860: I tensorflow/core/platform/cpu_

{'input_ids': <tf.Tensor: shape=(1, 27695), dtype=int32, numpy=array([[ 9724,  7492,  7953, ...,  8705, 32240,  9051]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 27695), dtype=int32, numpy=array([[1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}
Epoch 1/1
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Step 0 Loss 3.841675043106079
Time duration(in seconds): 15.190023487026338


In [2]:
# GPT 모델 활용 : 문장 요약

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer
from timeit import default_timer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-1"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Generate summaries
max_length = 40
num_beams = 4
input_text = "예수님"
input_ids = tokenizer.encode(input_text, return_tensors="tf")
output_ids = model.generate(input_ids, max_length=max_length, num_beams=num_beams, no_repeat_ngram_size=2, early_stopping=True)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated summary
print("Input Text: ", input_text)
print("Generated Summary: ", output_text)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./output/gpt2-finetuned-epoch-1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Input Text:  예수님
Generated Summary:  예수님께 감사드립니다."
"감사합니다, 목사님."
"목사님, 저는 저를 사랑하고 있습니다."
그녀는 고개를 끄덕였다.
"그런데 왜 저


In [8]:
# GPT 모델 활용 : 문장 생성

import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer

# Load the tokenizer and model
model_path = "./output/gpt2-finetuned-epoch-1"
model = TFGPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# GPT가 생성할 문장의 방향성을 알려주기 위한 시작 문자열
sent = '예수님'

# 텍스트 시퀀스를 정수 시퀀스로 변환
input_ids = tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print(input_ids)

start = default_timer()
# 정수 시퀀스를 입력받아 GPT가 이어서 문장을 생성 : 약 20초 걸림 (using 1 cpu)
generated_ids = model.generate(input_ids, # a tensor containing the input sequence encoded as integer IDs
                        max_length=128, # the maximum length of the generated sequence, in terms of tokens
                        repetition_penalty=2.0, # 1.0 indicates no penalty for repeating tokens, up to the 2.0
                        num_return_sequences=1, # the number of independent sequences to generate for each prompt
                        early_stopping=True, # stops generating a sentence before max_length working with eos_token_id
                        use_cache=True) # enables or disables the use of the model's internal cache (repetitive output)

output_ids = generated_ids.numpy().tolist()[0]
print(output_ids)

# 정수 시퀀스를 텍스트 시퀀스로 변환
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

start = default_timer()
# 가장 높은 확률 순으로 한 문장만 완성될 때까지만 생성
generated_ids = model.generate(input_ids, max_length=128, repetition_penalty=2.0, early_stopping=True, eos_token_id=tokenizer.eos_token_id)
output_ids = generated_ids.numpy().tolist()[0]
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

start = default_timer()
# 높은 확률(44위)내 랜덤으로 한 문장만 완성될 때까지만 생성 : 부자연스러운 문장이 될 수 있음
generated_ids = model.generate(input_ids, max_length=128, repetition_penalty=2.0, do_sample=True, top_k=44, early_stopping=True, eos_token_id=tokenizer.eos_token_id)
output_ids = generated_ids.numpy().tolist()[0]
decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
print(decoded)
end = default_timer()
print("Time duration(in seconds):", end - start)

# Stop generating text if the generated text is a complete sentence
'''
generated_text = ""
while True:
    # Generate one sequence of text
    output = model.generate(input_ids, max_length=40, repetition_penalty=2.0)
    sequence = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_text += sequence.strip()
    if generated_text.endswith((".", "!", "?")):
        break
return generated_text
print("A complete sentence has been generated :", generated_text)
'''

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./output/gpt2-finetuned-epoch-1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


tf.Tensor([[17835  7177]], shape=(1, 2), dtype=int32)
[17835, 7177, 6995, 16098, 7281, 9885, 13358, 8017, 10010, 6947, 7399, 7220, 9022, 13041, 13080, 7816, 8137, 9068, 12102, 9018, 15406, 9080, 9548, 10574, 9054, 9393, 14486, 9290, 12487, 9414, 14782, 10972, 12858, 17582, 10909, 33561, 7489, 9181, 12306, 14085, 387, 9455, 9351, 7470, 29543, 9179, 9383, 9658, 8718, 18961, 47637, 11791, 9432, 44235, 19826, 22507, 11273, 9846, 10948, 15378, 9402, 19495, 10156, 9445, 10687, 15562, 9481, 34693, 43056, 14121, 24692, 25203, 11611, 13885, 9362, 10021, 9063, 9415, 9661, 31204, 10078, 21319, 9433, 15709, 24736, 9173, 31011, 9927, 13768, 14145, 6958, 9355, 9258, 19520, 12683, 9316, 9572, 9237, 19747, 9135, 9685, 17155, 7978, 6872, 8263, 13675, 8146, 8196, 9148, 9094, 39576, 45937, 10166, 14955, 16085, 18789, 9075, 9294, 11283, 9871, 45023, 7788, 14269, 9199, 9554, 13076, 13300, 13680]
예수님께 기도드리고 싶었습니다.
그런데 그분이 돌아가셨을 때 나는 이 땅에 있는 모든 사람들이 다 죽었다는 것을 알고 있었어요.
나는 그때 내가 죽은 줄로만 알았던 사람들, 즉 우리를 죽인 사람이라고 

'\ngenerated_text = ""\nwhile True:\n    # Generate one sequence of text\n    output = model.generate(input_ids, max_length=40, repetition_penalty=2.0)\n    sequence = tokenizer.decode(output[0], skip_special_tokens=True)\n    generated_text += sequence.strip()\n    if generated_text.endswith((".", "!", "?")):\n        break\nreturn generated_text\nprint("A complete sentence has been generated :", generated_text)\n'