In [5]:
import matplotlib.pyplot as plt
import tensorflow as tf
from IPython.display import clear_output
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer, TFAutoModelForCausalLM

In [6]:
# Load tokenizer and GPT-2 model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Load dataset
with open("combined_dataset6.csv", "r", encoding="utf-8") as f:
    poems = f.read().splitlines()
poems = poems[1:]

# Tokenize all poems at once
max_length = 256
inputs = tokenizer(poems, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)

# Convert inputs to TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((inputs['input_ids'], inputs['attention_mask']))

# Prepare the dataset for training
batch_size = 32
dataset = dataset.shuffle(buffer_size=1000).batch(batch_size)

# Define the optimizer with linear decay
num_train_epochs = 25  # Number of training epochs
total_steps = len(poems) // batch_size * num_train_epochs

learning_rate_schedule = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=5e-7,
    decay_steps=total_steps,
    power=1.0  # Linear decay
)

# Define the optimizer
initial_learning_rate = 5e-5
optimizer = Adam(learning_rate=initial_learning_rate)

# Define the training step
@tf.function
def train_step(input_ids, attention_mask):
    with tf.GradientTape() as tape:
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

loss_values = []
current_step = 0

# Perform training
for epoch in range(num_train_epochs):

    epoch_loss = 0
    step_count = 0
    dataset_iterator = iter(dataset)  # Create a new iterator for the dataset at the start of each epoch

    for step, (input_ids, attention_mask) in enumerate(dataset_iterator):

        # Update the learning rate for the current step
        current_lr = learning_rate_schedule(current_step)
        optimizer.learning_rate = current_lr

        # Perform a training step
        with tf.GradientTape() as tape:
            loss = train_step(input_ids, attention_mask)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        epoch_loss += loss.numpy()
        step_count += 1
        current_step += 1

    epoch_loss /= step_count  # Calculate average loss per epoch
    loss_values.append(epoch_loss)

    print(f"Epoch {epoch+1} - Loss: {epoch_loss}")

    # Plot the loss
    plt.figure(figsize=(10, 5))
    plt.plot(loss_values, label='Training Loss', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss Over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()
    clear_output(wait=True)



All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


KeyboardInterrupt: 

In [None]:
def generate_poem(prompt: str, temperature, top_k, top_p, min_length, max_length) -> str:
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    attention_mask = tf.ones_like(input_ids)  # Create attention mask
    generated_text_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        min_length=min_length,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(generated_text_ids[0], skip_special_tokens=True)
    return generated_text

# Example usage with adjusted generation parameters
prompt = "Шагыйрь"
poem = generate_poem(prompt, temperature=0.9, min_length=200, max_length=400, top_k=50, top_p=0.9)

print(poem)