In [None]:
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

# 1. Download the Czech GPT-2 checkpoint and its tokenizer from the Hugging Face Hub
model_name = "spital/gpt2-small-czech-cs"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the supported
# auto class for left-to-right language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:

# 2. Load a local data.csv file with event titles and descriptions
data = pd.read_csv('data/data.csv')

# 3. Fine-tune the downloaded model on this file to be able to generate event descriptions based on event titles
# Empty CSV cells are read as NaN floats and numeric-looking cells as numbers;
# either would make str.join raise TypeError, so drop missing values and cast
# everything that remains to str.
# NOTE(review): titles and descriptions are written as independent lines, so the
# model never sees a title followed by its description - confirm this is intended.
train_texts = [
    str(text)
    for text in data['name'].tolist() + data['description'].tolist()
    if pd.notna(text)
]

# LineByLineTextDataset reads the file as UTF-8, so write it as UTF-8 explicitly
# instead of relying on the platform default encoding (matters for Czech
# diacritics). The context manager closes the handle even if the write fails.
with open("data/train.txt", "w", encoding="utf-8") as train_file:
    train_file.write("\n".join(train_texts))

train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="data/train.txt", block_size=128)
# mlm=False selects the causal (next-token) language-modeling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-5,
    overwrite_output_dir=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

trainer.train()


In [None]:

# 4. Generate event descriptions based on event titles
# Training leaves the model in training mode; switch to eval mode so dropout
# is disabled while sampling.
model.eval()

# Skip missing titles: a NaN float cannot be tokenized.
for title in data['name'].dropna():
    input_text = str(title)
    # Calling the tokenizer returns input_ids together with the attention mask;
    # both are moved to the model's device because Trainer may have placed the
    # model on a GPU while freshly created tensors live on the CPU.
    encoded_input = tokenizer(input_text, return_tensors='pt').to(model.device)
    output = model.generate(
        **encoded_input,
        max_length=100,
        do_sample=True,
        # Reuse the end-of-sequence token for padding, since generate() is
        # otherwise called here without an explicit pad token.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The decoded sequence contains the prompt (title) followed by the continuation.
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Title: {title}\nGenerated Description: {generated_text}")
