In [None]:
from transformers import EncoderDecoderModel, BertTokenizerFast, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

In [None]:
# Toy supervised corpus: each tuple pairs a prompt with its reference answer.
qa_pairs = [
    (
        "What are the key challenges in scaling biotech startups in North Carolina?",
        "Biotech startups in NC face regulatory, funding, and talent acquisition challenges.",
    ),
    (
        "Describe a successful commercialization of research in the NC Research Triangle.",
        "A Duke lab developed a novel diagnostic tool, licensed it to a local company, and scaled production via a university-industry partnership.",
    ),
]

# Column-oriented view of the same pairs, keyed by the column names the
# preprocessing step reads.
data = {
    "input_text": [prompt for prompt, _answer in qa_pairs],
    "target_text": [answer for _prompt, answer in qa_pairs],
}

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

In [None]:
# Load the pretrained tokenizer for the BERT checkpoint named below.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)

def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs into model-ready features.

    Args:
        examples: Batch mapping with "input_text" and "target_text" keys, each
            holding a list of strings.

    Returns:
        The tokenizer output for the input texts, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so the loss ignores them.
    """
    inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=128)

    # Targets are padded out to max_length. Left as-is, the pad ids would be
    # scored by the loss and the model would mostly learn to emit padding.
    # -100 is the ignore index of the cross-entropy loss, so those positions
    # contribute nothing to the gradient.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Run preprocess_function over the dataset, one batch of rows per call.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

In [None]:
# Build a seq2seq model whose encoder and decoder are both initialised from
# the same BERT checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# The combined config has no seq2seq special tokens set. When only `labels`
# are supplied, the model builds decoder_input_ids by shifting the labels right,
# which needs decoder_start_token_id and pad_token_id; without them the first
# training step raises ValueError.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# Lets generation stop at BERT's separator token.
model.config.eos_token_id = tokenizer.sep_token_id

In [None]:
# Fine-tuning settings, collected in one mapping before building TrainingArguments.
training_options = {
    # Output locations; keep at most one saved checkpoint.
    "output_dir": "./llm_advisory_output",
    "logging_dir": "./logs",
    "save_total_limit": 1,
    # Optimisation settings.
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    # Evaluate once per epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    "evaluation_strategy": "epoch",
}
training_args = TrainingArguments(**training_options)

In [None]:
# Wire the model, hyperparameters, data and tokenizer into a Trainer.
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# per-epoch evaluation only measures fit to the training examples.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "tokenizer": tokenizer,
    "train_dataset": tokenized_dataset,
    "eval_dataset": tokenized_dataset,
}
trainer = Trainer(**trainer_kwargs)

# Left as the last expression so the cell displays the training result.
trainer.train()