In [1]:
!pip install -r requirements.txt
import random

import pandas as pd
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments



In [2]:
# Load preprocessed data
df = pd.read_csv("clinical_trials.csv")

# Fail fast: the tokenization step reads df["text"], so a missing column should
# surface here with a clear message rather than deep inside the tokenizer call.
if "text" not in df.columns:
    raise KeyError(
        f'"clinical_trials.csv" must contain a "text" column; found columns: {list(df.columns)}'
    )

In [None]:
# Initialize tokenizer and model
MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # one place to change the checkpoint id

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPTNeoForCausalLM.from_pretrained(MODEL_NAME)

# The GPT-2 / GPT-Neo vocabulary ships without a padding token, so any tokenizer
# call with padding="max_length" raises until one is assigned. Reusing EOS as
# the pad token avoids adding a new embedding row; the id is mirrored onto the
# model config so the model and tokenizer agree on it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Tokenize input data
def tokenize_function(examples):
    """Tokenize a batch of texts to a fixed length of 512 tokens.

    Args:
        examples: Mapping whose "text" entry holds the text(s) to tokenize.

    Returns:
        The tokenizer output for ``examples["text"]``, padded to and truncated
        at 512 tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)


# Padding needs a pad token, which the GPT-2 / GPT-Neo tokenizer does not define
# by default; fall back to EOS if nothing has been assigned yet.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `df` is a pandas DataFrame, which has no batched `map` (pandas' own
# DataFrame.map is element-wise). Tokenize the text column directly instead, in
# a single batch as the original batch_size=len(df) intended.
# Null rows are dropped because the tokenizer only accepts strings.
texts = df["text"].dropna().astype(str).tolist()
if not texts:
    raise ValueError('No non-null rows found in the "text" column; nothing to tokenize.')

encodings = tokenize_function({"text": texts})

# One dict per example: a list of dicts has __len__/__getitem__, which is all
# Trainer needs from a map-style dataset. For causal-LM training the labels are
# the input ids themselves; padded positions (attention_mask == 0) are set to
# -100 so the loss ignores them.
tokenized_data = [
    {
        "input_ids": ids,
        "attention_mask": mask,
        "labels": [tok if keep else -100 for tok, keep in zip(ids, mask)],
    }
    for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
]

In [None]:
# Set up training arguments
# Hyperparameters are gathered in a plain dict first so they can be inspected
# or tweaked in one place before being handed to TrainingArguments.
training_config = {
    "output_dir": "./output",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    # Log and checkpoint on the same 1000-step cadence.
    "logging_steps": 1000,
    "save_steps": 1000,
    "warmup_steps": 500,
    # Step-based evaluation every 1000 steps; the Trainer needs an
    # eval_dataset for this to run.
    "evaluation_strategy": "steps",
    "eval_steps": 1000,
    "learning_rate": 2e-5,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Set up Trainer and train model
# training_args enables step-based evaluation, which the Trainer cannot run
# without an eval_dataset. Hold out a seeded random slice of the tokenized
# examples so the split is reproducible across kernel restarts.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42

n_examples = len(tokenized_data)
if n_examples < 2:
    raise ValueError(
        f"Need at least 2 tokenized examples to build train/eval splits, got {n_examples}."
    )

shuffled_indices = list(range(n_examples))
random.Random(SPLIT_SEED).shuffle(shuffled_indices)
n_eval = max(1, int(n_examples * EVAL_FRACTION))

# Index one example at a time so the split works for any map-style dataset
# (anything exposing __len__ and integer __getitem__).
eval_dataset = [tokenized_data[i] for i in shuffled_indices[:n_eval]]
train_dataset = [tokenized_data[i] for i in shuffled_indices[n_eval:]]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

In [None]:
# Save trained model
# Write the tokenizer files next to the weights so the "model" directory is
# self-contained and both can be reloaded from it with from_pretrained.
model.save_pretrained("model")
tokenizer.save_pretrained("model")