In [5]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
from datasets import Dataset

In [6]:
# Read the assessment sheet and discard rows whose "Program" cell is missing,
# so every remaining row contributes a training text.
data = pd.read_csv("assessment.csv").dropna(subset=["Program"])

# The values of the "Program" column, as a plain Python list.
train_texts = data["Program"].to_list()

# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token; the padded tokenizer calls later in the notebook rely on
# a pad token being set.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Encode every program text on its own into a list of token ids.
# NOTE(review): `train_encodings` is reassigned by the batched tokenizer call
# in the following cell, so this per-text result is not what gets trained on.
train_encodings = [
    tokenizer.encode(program_text, truncation=True, padding=True)
    for program_text in train_texts
]

In [8]:
# Tokenize the whole corpus in a single batched call (truncation and padding
# enabled), then wrap the resulting columns in a `datasets.Dataset` that the
# Trainer can consume.
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
)
train_dataset = Dataset.from_dict(train_encodings)

# Start from the pretrained GPT-2 checkpoint with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [9]:
# Settings for the fine-tuning run, gathered in one mapping so they are easy
# to read and tweak before being handed to TrainingArguments.
training_settings = {
    # Output directory where model checkpoints and logs will be saved
    "output_dir": "./output",
    "per_device_train_batch_size": 4,
    "num_train_epochs": 3,
    "logging_dir": "./logs",
}
training_args = TrainingArguments(**training_settings)

In [10]:
# Collator that batches the tokenized examples for language modeling;
# mlm=False turns off masked-language-model masking.
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, run settings, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=lm_collator,
)

# Run the fine-tuning loop; left as the last expression so the notebook
# displays the returned training summary.
trainer.train()



Step,Training Loss


TrainOutput(global_step=42, training_loss=2.1928983415876115, metrics={'train_runtime': 6004.2806, 'train_samples_per_second': 0.026, 'train_steps_per_second': 0.007, 'total_flos': 83090866176000.0, 'train_loss': 2.1928983415876115, 'epoch': 3.0})

In [13]:
# Folder that receives the fine-tuned artifacts.
output_dir = "output/trained_model"

# Write the model first, then the tokenizer files, into the same folder.
# The tokenizer call stays last so the notebook displays the paths it wrote.
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

('output/trained_model\\tokenizer_config.json',
 'output/trained_model\\special_tokens_map.json',
 'output/trained_model\\vocab.json',
 'output/trained_model\\merges.txt',
 'output/trained_model\\added_tokens.json')

In [20]:
# Greedy next-token prediction for a single prompt using the fine-tuned model.
input_text = "The primary objective to conducting this external heatlth and safety"

# The model has just been fine-tuned, which leaves it in training mode.
# Switch to evaluation mode so dropout is disabled and the prediction below
# is deterministic.
model.eval()

# Encode the prompt as a (1, sequence_length) tensor of token ids and place it
# on the same device as the model weights, so this cell also works when the
# model was trained on a GPU.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(input_ids=input_ids)
    # Keep only the scores for the position after the last prompt token.
    predictions = outputs.logits[:, -1, :]

# Pick the highest-scoring vocabulary entry and turn it back into text.
predicted_index = torch.argmax(predictions, dim=-1)
predicted_token = tokenizer.decode(predicted_index)

print("Predicted next word:", predicted_token)

Predicted next word:  assessment
