In [None]:
import os

import torch
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TFGPT2LMHeadModel,
    TFGPT2Model,
    Trainer,
    TrainingArguments,
)

In [None]:
# Locations of the three dataset splits
train_dir = "dataset/train"            # data the model is trained on
validation_dir = "dataset/validation"  # data the model is validated on
test_dir = "dataset/test"              # unseen data

# Destinations handed to TrainingArguments (output_dir / logging_dir)
OUTPUT_DIR = 'Results'
LOG_DIR = 'Logs'

# Loading Pre-Trained Model

In [None]:
# GPT-2 tokenizer. GPT-2 ships without a padding token, so reuse the
# end-of-sequence token; padding a batch fails without one.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Model handed to `Trainer` further down. `Trainer` drives PyTorch modules and
# needs the model to return a loss, so this has to be the PyTorch checkpoint
# with the language-modelling head (the bare TFGPT2Model is a TensorFlow model
# without a head).
model = GPT2LMHeadModel.from_pretrained('gpt2')

# TensorFlow LM-head model used by the interactive generation cell.
# NOTE(review): this is a separate copy of the pre-trained weights and is not
# touched by `trainer.train()` - confirm that chatting with the un-tuned model
# is intended.
model_lm = TFGPT2LMHeadModel.from_pretrained('gpt2')

# Training the model


## Setting up training arguments

In [None]:
training_args = TrainingArguments(
    # --- where artefacts are written ---
    output_dir=OUTPUT_DIR,            # checkpoints / saved model
    logging_dir=LOG_DIR,              # training logs
    # --- schedule: evaluate and checkpoint once per epoch ---
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,                 # emit a log entry every 10 steps
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- batch sizes, per device ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

## Initialising the trainer

In [None]:
def compute_metrics(eval_pred):
    """Compute cross-entropy loss and token accuracy for an evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (logits, or a tuple/list
            whose first element is the logits) and ``label_ids``.

    Returns:
        dict with float entries ``"loss"`` and ``"accuracy"``. Both are NaN
        when no label position is scored.

    For per-token language-model output (3-D logits whose leading two
    dimensions match the label array) the logits at position ``i`` are scored
    against the label at position ``i + 1``, because a causal LM predicts the
    next token. Positions labelled ``-100`` are excluded from both metrics.
    """
    logits = eval_pred.predictions
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    labels = eval_pred.label_ids

    logits_t = torch.as_tensor(logits, dtype=torch.float32)
    labels_t = torch.as_tensor(labels, dtype=torch.long)

    # Next-token alignment: drop the last prediction and the first label.
    if logits_t.dim() == 3 and tuple(labels_t.shape) == tuple(logits_t.shape[:2]):
        logits_t = logits_t[:, :-1, :]
        labels_t = labels_t[:, 1:]

    flat_logits = logits_t.reshape(-1, logits_t.shape[-1])
    flat_labels = labels_t.reshape(-1)

    # -100 marks positions (e.g. padding) that must not be scored.
    scored = flat_labels != -100
    if not bool(scored.any()):
        return {"loss": float("nan"), "accuracy": float("nan")}

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fn(flat_logits, flat_labels).item()

    predictions = flat_logits.argmax(dim=-1)
    accuracy = (predictions[scored] == flat_labels[scored]).float().mean().item()

    return {"loss": loss, "accuracy": accuracy}

In [None]:
# Load each split directory with the "text" loader, which yields one example
# per line in a single "text" column. Every directory is loaded as its own
# dataset under the split name "train", which is how the later cells index
# them (e.g. tokenized_eval_dataset["train"]).
# NOTE(review): assumes the text files sit directly inside each directory -
# confirm, and narrow the "*" pattern if other file types live there.
train_dataset = load_dataset("text", data_files={"train": f"{train_dir}/*"})
eval_dataset = load_dataset("text", data_files={"train": f"{validation_dir}/*"})
test_dataset = load_dataset("text", data_files={"train": f"{test_dir}/*"})

In [None]:
MAX_SEQ_LENGTH = 128  # tokens kept per line; longer lines are truncated (tunable)

# Padding below needs a pad token; GPT-2 does not define one by default.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of text lines for causal language modelling.

    Args:
        examples: batch mapping with a ``"text"`` list of strings, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (fixed-length ``input_ids`` / ``attention_mask``)
        plus ``labels``: a copy of ``input_ids`` with padded positions set to
        ``-100`` so they are ignored by the loss.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
    )
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


def _tokenize_dataset(dataset):
    """Drop blank lines, then tokenize and replace the raw "text" column."""
    # A blank line would become a row whose labels are all -100, i.e. a row
    # with nothing to score.
    non_blank = dataset.filter(lambda example: bool(example["text"].strip()))
    return non_blank.map(tokenize_function, batched=True, remove_columns=["text"])


tokenized_train_dataset = _tokenize_dataset(train_dataset)
tokenized_eval_dataset = _tokenize_dataset(eval_dataset)
tokenized_test_dataset = _tokenize_dataset(test_dataset)

In [None]:
trainer = Trainer(
    # what is trained, and how
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # metrics reported at each evaluation
    compute_metrics=compute_metrics,
    # data: both tokenized datasets are indexed by their "train" split
    train_dataset=tokenized_train_dataset["train"],
    eval_dataset=tokenized_eval_dataset["train"],
)

In [None]:
# Run training as configured by `training_args`. As the last expression in
# the cell, the value returned by `train()` is displayed as the cell output.
trainer.train()

## Saving the model

In [None]:
# Write the trainer's model and the tokenizer into the same directory.
trainer.save_model("fine-tuned-model")
tokenizer.save_pretrained("fine-tuned-model")

# Evaluation

In [None]:
# Interactive loop: read a prompt, let `model_lm` continue it, print the result.
print("Chatbot is ready! Type 'exit' to stop.")
while True:
    user_input = input("You: ")
    if user_input.strip().lower() == 'exit':
        break
    if not user_input.strip():
        # Nothing to continue from; an empty prompt yields no input tokens.
        continue

    encoded_input = tokenizer(user_input, return_tensors='tf')

    generated_ids = model_lm.generate(
        encoded_input['input_ids'],
        attention_mask=encoded_input['attention_mask'],
        max_new_tokens=200,  # upper bound on generated length; adjust as needed
        # Sampling must be switched on explicitly; without it decoding is
        # greedy and temperature / top_p have no effect.
        do_sample=True,
        temperature=0.9,     # higher = more varied output
        top_p=0.9,           # nucleus sampling threshold
        # top_k=40 is an alternative sampling control that can replace top_p
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
        eos_token_id=tokenizer.eos_token_id,  # stop once EOS is produced
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("gpt:", generated_text)

# Model for translation

In [None]:
# Load the T5-small sequence-to-sequence checkpoint and its tokenizer.
# NOTE(review): this rebinds `tokenizer` and `model`, replacing the GPT-2
# objects assigned earlier in the notebook. Earlier cells that read these
# names will pick up the T5 objects if re-run after this cell.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")