In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import json
import pandas as pd
import torch

# Path to the training JSON file; preprocess_data() reads its 'src' and 'tgt' fields.
data_file = "/content/train (2).json"

# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
model_name = "google-t5/t5-base"

def load_data_from_json(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and wrap the result in a DataFrame.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` built directly from the decoded JSON content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        records = json.load(handle)
    frame = pd.DataFrame(records)
    return frame

def preprocess_data(data_file):
    """Tokenize the source/target pairs of a JSON file into training examples.

    The file is read with ``load_data_from_json`` and must provide ``src`` and
    ``tgt`` columns. Both sides are tokenized with the tokenizer belonging to
    the module-level ``model_name``, padded to the longest sequence and
    truncated to 256 tokens.

    Args:
        data_file: Path to the JSON training file.

    Returns:
        A list with one dict per example, holding the ``input_ids``,
        ``attention_mask`` and ``labels`` tensors for that example.
    """
    data_df = load_data_from_json(data_file)
    source_texts = list(data_df['src'])
    target_texts = list(data_df['tgt'])

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    source_encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors="pt", max_length=256, add_special_tokens=True)
    target_encodings = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt", max_length=256, add_special_tokens=True)

    # Padding positions in the labels must not contribute to the loss.
    # The model's cross-entropy ignores index -100, so swap the pad id for it;
    # otherwise the model is also trained to emit pad tokens after every target.
    labels = target_encodings['input_ids'].clone()
    labels[labels == tokenizer.pad_token_id] = -100

    return [
        {
            "input_ids": source_encodings['input_ids'][i],
            "attention_mask": source_encodings['attention_mask'][i],
            "labels": labels[i],
        }
        for i in range(len(source_texts))
    ]

# Tokenize the training file into a list of per-example tensor dicts.
train_data = preprocess_data(data_file)

# Training configuration: 5 epochs, batch size 16 per device, outputs under ./output.
# save_steps=1000 presumably means a checkpoint every 1000 steps (relies on the
# library's default step-based save strategy — confirm for the installed version).
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=1000,
)

# Start from the pretrained checkpoint named by model_name.
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Only a training dataset is supplied; no evaluation dataset, tokenizer or
# custom data collator is passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
)

# Run the fine-tuning loop.
trainer.train()

print("Training completed! You can now use the fine-tuned model for generation.")

In [None]:
# Destination directory for the fine-tuned model, relative to the current working directory.
output_model_dir = "./fine_tuned_model"

# Relies on the `trainer` object created in the training cell, so that cell must
# have been run in this kernel session first.
trainer.save_model(output_model_dir)

print("Fine-tuned model saved successfully!")

In [None]:
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Path to the JSON file of evaluation examples; each example is read via its "src" and "tgt" keys.
test_data_file = "/content/valid (1).json"

def load_testing_data_from_json(file_path):
    """Decode the UTF-8 JSON file at ``file_path`` and return the parsed object.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content, unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Base checkpoint name; only the tokenizer is loaded from it here, because the
# fine-tuned directory is written by trainer.save_model() with no tokenizer
# attached to the Trainer.
model_name = "google-t5/t5-base"

# Load the fine-tuned weights from the same relative directory the save cell
# writes to ("./fine_tuned_model"). The previous absolute "/content/..." path
# only pointed at that directory when the working directory was /content.
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_testing_data(data):
    """Split evaluation examples into parallel lists of sources and references.

    Args:
        data: Iterable of mappings, each providing a ``"src"`` and a ``"tgt"`` entry.

    Returns:
        A ``(inputs, targets)`` tuple of lists, in the same order as ``data``.
    """
    sources = []
    for record in data:
        sources.append(record["src"])

    references = []
    for record in data:
        references.append(record["tgt"])

    return sources, references

# Load the evaluation examples and split them into model inputs and reference outputs.
test_data = load_testing_data_from_json(test_data_file)

inputs, targets = preprocess_testing_data(test_data)

# Generate one prediction per input with 4-beam search. Each input is encoded
# and generated on its own, so no padding is involved.
predictions = []
for input_text in inputs:
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)
    outputs = model.generate(input_ids, max_length=256, num_beams=4, early_stopping=True)
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(predicted_text)
    print(predicted_text)

# Exact-match scoring: every distinct target/prediction string is treated as a
# class label. Generated text routinely yields labels that occur only among the
# predictions or only among the targets, for which precision or recall is
# undefined. zero_division=0 scores those as 0 (the same value the default
# produces) without emitting an UndefinedMetricWarning for each one.
accuracy = accuracy_score(targets, predictions)
precision, recall, f1_score, _ = precision_recall_fscore_support(
    targets, predictions, average='weighted', zero_division=0
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1_score}")