In [None]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Load the tokenizer, model, and data collator.
# MODEL_NAME is passed to both from_pretrained calls, so the tokenizer and the
# model weights are loaded from the same checkpoint identifier.
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# The collator is handed to Seq2SeqTrainer later in the notebook.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import pandas as pd
from datasets import Dataset

# Load the CSV file (path is relative to the notebook's working directory).
# Later cells read its "Prompt" and "Response" columns.
df = pd.read_csv('best.csv')
# print(df.iloc[0])  # uncomment to inspect the first row
# Wrap the DataFrame in a `datasets.Dataset` so it can be mapped and split below.
dataset = Dataset.from_pandas(df)

In [None]:
# Fetch the NLTK corpora used by process_text: the English stop-word list
# (read through stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import re


def process_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased, tokenized with NLTK, reduced to purely alphabetic
    (ASCII) tokens, stripped of English stop words (negation/contrast words
    are kept), and finally lemmatized with WordNet.

    Args:
        text: The raw string to clean. ``None`` is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    if text is None:
        return ""

    # Negation / contrast words that must survive stop-word removal.
    allowed_words = {"no", "not", "don't", "don", "but", "however", "never",
                     "wasn't", "shouldn't", "mustn't"}
    # A set gives O(1) membership tests; the original scanned a list per token.
    removable = set(stopwords.words('english')) - allowed_words

    # word_tokenize splits contractions ("don't" -> "do", "n't"). The "n't"
    # piece is not alphabetic, so the filter below would silently drop the
    # negation; map it to "not" so the meaning is preserved.
    tokens = ["not" if tok == "n't" else tok for tok in word_tokenize(text.lower())]

    kept = [
        tok for tok in tokens
        if re.fullmatch(r'[a-zA-Z]+', tok) and tok not in removable
    ]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

In [None]:
import nltk
# Fetch the Punkt tokenizer package; presumably what nltk.word_tokenize
# (process_text) and nltk.sent_tokenize (compute_metrics) rely on -- confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def tokenize_function(examples):
    """Clean and tokenize one batch of "Prompt" / "Response" pairs.

    Each column is passed through process_text and then encoded with the
    module-level tokenizer, truncated and padded to exactly 512 tokens.

    Args:
        examples: A batch mapping with "Prompt" and "Response" lists of text.

    Returns:
        A dict with "input_ids" and "attention_mask" from the prompts and
        "labels" holding the response token ids (pad ids are left as-is).
    """
    def _encode(column):
        # Clean every entry of the column, then encode the whole list at once.
        cleaned = [process_text(entry) for entry in examples[column]]
        return tokenizer(cleaned, truncation=True, padding="max_length", max_length=512)

    prompt_encoding = _encode("Prompt")
    response_encoding = _encode("Response")

    return {
        "input_ids": prompt_encoding["input_ids"],
        "attention_mask": prompt_encoding["attention_mask"],
        "labels": response_encoding["input_ids"],
    }

# Run the cleaning + tokenization over the whole dataset, batch by batch.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

# Longest encoded prompt / response. max_input_length is consumed by
# preprocess_function. Every row was padded to 512 tokens by
# tokenize_function, so both values come out as 512.
max_input_length = max(map(len, tokenized_dataset["input_ids"]))
max_label_length = max(map(len, tokenized_dataset["labels"]))

print(f"Max input length: {max_input_length}")
print(f"Max label length: {max_label_length}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset({
    features: ['Prompt', 'Response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})
Max input length: 512
Max label length: 512


In [None]:
from datasets import DatasetDict

def preprocess_function(examples):
    """Turn a batch of raw "Prompt" / "Response" text into model features.

    Prompts receive a task prefix and are encoded to ``max_input_length``
    tokens; responses are encoded as targets to 512 tokens. Both are truncated
    and padded to their maximum length.

    Args:
        examples: A batch mapping with "Prompt" and "Response" lists of text.

    Returns:
        The prompt encoding ("input_ids", "attention_mask") with an added
        "labels" entry in which padding positions are replaced by -100.
    """
    prefix = "extract structured details: "
    inputs = [prefix + prompt for prompt in examples["Prompt"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager; when only text_target is given the call returns the
    # target encoding itself.
    labels = tokenizer(text_target=examples["Response"], max_length=512, padding="max_length", truncation=True)

    # Replace the padding token id with -100 so padded positions are ignored
    # by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Fixed seed so the 75/25 split (and therefore every reported metric) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# train_test_split already returns a DatasetDict with "train" and "test" keys,
# so no re-wrapping is needed.
train_test_split = dataset.train_test_split(test_size=0.25, seed=SPLIT_SEED)
print(train_test_split)

# Apply the preprocessing to both splits.
tokenized_dataset = train_test_split.map(preprocess_function, batched=True)

print(f"Keys of tokenized train dataset: {list(tokenized_dataset['train'].features)}")
print(f"Keys of tokenized test dataset: {list(tokenized_dataset['test'].features)}")

DatasetDict({
    train: Dataset({
        features: ['Prompt', 'Response'],
        num_rows: 2250
    })
    test: Dataset({
        features: ['Prompt', 'Response'],
        num_rows: 750
    })
})


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]



Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Keys of tokenized train dataset: ['Prompt', 'Response', 'input_ids', 'attention_mask', 'labels']
Keys of tokenized test dataset: ['Prompt', 'Response', 'input_ids', 'attention_mask', 'labels']


In [None]:
# Make sure the Punkt tokenizer package is present (quiet=True suppresses the
# download log); compute_metrics calls nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# ROUGE metric object; compute_metrics calls metric.compute on decoded text.
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores for generated predictions against the labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            supplied by the trainer. ``predictions`` may also be a tuple, in
            which case its first element is used.

    Returns:
        The dict produced by ``metric.compute`` (ROUGE scores, with stemming).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored / padded positions and is not a valid token id, so it
    # must be mapped back to the pad token before decoding. This applies to
    # predictions as well as labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 10
NUM_EPOCHS = 3
# Upper bound on tokens generated per example during evaluation. It matches
# the 512-token label length used in preprocess_function. Without it,
# generation falls back to the model configuration's default max length, which
# can truncate predictions and depress ROUGE regardless of how well the model
# has learned the task.
GENERATION_MAX_LENGTH = 512

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    generation_max_length=GENERATION_MAX_LENGTH,
    push_to_hub=False,
)

In [None]:
# Wire the model, the tokenized splits, the collator and the ROUGE callback
# into a single trainer. Evaluation runs on the "test" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model; the returned TrainOutput summary is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0469,0.002599,0.363583,0.343372,0.363573,0.363536
2,0.0034,0.002215,0.363498,0.343245,0.363492,0.363455
3,0.0022,0.001732,0.363583,0.343372,0.363573,0.363536




TrainOutput(global_step=1689, training_loss=0.015758526843987392, metrics={'train_runtime': 2834.7253, 'train_samples_per_second': 2.381, 'train_steps_per_second': 0.596, 'total_flos': 4622112129024000.0, 'train_loss': 0.015758526843987392, 'epoch': 3.0})