<a href="https://colab.research.google.com/github/bartek717/projectx/blob/main/random_baseline_FLAN_T5_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, BertTokenizer, BertModel
from datasets import load_dataset, Dataset
import torch
import pandas as pd
from google.colab import files
import time
import math
import os
# Model choice: FLAN-T5 is published as flan-t5-small, -base, -large, -xl and -xxl. The training loop further down loads "google/flan-t5-small"; change the checkpoint name there to try another size (the smaller models are the practical choice for this use case).
import random


from google.colab import drive
drive.mount('/content/drive')


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py

In [None]:
# Download the SumPubMed dataset repository and switch into it, so the relative data paths used below resolve
!git clone https://github.com/vgupta123/sumpubmed.git
%cd sumpubmed/

Cloning into 'sumpubmed'...
remote: Enumerating objects: 130808, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 130808 (delta 22), reused 17 (delta 6), pack-reused 130764[K
Receiving objects: 100% (130808/130808), 346.82 MiB | 23.59 MiB/s, done.
Resolving deltas: 100% (38368/38368), done.
Updating files: 100% (130765/130765), done.
/content/sumpubmed


In [None]:
def process_data(text_dir, summary_dir):
    """Load paired full texts and summaries from two directories.

    Every file in ``text_dir`` whose name ends with ``.txt`` is paired with the
    file in ``summary_dir`` named ``"abst_" + <suffix>``, where ``<suffix>`` is
    the part of the text file name after its last underscore.

    Args:
        text_dir: Directory containing the full-text ``.txt`` files.
        summary_dir: Directory containing the matching ``abst_*`` files.

    Returns:
        A ``(texts, summaries)`` tuple of two equally long lists of strings,
        where ``summaries[i]`` belongs to ``texts[i]``. Entries are ordered by
        text file name.

    Raises:
        FileNotFoundError: If a text file has no matching summary file.
    """
    texts = []
    summaries = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the result
    # (and any positional split computed from it) identical on every run.
    for filename in sorted(os.listdir(text_dir)):
        if not filename.endswith(".txt"):
            continue

        summary_file = "abst_" + filename.split("_")[-1]

        with open(os.path.join(text_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()
        with open(os.path.join(summary_dir, summary_file), 'r', encoding='utf-8') as file:
            summary = file.read()

        # Append only once both files were read, so the lists never go out of step.
        texts.append(text)
        summaries.append(summary)

    return texts, summaries


In [None]:
# Load every text/summary pair. Both paths are relative to the current working
# directory, which an earlier cell switches to the cloned sumpubmed/ checkout.
texts_full, summaries_full = process_data('line_text/', 'shorter_abstract/')

In [None]:
def clean_text(text):
    """Return ``text`` with every ``<cit>`` and ``<dig>`` substring removed.

    The markers are deleted (not replaced by a space), ``<cit>`` first and
    ``<dig>`` second.
    """
    for marker in ("<cit>", "<dig>"):
        text = text.replace(marker, "")
    return text

In [None]:
# Strip the <cit>/<dig> markers from every full text and every summary.
cleaned_texts = list(map(clean_text, texts_full))
cleaned_summaries = list(map(clean_text, summaries_full))

In [None]:
# The first 85% of the cleaned corpus is the pool the training subsets are
# sampled from; the last 15% is held out.
split_index = int(0.85 * len(cleaned_texts))

texts = cleaned_texts[:split_index]
summaries = cleaned_summaries[:split_index]

# Take the held-out part from the full cleaned lists. Slicing `texts` /
# `summaries` here (as before) returned the tail of the training pool, so the
# "remaining" data overlapped with the data being sampled for training.
remaining_texts = cleaned_texts[split_index:]
remaining_summaries = cleaned_summaries[split_index:]

In [None]:
# BEGINNING OF RANDOM SELECTION

In [None]:
def getRandomSummaries(percentage, seed=None, source_texts=None, source_summaries=None):
    """Draw a random subset of text/summary pairs without replacement.

    Args:
        percentage: Fraction of the pool to select, between 0 and 1 inclusive.
            The number of selected pairs is ``int(percentage * len(pool))``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same subset is returned on every call; when ``None``,
            the module-level ``random`` generator is used.
        source_texts: Optional list of texts to sample from. Defaults to the
            notebook-level ``texts`` list.
        source_summaries: Optional list of summaries aligned with
            ``source_texts``. Defaults to the notebook-level ``summaries`` list.

    Returns:
        A ``(selected_texts, selected_summaries)`` tuple of two lists in which
        the element at each position comes from the same pool index.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 1]`` or the two pools
            differ in length.
    """
    pool_texts = texts if source_texts is None else source_texts
    pool_summaries = summaries if source_summaries is None else source_summaries

    if len(pool_texts) != len(pool_summaries):
        raise ValueError(
            "texts and summaries must have the same length, got "
            + str(len(pool_texts)) + " and " + str(len(pool_summaries))
        )
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1, got " + str(percentage))

    num_samples = int(percentage * len(pool_texts))
    rng = random if seed is None else random.Random(seed)
    # One index list drives both selections, which keeps each text with its summary.
    indices = rng.sample(range(len(pool_texts)), num_samples)
    selected_texts = [pool_texts[i] for i in indices]
    selected_summaries = [pool_summaries[i] for i in indices]

    return selected_texts, selected_summaries

In [None]:
# Fractions of the training pool to sample: 70, 50, 30, 10, 5, 3 and 1 percent.
percents = [0.70, 0.50, 0.30, 0.10, 0.05, 0.03, 0.01]

# Seed the module-level generator so the same subsets are drawn on every
# Restart & Run All; without it each run trains on different examples.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# selected_texts[k] / selected_summaries[k] hold the subset for percents[k].
selected_summaries = []
selected_texts = []

for fraction in percents:
    subset_texts, subset_summaries = getRandomSummaries(fraction)
    selected_summaries.append(subset_summaries)
    selected_texts.append(subset_texts)


print('original: ' + str(len(texts)))
for subset in selected_summaries:
    print('new: :' + str(len(subset)))


original: 27785
new: :19449
new: :13892
new: :8335
new: :2778
new: :1389
new: :833
new: :277


In [None]:
# END OF RANDOM SELECTION

In [None]:
def tokenize_function(examples, tok=None):
    """Tokenize a batch of texts and summaries for seq2seq training.

    Args:
        examples: Mapping with a ``"texts"`` and a ``"summaries"`` entry, each a
            list of strings (the batch format produced by ``Dataset.map`` with
            ``batched=True``).
        tok: Optional tokenizer to use. Defaults to the notebook-level
            ``tokenizer`` variable, so existing ``dataset.map(tokenize_function,
            batched=True)`` calls keep working unchanged.

    Returns:
        The tokenized inputs (truncated to 1024 tokens) with an added
        ``"labels"`` entry holding the tokenized summaries (truncated to 256
        tokens), in which padding positions are set to -100.
    """
    active_tokenizer = tokenizer if tok is None else tok

    # NOTE(review): padding=True pads to the longest sequence of the batch passed
    # in, so sequence lengths can differ between map batches — confirm the
    # collator used for training copes with that.
    tokenized_inputs = active_tokenizer(
        examples["texts"], padding=True, truncation=True, max_length=1024
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    tokenized_labels = active_tokenizer(
        text_target=examples["summaries"], padding=True, truncation=True, max_length=256
    )

    # -100 is the index the loss ignores. Leaving the pad token id in the labels
    # makes the model get trained (and evaluated) on predicting padding.
    pad_id = active_tokenizer.pad_token_id
    tokenized_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in tokenized_labels["input_ids"]
    ]
    return tokenized_inputs

In [None]:
def train_model(model, train_dataset, eval_dataset, tokenizer, percent):
    """Fine-tune ``model``, evaluate it, and save it with its tokenizer.

    Args:
        model: Model passed to ``Seq2SeqTrainer`` and saved with
            ``save_pretrained`` afterwards.
        train_dataset: Dataset passed to the trainer as ``train_dataset``.
        eval_dataset: Dataset passed to the trainer as ``eval_dataset``.
        tokenizer: Tokenizer passed to the trainer and saved next to the model.
        percent: Sampling fraction of this run; the last two characters of
            ``str(percent)`` become the suffix of the output directory name.

    Returns:
        A tuple ``(elapsed_time, evaluation_results, model_output_dir,
        tokenizer_output_dir)``: the wall-clock seconds spent in
        ``trainer.train()``, the dict returned by ``trainer.evaluate()``, and
        the two (identical) save directories.
    """
    # Imported locally under a private name so timing still works if the calling
    # cell has rebound the global name `time` to something else.
    import time as _time

    start = _time.time()
    # we need to standardize the training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        predict_with_generate=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    # Stop the clock before the final evaluation: only training time is reported.
    elapsed_time = _time.time() - start
    evaluation_results = trainer.evaluate()

    # NOTE(review): str(percent)[-2:] yields suffixes such as ".7" or "05" and is
    # not unique for arbitrary fractions (0.25 and 0.125 both give "25"). Kept
    # as is so previously saved model directories can still be found.
    model_output_dir = '/content/drive/My Drive/t5finerandom_' + str(percent)[-2:]
    tokenizer_output_dir = '/content/drive/My Drive/t5finerandom_' + str(percent)[-2:]
    model.save_pretrained(model_output_dir)
    tokenizer.save_pretrained(tokenizer_output_dir)

    # Previously this returned the `time` module instead of the measured duration.
    return elapsed_time, evaluation_results, model_output_dir, tokenizer_output_dir

In [None]:
# Fine-tune one fresh model per sampled subset, from the largest fraction to the smallest.
for percent, text, summary in zip(percents, selected_texts, selected_summaries):
    # Reload the pretrained checkpoint each time so no run starts from weights
    # that were already fine-tuned by a previous iteration.
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
    # Must stay a notebook-level variable named `tokenizer`: tokenize_function
    # reads it as a global when it is called through dataset.map below.
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    print('percent: ' + str(percent))
    print('summary length: ' + str(len(summary)))
    print('text length: ' + str(len(text)))
    dataset = Dataset.from_dict({"texts": text, "summaries": summary})
    total_examples = len(dataset)
    print('total examples: ' + str(total_examples))

    # First 95% of the subset for training, the rest for evaluation.
    train_size = int(0.95 * total_examples)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    train_dataset = tokenized_datasets.select(range(train_size))
    eval_dataset = tokenized_datasets.select(range(train_size, total_examples))

    # Do not unpack into a variable called `time`: that would shadow the `time`
    # module for every later cell and iteration.
    elapsed_time, evaluation_results, model_output_dir, tokenizer_output_dir = train_model(
        model, train_dataset, eval_dataset, tokenizer, percent
    )
    print(elapsed_time)
    print(evaluation_results)
    print(model_output_dir)
    print(tokenizer_output_dir)
    print('-'*100)

percent: 0.7
summary length: 19449
text length: 19449
total examples: 19449


Map:   0%|          | 0/19449 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.7686,2.525283
2,2.6828,2.486931
3,2.6531,2.481452


<module 'time' (built-in)>
{'eval_loss': 2.4814517498016357, 'eval_runtime': 15.6427, 'eval_samples_per_second': 62.202, 'eval_steps_per_second': 31.133, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_.7
/content/drive/My Drive/t5finerandom_.7
----------------------------------------------------------------------------------------------------
percent: 0.5
summary length: 13892
text length: 13892
total examples: 13892


Map:   0%|          | 0/13892 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.7834,2.55549
2,2.697,2.523752
3,2.683,2.516917


<module 'time' (built-in)>
{'eval_loss': 2.5169167518615723, 'eval_runtime': 11.2308, 'eval_samples_per_second': 61.883, 'eval_steps_per_second': 30.986, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_.5
/content/drive/My Drive/t5finerandom_.5
----------------------------------------------------------------------------------------------------
percent: 0.3
summary length: 8335
text length: 8335
total examples: 8335


Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.8226,2.662359
2,2.7855,2.635823
3,2.7383,2.6287


<module 'time' (built-in)>
{'eval_loss': 2.628699541091919, 'eval_runtime': 6.6566, 'eval_samples_per_second': 62.644, 'eval_steps_per_second': 31.397, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_.3
/content/drive/My Drive/t5finerandom_.3
----------------------------------------------------------------------------------------------------
percent: 0.1
summary length: 2778
text length: 2778
total examples: 2778


Map:   0%|          | 0/2778 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,3.0702,2.717663
2,2.8792,2.655594
3,2.8515,2.642456


<module 'time' (built-in)>
{'eval_loss': 2.642455816268921, 'eval_runtime': 2.2292, 'eval_samples_per_second': 62.355, 'eval_steps_per_second': 31.402, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_.1
/content/drive/My Drive/t5finerandom_.1
----------------------------------------------------------------------------------------------------
percent: 0.05
summary length: 1389
text length: 1389
total examples: 1389


Map:   0%|          | 0/1389 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,3.9013,2.962275
2,3.1988,2.806766
3,3.0189,2.769913


<module 'time' (built-in)>
{'eval_loss': 2.7699131965637207, 'eval_runtime': 1.1292, 'eval_samples_per_second': 61.992, 'eval_steps_per_second': 30.996, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_05
/content/drive/My Drive/t5finerandom_05
----------------------------------------------------------------------------------------------------
percent: 0.03
summary length: 833
text length: 833
total examples: 833


Map:   0%|          | 0/833 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.865191
2,3.838400,2.773341
3,3.226800,2.751098


<module 'time' (built-in)>
{'eval_loss': 2.7510976791381836, 'eval_runtime': 0.6768, 'eval_samples_per_second': 62.061, 'eval_steps_per_second': 31.031, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_03
/content/drive/My Drive/t5finerandom_03
----------------------------------------------------------------------------------------------------
percent: 0.01
summary length: 277
text length: 277
total examples: 277


Map:   0%|          | 0/277 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.212804
2,No log,3.796881
3,No log,3.635277


<module 'time' (built-in)>
{'eval_loss': 3.6352767944335938, 'eval_runtime': 0.2285, 'eval_samples_per_second': 61.267, 'eval_steps_per_second': 30.634, 'epoch': 3.0}
/content/drive/My Drive/t5finerandom_01
/content/drive/My Drive/t5finerandom_01
----------------------------------------------------------------------------------------------------
