In [1]:
# Mount Google Drive at /content/drive (Colab-only module); later cells chdir into a folder below it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pytorch_lightning
!pip install sentencepiece
!pip install accelerate
!pip install peft
!pip install bitsandbytes



In [1]:
import os

# Current folder path, i.e. the working directory that relative paths are resolved against
print(os.getcwd())

/content


In [2]:
# Switch to the project folder on Drive; load_dataset() resolves its '../dataset/...' paths
# relative to this directory, and main() writes to "./output" below it.
os.chdir("/content/drive/MyDrive/chart2text/FLANT5")

In [3]:
# Print the working directory again to confirm the chdir above took effect.
print(os.getcwd())

/content/drive/MyDrive/chart2text/FLANT5


In [4]:
import os
import json
def _load_split(root_path, split):
    """Collect the captioned samples of one split ('train', 'valid' or 'test').

    Reads every ``*.json`` file in ``<root_path>/data/<split>`` and keeps those
    that contain a ``description_rewrite`` entry.
    """
    caps_dir = os.path.join(root_path, 'data', split)
    image_dir = os.path.join(root_path, 'image', split)
    caption_ext = '.json'

    samples = []
    # os.listdir() returns entries in arbitrary order; sort so the sample order
    # (and therefore any slice such as dataset['train'][:1]) is reproducible.
    for entry in sorted(os.listdir(caps_dir)):
        if not entry.endswith(caption_ext):
            continue
        # Strip only the trailing extension so stems containing dots
        # ("chart.v2.json" -> "chart.v2") still map to the right files.
        stem = entry[:-len(caption_ext)]
        cap_path = os.path.join(caps_dir, entry)
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(cap_path, encoding='utf-8') as f:
            json_object = json.load(f)
        if "description_rewrite" in json_object:
            samples.append({
                'image': os.path.join(image_dir, f'{stem}.png'),
                'text': json_object['description_rewrite'],
                'origin_text': cap_path,
            })
    return samples


def load_dataset(simulated_scatter_root_path='../dataset/simulated_scatter_dataset'):
    """Load the train/valid/test splits of the simulated scatter-chart dataset.

    Args:
        simulated_scatter_root_path: Dataset root containing ``data/<split>``
            (caption JSON files) and ``image/<split>`` (PNG charts).  The
            default is relative to the current working directory.

    Returns:
        dict with keys ``'train'``, ``'valid'`` and ``'test'``.  Each value is a
        list of dicts with:
            ``'image'``       - path of the PNG that shares the caption's stem
                                (the file's existence is not checked),
            ``'text'``        - the ``description_rewrite`` value of the JSON,
            ``'origin_text'`` - path of the caption JSON file itself.
        Caption files without a ``description_rewrite`` entry are skipped.

    Raises:
        FileNotFoundError: if a ``data/<split>`` directory does not exist.
        json.JSONDecodeError: if a caption file is not valid JSON.
    """
    return {
        split: _load_split(simulated_scatter_root_path, split)
        for split in ('train', 'valid', 'test')
    }

In [5]:
# Load all three splits once and print how many training samples were found.
dataset = load_dataset()
print(len(dataset['train']))

3504


In [6]:
import gc
import torch
# Run a garbage-collection pass, then ask PyTorch to release its cached, currently unused
# GPU memory before the training cell is (re-)run.
gc.collect()
torch.cuda.empty_cache()

In [13]:
import argparse
from torch.utils.data import DataLoader
from typing import List
from transformers import T5Tokenizer, T5ForConditionalGeneration

from data.summary_data import SummaryChartDataset
from model.summary_model import SummaryChartModule
from peft import PeftModel,PeftConfig
import pytorch_lightning as pl
from peft import LoraConfig, get_peft_model, TaskType
#from pytorch_lightning.loggers import WandbLogger
#from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint

class ArgDataClass:
    """Plain attribute container for the run settings built in ``main()``.

    Holds the input/output length limits, the train and validation batch
    sizes, and the worker count exactly as they were passed in.
    """

    def __init__(self, input_max_length, output_max_length, batch_size, valid_batch_size, num_workers):
        # Length limits.
        self.input_max_length, self.output_max_length = input_max_length, output_max_length
        # Batch sizes for training and validation.
        self.batch_size, self.valid_batch_size = batch_size, valid_batch_size
        # Worker count.
        self.num_workers = num_workers

def main():
    """Resume fine-tuning of the saved PEFT adapter on top of FLAN-T5-large.

    Steps:
      1. Load the ``google/flan-t5-large`` tokenizer and base model (bfloat16).
      2. Build the train/valid ``SummaryChartDataset`` objects from ``load_dataset()``.
      3. Attach the adapter stored in ``summary_chart-checkpoint-last`` in trainable mode.
      4. Train with a PyTorch Lightning ``Trainer``, checkpointing every 20000 steps.

    Relies on ``torch`` and ``load_dataset`` having been brought into scope by
    earlier cells of this notebook.
    """
    output_dir="./output"

    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large",torch_dtype=torch.bfloat16)

    dataset = load_dataset()
    args=ArgDataClass(input_max_length=2048+1024,output_max_length=512,batch_size=1,valid_batch_size=1,num_workers=1)
    train_dataset = SummaryChartDataset(dataset["train"], tokenizer=tokenizer, input_max_length=args.input_max_length,output_max_length=args.output_max_length,
                            split="train")

    val_dataset = SummaryChartDataset(dataset["valid"], tokenizer=tokenizer, input_max_length=args.input_max_length,output_max_length=args.output_max_length,
                            split="valid")

    # PeftModel.from_pretrained reads the adapter configuration saved next to the
    # adapter weights, so no LoraConfig / PeftConfig has to be built here (the
    # previous versions of both were never used).
    peft_model_id = "/content/drive/MyDrive/chart2text/FLANT5/output/summary_chart-checkpoint-last"
    # is_trainable=True keeps the adapter weights trainable so training can continue.
    peft_model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)

    config = {"max_steps":2000*1000,
            "check_val_every_n_epoch":1,
            "log_every_n_steps":1,
            "gradient_clip_val":1,
            # NOTE(review): floor-dividing by the float 1.2 yields a float
            # (e.g. 3504 // 1.2 == 2920.0); confirm SummaryChartModule expects that.
            "num_training_samples_per_epoch": len(dataset["train"])//1.2,
            "lr":5e-5,
            "train_batch_sizes": [args.batch_size],
            "val_batch_sizes": [args.valid_batch_size],
            "num_nodes": 1,
            "warmup_steps": 50,
            "result_path": output_dir,
            "verbose": True,
            }


    model_module = SummaryChartModule(config, tokenizer, peft_model, args, train_dataset, val_dataset)
    # save_top_k=-1 keeps every checkpoint that is written.
    checkpoint_callback = ModelCheckpoint(dirpath=output_dir, every_n_train_steps = 20000, save_last = False, save_top_k = -1)

    # Trainer settings are taken from `config` so the two cannot drift apart.
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        max_steps=config["max_steps"],
        check_val_every_n_epoch=config["check_val_every_n_epoch"],
        # NOTE(review): config["log_every_n_steps"] is 1 but the Trainer has always
        # been given 50; kept at 50 to preserve the logging cadence - confirm which is intended.
        log_every_n_steps=50,
        gradient_clip_val=config["gradient_clip_val"],
        num_nodes=config["num_nodes"],
        num_sanity_val_steps=0,
        default_root_dir=output_dir,
        callbacks=[checkpoint_callback],
    )

    trainer.fit(model_module)


if __name__ == '__main__':
    main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/drive/MyDrive/chart2text/FLANT5/output exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                  | Params
------------------------------------------------
0 | model | PeftModelForSeq2SeqLM | 792 M 
------------------------------------------------
9.4 M     Trainable params
783 M     Non-trainable param

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 0 Step: 3504 Validation Metric: 1000.0


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch: 1 Step: 7008 Validation Metric: 1000.0


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [6]:
import argparse
from torch.utils.data import DataLoader
from typing import List
from transformers import T5Tokenizer, T5ForConditionalGeneration

from data.summary_data import SummaryChartDataset
from model.summary_model import SummaryChartModule
from peft import PeftModel,PeftConfig
import pytorch_lightning as pl
from peft import LoraConfig, get_peft_model, TaskType
#from pytorch_lightning.loggers import WandbLogger
#from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint

class ArgDataClass:
    """Plain attribute container for the run settings built in ``main()``.

    Holds the input/output length limits, the train and validation batch
    sizes, and the worker count exactly as they were passed in.
    """

    def __init__(self, input_max_length, output_max_length, batch_size, valid_batch_size, num_workers):
        # Length limits.
        self.input_max_length, self.output_max_length = input_max_length, output_max_length
        # Batch sizes for training and validation.
        self.batch_size, self.valid_batch_size = batch_size, valid_batch_size
        # Worker count.
        self.num_workers = num_workers

def main():
    """Single-sample debug run on the saved ``summary_chart-checkpoint-last`` model.

    Loads tokenizer and model from the checkpoint directory (model in 8-bit with
    automatic device placement), builds datasets from only the first train and
    first valid sample, and starts a Lightning training run.

    Note: ``num_training_samples_per_epoch`` is still derived from the full
    train split even though only one sample is trained on.
    """
    result_dir = "./output"
    checkpoint_dir = "/content/drive/MyDrive/chart2text/FLANT5/output/summary_chart-checkpoint-last"

    tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint_dir,
        device_map="auto",
        load_in_8bit=True,
    )

    splits = load_dataset()
    args = ArgDataClass(
        input_max_length=2048 + 512,
        output_max_length=512,
        batch_size=1,
        valid_batch_size=1,
        num_workers=1,
    )

    # Only the first sample of each split is used in this run.
    train_dataset = SummaryChartDataset(
        splits["train"][:1],
        tokenizer=tokenizer,
        input_max_length=args.input_max_length,
        output_max_length=args.output_max_length,
        split="train",
    )
    val_dataset = SummaryChartDataset(
        splits["valid"][:1],
        tokenizer=tokenizer,
        input_max_length=args.input_max_length,
        output_max_length=args.output_max_length,
        split="valid",
    )

    # Settings handed to SummaryChartModule; key order matches the original literal.
    config = dict(
        max_steps=2000 * 1000,
        check_val_every_n_epoch=1,
        log_every_n_steps=1,
        gradient_clip_val=1,
        num_training_samples_per_epoch=len(splits["train"]) // 1.2,
        lr=5e-5,
        train_batch_sizes=[args.batch_size],
        val_batch_sizes=[args.valid_batch_size],
        num_nodes=1,
        warmup_steps=50,
        result_path=result_dir,
        verbose=True,
    )

    model_module = SummaryChartModule(config, tokenizer, model, args, train_dataset, val_dataset)

    # Write a checkpoint every 20000 training steps and keep all of them.
    checkpoint_callback = ModelCheckpoint(
        dirpath=result_dir,
        every_n_train_steps=20000,
        save_last=False,
        save_top_k=-1,
    )

    trainer_options = dict(
        accelerator="gpu",
        devices=1,
        max_steps=config['max_steps'],
        check_val_every_n_epoch=1,
        log_every_n_steps=50,
        gradient_clip_val=1,
        num_nodes=1,
        num_sanity_val_steps=0,
        default_root_dir=result_dir,
        callbacks=[checkpoint_callback],
    )
    trainer = pl.Trainer(**trainer_options)

    trainer.fit(model_module)


if __name__ == '__main__':
    main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/genera

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



{'x-axis-label': 'datetime', 'y-axis-label': "'prudence'", 'x-axis-features': [{'group': 'Khalid', 'color': '#ff7200', 'min': '2023-12-24 03:45:10.000', 'max': '2024-01-09 06:28:14.000'}, {'group': 'siege', 'color': '#ff4000', 'min': '2023-12-25 06:50:43.000', 'max': '2023-12-30 16:41:58.000'}, {'group': 'explain', 'color': '#bd0000', 'min': '2023-12-04 08:43:30.000', 'max': '2023-12-13 23:37:20.000'}, {'group': 'brown', 'color': '#3affbc', 'min': '2023-12-15 20:18:23.000', 'max': '2023-12-28 20:23:17.000'}, {'group': 'article', 'color': '#0000fa', 'min': '2023-12-07 11:12:54.000', 'max': '2023-12-09 19:01:03.000'}, {'group': 'outta', 'color': '#0000bd', 'min': '2023-12-11 01:56:26.000', 'max': '2024-01-01 11:44:20.000'}, {'group': 'impress', 'color': '#ff7200', 'min': '2023-12-20 04:31:05.000', 'max': '2023-12-26 02:25:29.000'}, {'group': 'пре', 'color': '#91ff66', 'min': '2023-12-08 00:46:19.000', 'max': '2023-12-24 11:20:28.000'}], 'y-axis-features': [{'group': 'Khalid', 'color': '#

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
# Display the value of `answers`.
# NOTE(review): `answers` is not defined in any visible cell of this notebook, so on a
# fresh kernel this cell raises NameError - confirm where it is supposed to come from.
answers

In [None]:
import numpy as np
from PIL import Image
from matplotlib import cm
# Convert the tensor `pixel_values` to a PIL image and show it as a visual sanity check.
# NOTE(review): `pixel_values` is not defined in any visible cell - confirm its origin.
myarray=pixel_values.detach().cpu().numpy()
# transpose(1, 2, 0) moves axis 0 to the end before building the image.
# assumes a (channels, height, width) array with values in [0, 1] - TODO confirm; values
# outside that range will not survive the uint8 cast correctly.
im = Image.fromarray((myarray.transpose(1, 2, 0) * 255).astype(np.uint8)).convert('RGB')
im.show()