In [1]:
!pip install -q nltk rouge-score pycocoevalcap

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import io
from PIL import Image
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from tqdm import tqdm

def compute_metric(gt, pred):
    """Return whether ``pred`` matches ``gt`` under relaxed accuracy (per sample).

    When both values can be converted with ``float()``, the prediction is
    accepted if its relative error with respect to ``gt`` is at most 5 %.
    Otherwise the two values are compared as case-insensitive strings.

    Args:
        gt: Reference answer (number or string).
        pred: Predicted answer (number or string).

    Returns:
        bool: ``True`` when the prediction is accepted, ``False`` otherwise.
    """
    try:
        gt_num = float(gt)
        pred_num = float(pred)
    except (TypeError, ValueError, OverflowError):
        # At least one side is not numeric: fall back to an exact,
        # case-insensitive comparison of the original (unconverted) values.
        return str(gt).lower() == str(pred).lower()

    if gt_num == 0:
        # Relative error is undefined for a zero reference; require the
        # prediction to be zero as well instead of relying on a caught
        # ZeroDivisionError.
        return pred_num == 0
    return abs(gt_num - pred_num) / abs(gt_num) <= 0.05

def calculate_corpus_metrics(all_answers, all_preds):
    """Compute corpus-level BLEU, ROUGE-L and CIDEr for predictions vs. references.

    Each metric is computed independently on a best-effort basis: if one of
    them raises, the error is printed and that metric is reported as ``0.0``
    so the remaining metrics are still returned.

    Args:
        all_answers: Sequence of reference answers, one per sample.
        all_preds: Sequence of predicted answers, aligned with ``all_answers``.

    Returns:
        tuple[float, float, float]: ``(bleu, rouge_l_f, cider)`` where
        ``rouge_l_f`` is the mean ROUGE-L F-measure over all pairs.
    """
    # Nothing to score: report zeros directly instead of letting every
    # metric fail and print a spurious error.
    if len(all_answers) == 0 or len(all_preds) == 0:
        return 0.0, 0.0, 0.0

    # Coerce to text so that a non-string answer (e.g. a numeric label) cannot
    # crash the scoring step after inference has already finished.
    answers = [str(a) for a in all_answers]
    preds = [str(p) for p in all_preds]

    # Text preprocessing
    all_answers_lower = [a.lower().strip() for a in answers]
    all_preds_lower = [p.lower().strip() for p in preds]

    # Default scores, kept when the corresponding metric fails
    bleu_score = 0.0
    rouge_l_f = 0.0
    cider_score = 0.0

    # BLEU over whitespace tokens, one reference per hypothesis
    try:
        bleu_score = corpus_bleu(
            [[ref.split()] for ref in all_answers_lower],
            [pred.split() for pred in all_preds_lower],
            smoothing_function=SmoothingFunction().method4
        )
    except Exception as e:
        print(f"Error BLEU: {str(e)}")

    # ROUGE-L: average F-measure over the (reference, prediction) pairs
    try:
        # NOTE(review): use_stemmer=True enables the Porter (English) stemmer;
        # the answers appear to be Indonesian - confirm this is intended.
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(a, p)['rougeL'] for a, p in zip(answers, preds)]
        rouge_l_f = sum(s.fmeasure for s in rouge_scores) / len(rouge_scores)
    except Exception as e:
        print(f"Error ROUGE: {str(e)}")

    # CIDEr expects {sample_id: [sentence, ...]} for references and predictions
    try:
        cider = Cider()
        cider_score, _ = cider.compute_score(
            {i: [a] for i, a in enumerate(all_answers_lower)},
            {i: [p] for i, p in enumerate(all_preds_lower)}
        )
    except Exception as e:
        print(f"Error CIDEr: {str(e)}")

    # Return plain Python floats regardless of what each library produced
    return float(bleu_score), float(rouge_l_f), float(cider_score)

def predict_and_save_to_csv(dataset, processor, model, device, output_filename="predictions.csv", num_samples=None):
    """Run the model over ``dataset['test']``, score the answers and write a CSV.

    For every sample the image and query are turned into a prompt, an answer
    is generated with beam search, and the per-sample relaxed accuracy is
    computed. Corpus-level BLEU / ROUGE-L / CIDEr are computed once at the end
    and repeated on every row of the output table.

    Args:
        dataset: Mapping with a ``'test'`` split whose samples provide the
            ``'image'`` (encoded image bytes), ``'query'``, ``'label'`` and
            ``'imgname'`` fields.
        processor: Processor exposing ``tokenizer``, ``batch_decode`` and the
            image preprocessing call.
        model: Encoder-decoder model used for ``generate``.
        device: Device the input tensors are moved to (e.g. ``"cuda"``).
        output_filename: Path of the CSV file to write.
        num_samples: If given, evaluate only the first ``num_samples`` samples
            (clamped to the size of the split).

    Returns:
        pandas.DataFrame: One row per sample with the columns ``ID``,
        ``ans_ref``, ``ans_pred``, ``relaxed_accuracy``, ``BLEU``, ``ROUGE-L``
        and ``CIDEr``.
    """
    answer_marker = "<s_answer>"
    result_columns = ["ID", "ans_ref", "ans_pred", "relaxed_accuracy"]

    results = []
    all_answers = []
    all_preds = []

    test_data = dataset['test']
    if num_samples is not None:
        # Clamp so that asking for more samples than the split holds does not raise.
        test_data = test_data.select(range(min(num_samples, len(test_data))))

    # Loop-invariant lookups
    tokenizer = processor.tokenizer
    max_length = model.decoder.config.max_position_embeddings

    # tqdm provides the progress bar
    for idx in tqdm(range(len(test_data)), desc="Memproses sampel"):
        sample = test_data[idx]

        # Decode the stored image bytes
        image = Image.open(io.BytesIO(sample['image'])).convert("RGB")

        # Task prompt: the model continues the text after the answer marker
        input_prompt = f"<opencqa> {sample['query']} <s_answer>"

        # Tokenize the prompt as-is (no extra special tokens)
        decoder_input_ids = tokenizer(
            input_prompt,
            add_special_tokens=False,
            return_tensors="pt"
        ).input_ids.to(device)

        # Preprocess the image into model inputs
        pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

        # Generate with beam search, forbidding the unknown token
        outputs = model.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=max_length,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
            num_beams=4,
            bad_words_ids=[[tokenizer.unk_token_id]],
            return_dict_in_generate=True,
        )

        # Decode and strip the EOS / padding tokens
        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")

        # The answer is the text right after the marker; if the marker is
        # unexpectedly missing, keep the whole decoded text instead of raising.
        parts = sequence.split(answer_marker)
        ans_pred = parts[1].strip() if len(parts) > 1 else sequence.strip()

        # Per-sample metric.
        # NOTE(review): compute_metric compares the full strings, so for
        # free-text answers it only scores 1 on a case-insensitive exact match.
        ans_ref = sample['label']
        relaxed_acc = int(compute_metric(ans_ref, ans_pred))

        results.append({
            "ID": sample["imgname"],
            "ans_ref": ans_ref,
            "ans_pred": ans_pred,
            "relaxed_accuracy": relaxed_acc
        })

        all_answers.append(ans_ref)
        all_preds.append(ans_pred)

    # Corpus-level metrics
    bleu, rouge, cider = calculate_corpus_metrics(all_answers, all_preds)

    # Explicit columns keep the table well-formed even when no sample was processed
    df = pd.DataFrame(results, columns=result_columns)

    # Repeat the corpus metrics on every row
    df['BLEU'] = bleu
    df['ROUGE-L'] = rouge
    df['CIDEr'] = cider

    # Save to CSV
    df.to_csv(output_filename, index=False)
    print(f"Hasil disimpan ke {output_filename}")

    # Print the summary; the mean is undefined (NaN) for an empty evaluation
    relaxed_mean = df['relaxed_accuracy'].mean() if len(df) else float("nan")
    print("\nRingkasan Metrik:")
    print(f"Relaxed Accuracy: {relaxed_mean:.4f}")
    print(f"BLEU: {bleu:.4f}")
    print(f"ROUGE-L: {rouge:.4f}")
    print(f"CIDEr: {cider:.4f}")

    return df

## Inference IndoChart

In [3]:
# Fetch every split of the dataset; the full test split is evaluated below
dataset = load_dataset("akunskripsiapillv1/indochart-v2-dataset")

# The processor and the model weights come from the same checkpoint
model_name = "akunskripsiapillv1/finetuned-unichart-indochart-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)

# Run inference and scoring; the returned DataFrame is displayed as the cell output
predict_and_save_to_csv(
    dataset,
    processor,
    model,
    device,
    output_filename="eval_unichart_results.csv",
)

train.parquet:   0%|          | 0.00/2.19G [00:00<?, ?B/s]

val.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29423 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3678 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3678 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/809M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    960,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.47.0",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_f

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Memproses sampel:   0%|          | 0/3678 [00:00<?, ?it/s]Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. To test the new behavior, set `legacy=False`as a processor call argument.
Memproses sampel: 100%|██████████| 3678/3678 [54:17<00:00,  1.13it/s]


Hasil disimpan ke eval_unichart_results.csv

Ringkasan Metrik:
Relaxed Accuracy: 0.0242
BLEU: 0.3344
ROUGE-L: 0.5780
CIDEr: 2.4631


Unnamed: 0,ID,ans_ref,ans_pred,relaxed_accuracy,BLEU,ROUGE-L,CIDEr
0,27355.png,Statistik ini membandingkan jarak rata-rata ya...,Statistik ini menunjukkan jarak rata-rata yang...,0,0.334386,0.577987,2.463054
1,17750.png,Statistik menunjukkan tingkat kebebasan intern...,Statistik ini menunjukkan tingkat kebebasan in...,0,0.334386,0.577987,2.463054
2,T0048_donut_a.png,"Pada tahun 2021, Jumlah Pelanggan Sosial Perus...","Pada tahun 2021, jumlah pelanggan sosial Perus...",0,0.334386,0.577987,2.463054
3,13229.png,Statistik ini menunjukkan tingkat kelulusan pe...,Statistik ini menunjukkan tingkat kehilunan pe...,0,0.334386,0.577987,2.463054
4,1253.png,Statistik ini memberi peringkat tim Asosiasi B...,Grafik ini menunjukkan pengeluaran tim Nationa...,0,0.334386,0.577987,2.463054
...,...,...,...,...,...,...,...
3673,21552.png,Statistik ini menggambarkan kontribusi pariwis...,Statistik ini menggambarkan kontribusi pariwis...,0,0.334386,0.577987,2.463054
3674,25077.png,Grafik ini mengungkapkan bagian dari orang-ora...,Menurut survei yang dilakukan di Amerika Serik...,0,0.334386,0.577987,2.463054
3675,9388.png,Statistik ini menunjukkan kerugian penjualan s...,Statistik ini menunjukkan kerugian penjualan d...,0,0.334386,0.577987,2.463054
3676,15925.png,"Pada kuartal kedua tahun 2020, 15,7 juta orang...",Statistik ini menunjukkan jumlah orang yang di...,0,0.334386,0.577987,2.463054


## Inference BPS Dataset

In [4]:
# Fetch every split of the dataset; only the test split is used below
dataset = load_dataset("akunskripsiapillv1/indochart-v2-dataset")

# The processor and the model weights come from the same checkpoint
model_name = "akunskripsiapillv1/finetuned-unichart-bps-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)


def filter_bps(example):
    """Return True for rows whose ``source`` field equals ``"bps"``."""
    return example["source"] == "bps"


# Restrict the evaluation to the BPS rows of the test split
test_bps = dataset["test"].filter(filter_bps)

# Run inference and scoring; the returned DataFrame is displayed as the cell output
predict_and_save_to_csv(
    {"test": test_bps},
    processor,
    model,
    device,
    output_filename="eval_unichart_bps_results.csv",
)

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/809M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    960,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.47.0",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_f

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Filter:   0%|          | 0/3678 [00:00<?, ? examples/s]

Memproses sampel: 100%|██████████| 1178/1178 [26:45<00:00,  1.36s/it]


Hasil disimpan ke eval_unichart_bps_results.csv

Ringkasan Metrik:
Relaxed Accuracy: 0.0000
BLEU: 0.3940
ROUGE-L: 0.6792
CIDEr: 2.9770


Unnamed: 0,ID,ans_ref,ans_pred,relaxed_accuracy,BLEU,ROUGE-L,CIDEr
0,T0048_donut_a.png,"Pada tahun 2021, Jumlah Pelanggan Sosial Perus...","Pada tahun 2021, jumlah pelanggan sosial Perus...",0,0.393987,0.679231,2.976972
1,T0587_column_c.png,"Di tahun 2023, tercatat 84 unit kendaraan bus ...","Pada tahun 2023, Jumlah Kendaraan Bus di Kecam...",0,0.393987,0.679231,2.976972
2,T0456_column_a.png,Tahun 2018 menunjukkan adanya 5 kelurahan deng...,"Pada tahun 2018, jumlah kelurahan yang memilik...",0,0.393987,0.679231,2.976972
3,T0591_pie_a.png,"Pada tahun 2018, Jumlah Kendaraan Truk di Keca...","Pada tahun 2018, jumlah truk di Kecamatan Kedu...",0,0.393987,0.679231,2.976972
4,T0305_pie_b.png,"Pada tahun 2020, Produksi Susu di Kecamatan Ke...",Tahun 2020 mencatat Produksi Susu di Kecamatan...,0,0.393987,0.679231,2.976972
...,...,...,...,...,...,...,...
1173,T0315_bar_a.png,"Di 2018, jumlah Sekolah Madrasah Aliyah (MA) d...","Pada tahun 2018, jumlah Sekolah Madrasah Aliya...",0,0.393987,0.679231,2.976972
1174,T0598_donut_b.png,"Pada tahun 2016, Jumlah Kendaraan Sepeda Motor...","Pada tahun 2016, jumlah sepeda motor di Kecama...",0,0.393987,0.679231,2.976972
1175,T1175_column_c.png,Tahun 2020 mencatat bahwa luas panen tanaman h...,"Pada tahun 2020, luas panen tanaman hias anggr...",0,0.393987,0.679231,2.976972
1176,T1188_bar_b.png,"Di tahun 2018, jumlah warga yang telah mendapa...","Pada tahun 2018, jumlah penduduk yang telah me...",0,0.393987,0.679231,2.976972


## Inference Statista Dataset

In [5]:
# Fetch every split of the dataset; only the test split is used below
dataset = load_dataset("akunskripsiapillv1/indochart-v2-dataset")

# The processor and the model weights come from the same checkpoint
model_name = "akunskripsiapillv1/finetuned-unichart-statista-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)


def filter_statista(example):
    """Return True for rows whose ``source`` field equals ``"statista"``."""
    return example["source"] == "statista"


# Restrict the evaluation to the Statista rows of the test split
test_statista = dataset["test"].filter(filter_statista)

# Run inference and scoring; the returned DataFrame is displayed as the cell output
predict_and_save_to_csv(
    {"test": test_statista},
    processor,
    model,
    device,
    output_filename="eval_unichart_statista_results.csv",
)

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/809M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    960,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.47.0",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_f

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Filter:   0%|          | 0/3678 [00:00<?, ? examples/s]

Memproses sampel: 100%|██████████| 2500/2500 [29:35<00:00,  1.41it/s]


Hasil disimpan ke eval_unichart_statista_results.csv

Ringkasan Metrik:
Relaxed Accuracy: 0.0348
BLEU: 0.2923
ROUGE-L: 0.5357
CIDEr: 2.2332


Unnamed: 0,ID,ans_ref,ans_pred,relaxed_accuracy,BLEU,ROUGE-L,CIDEr
0,27355.png,Statistik ini membandingkan jarak rata-rata ya...,Statistik ini menunjukkan jarak rata-rata yang...,0,0.292324,0.535724,2.233246
1,17750.png,Statistik menunjukkan tingkat kebebasan intern...,Statistik ini menunjukkan tingkat kebebasan in...,0,0.292324,0.535724,2.233246
2,13229.png,Statistik ini menunjukkan tingkat kelulusan pe...,Statistik ini menunjukkan tingkat keluarga pen...,0,0.292324,0.535724,2.233246
3,1253.png,Statistik ini memberi peringkat tim Asosiasi B...,Statistik ini menunjukkan penggemar Twitter ti...,0,0.292324,0.535724,2.233246
4,1746.png,Statistik menunjukkan jumlah kerusakan yang di...,Statistik ini menggambarkan jumlah kerusakan m...,0,0.292324,0.535724,2.233246
...,...,...,...,...,...,...,...
2495,21552.png,Statistik ini menggambarkan kontribusi pariwis...,Statistik ini menggambarkan kontribusi pariwis...,0,0.292324,0.535724,2.233246
2496,25077.png,Grafik ini mengungkapkan bagian dari orang-ora...,Statistik ini menunjukkan hasil survei yang di...,0,0.292324,0.535724,2.233246
2497,9388.png,Statistik ini menunjukkan kerugian penjualan s...,Statistik ini menunjukkan kerugian penjualan d...,0,0.292324,0.535724,2.233246
2498,15925.png,"Pada kuartal kedua tahun 2020, 15,7 juta orang...",Statistik ini menunjukkan jumlah orang yang di...,0,0.292324,0.535724,2.233246
