In [1]:
import os
os.environ["HF_HOME"] = "/media/storage/alif/huggingface"

In [2]:
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
from peft import PeftModel
from PIL import Image
from datasets import load_dataset
import io
import re
import torch
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from tqdm.auto import tqdm

# === Load ChartInstruct-LLaMA ===
def load_chartinstruct_model(CHARTINSTRUCT_BASE, CHARTINSTRUCT_ADAPTER):
    processor = AutoProcessor.from_pretrained(CHARTINSTRUCT_BASE)

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    base_model = LlavaForConditionalGeneration.from_pretrained(
        CHARTINSTRUCT_BASE,
        torch_dtype=torch.float16,
        quantization_config=quant_config,
    )

    model = PeftModel.from_pretrained(base_model, CHARTINSTRUCT_ADAPTER)
    model.to("cuda")
    return model, processor

# === Generate Descriptions ===
def generate_chartinstruct(model, processor, image, prompt):
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

    prompt_len = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=256)
    response = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    return prune_sentence(response)

# ==== Compute Metrics ====
def extract_numbers(text):
    """Ambil semua angka (integer dan desimal) dari teks sebagai list of float"""
    numbers = re.findall(r'\d+\.?\d*', text)
    return list(map(float, numbers))

def compute_relaxed_accuracy(gt_text, pred_text, tolerance=0.05):
    """Bandingkan angka-angka dari gt dan pred dengan toleransi error"""
    gt_numbers = extract_numbers(gt_text)
    pred_numbers = extract_numbers(pred_text)

    # Handle case tidak ada angka sama sekali
    if len(gt_numbers) == 0 and len(pred_numbers) == 0:
        return 1.0  # Anggap benar jika tidak ada angka di keduanya
    if len(gt_numbers) != len(pred_numbers):
        return 0.0  # Jumlah angka berbeda → salah

    correct = 0
    for gt, pred in zip(gt_numbers, pred_numbers):
        denominator = max(gt, 1)  # Hindari division by zero
        if abs(gt - pred) / denominator <= tolerance:
            correct += 1

    return correct / len(gt_numbers)

def calculate_corpus_metrics(all_answers, all_preds):
    """Menghitung metrik tingkat korpus."""
    # Preprocessing teks
    all_answers_lower = [a.lower().strip() for a in all_answers]
    all_preds_lower = [p.lower().strip() for p in all_preds]
    
    bleu_score = 0.0
    rouge_l_f = 0.0
    cider_score = 0.0
    
    # Hitung BLEU
    try:
        bleu_score = corpus_bleu(
            [[ref.split()] for ref in all_answers_lower],
            [pred.split() for pred in all_preds_lower],
            smoothing_function=SmoothingFunction().method4
        )
    except Exception as e:
        print(f"Error BLEU: {str(e)}")
    
    # Hitung ROUGE-L
    try:
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(a, p)['rougeL'] for a, p in zip(all_answers, all_preds)]
        rouge_l_f = sum([s.fmeasure for s in rouge_scores]) / len(rouge_scores)
    except Exception as e:
        print(f"Error ROUGE: {str(e)}")
    
    # Hitung CIDEr
    try:
        cider = Cider()
        cider_score, _ = cider.compute_score(
            {i: [a] for i, a in enumerate(all_answers_lower)},
            {i: [p] for i, p in enumerate(all_preds_lower)}
        )
    except Exception as e:
        print(f"Error CIDEr: {str(e)}")
    
    return bleu_score, rouge_l_f, cider_score

def prune_sentence(text):
    """
    Mengambil kalimat utuh dari awal sampai titik terakhir.
    Jika tidak ada titik, ambil sampai koma terakhir.
    Jika tidak ada titik atau koma, ambil seluruh teks.
    Hapus koma jika ada di akhir hasil.
    """
    matches = list(re.finditer(r'\.(?!\d)', text))
    if matches:
        last_valid = matches[-1].start()
        return text[:last_valid + 1].strip()
    
    last_comma = text.rfind(",")
    if last_comma != -1:
        return text[:last_comma].strip()
    
    return text.strip()

def evaluate_model_with_metrics(dataset, model, processor, device="cuda", output_file="predictions.csv"):
    results = []
    all_answers = []
    all_preds = []
    total_numbers = 0
    correct_numbers = 0

    for idx in tqdm(range(len(dataset["test"])), desc="Evaluasi Berjalan..."):
        sample = dataset["test"][idx]

        # Load gambar
        image = Image.open(io.BytesIO(sample["image"])).convert("RGB")

        # Buat prompt
        prompt = f"<image>\nQuestion: {sample['query']} Answer:"

        # Generate jawaban
        pred_text = generate_chartinstruct(model, processor, image, prompt)
        
        # Ground truth
        ans_ref = sample["label"]
        ans_pred = pred_text

        # Hitung relaxed accuracy
        relaxed_acc = compute_relaxed_accuracy(ans_ref, ans_pred)

        # Simpan hasil
        results.append({
            "ID": sample["imgname"],
            "ans_ref": ans_ref,
            "ans_pred": ans_pred,
            "relaxed_accuracy": relaxed_acc
        })

        all_answers.append(ans_ref)
        all_preds.append(ans_pred)

        # Statistik jumlah angka
        gt_nums = extract_numbers(ans_ref)
        pred_nums = extract_numbers(ans_pred)
        total_numbers += len(gt_nums)
        if len(gt_nums) == len(pred_nums):
            correct_numbers += sum(1 for g,p in zip(gt_nums,pred_nums) if abs(g-p)/max(g,1) <= 0.05)

    # Hitung metrik korpus
    bleu_score, rouge_l_f, cider_score = calculate_corpus_metrics(all_answers, all_preds)

    # Buat DataFrame
    df = pd.DataFrame(results)
    df["BLEU"] = bleu_score
    df["ROUGE-L"] = rouge_l_f
    df["CIDEr"] = cider_score

    # Simpan ke CSV
    df.to_csv(output_file, index=False)
    print(f"\nHasil evaluasi disimpan ke: {output_file}")

    # Print ringkasan
    print("\n=== Ringkasan Evaluasi ===")
    print(f"Relaxed Accuracy (per sampel): {df['relaxed_accuracy'].mean():.4f}")
    if total_numbers > 0:
        print(f"Relaxed Accuracy (per angka): {correct_numbers / total_numbers:.4f}")
    print(f"BLEU: {bleu_score:.4f}")
    print(f"ROUGE-L: {rouge_l_f:.4f}")
    print(f"CIDEr: {cider_score:.4f}")

    return df

## DEFINE 

In [None]:
# Load dataset
dataset = load_dataset("akunskripsiapillv1/indochart-v2-dataset")

# Define model and adapter paths
base_model_path = "ahmed-masry/ChartInstruct-LLama2"
adapter_path = "akunskripsiapillv1/finetuned-chartinstruct-llama-v2"

# Load model and processor
model, processor = load_chartinstruct_model(base_model_path, adapter_path)

## SAMPEL

In [6]:
def evaluate_sample_predictions(dataset_samples, model, processor, device="cuda"):
    results = []

    for idx in range(len(dataset_samples)):
        sample = dataset_samples[idx]

        # Load gambar
        image = Image.open(io.BytesIO(sample["image"])).convert("RGB")

        # Buat prompt
        prompt = f"<image>\nQuestion: {sample['query']} Answer:"

        # Generate jawaban
        pred_text = generate_chartinstruct(model, processor, image, prompt)

        # Simpan hasil
        result = {
            "ID": sample["imgname"],
            "Query": sample["query"],
            "Ground Truth": sample["label"],
            "Prediction": pred_text,
        }

        results.append(result)
        print(f"\n--- Sample {idx+1} ---")
        print(f"ID: {result['ID']}")
        print(f"Query: {result['Query']}")
        print(f"Ground Truth: {result['Ground Truth']}")
        print(f"Prediction: {result['Prediction']}\n")

    return pd.DataFrame(results)

In [7]:
# Ambil 3 sampel pertama dari dataset test
test_samples = dataset["test"].select(range(3))

# Jalankan evaluasi pada 3 sampel
df_preview = evaluate_sample_predictions(test_samples, model, processor, device="cuda")

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.



--- Sample 1 ---
ID: 27355.png
Query: Buatkan deskripsi dari grafik berikut ini secara lengkap dan informatif
Ground Truth: Statistik ini membandingkan jarak rata-rata yang ditempuh per orang setiap tahun untuk tujuan bisnis di Inggris pada tahun 2018, dengan moda transportasi. Perjalanan Surface Rail berada di posisi kedua, dengan rata-rata 120 mil yang ditempuh per orang per tahun untuk tujuan bisnis.
Prediction: Statistik ini menunjukkan jangkauan tahunan rata-rata dari perangkat yang digunakan untuk tujuan bisnis di Inggris pada tahun 2018, dengan modus transportasi. Pada tahun 2018, jangkauan rata-rata perangkat yang digunakan untuk tujuan bisnis di Inggris adalah 375 mil per orang per tahun. Ini adalah jangkauan tertinggi untuk perangkat yang digunakan untuk tujuan bisnis di Inggris. Surface Rail adalah perangkat yang digunakan untuk tujuan bisnis dengan jangkauan rata-rata tertinggi kedua, dengan 120 mil per orang per tahun.


--- Sample 2 ---
ID: 17750.png
Query: Buatkan deskr

## IndoChart Dataset

In [8]:
# Jalankan evaluasi
df_results = evaluate_model_with_metrics(
    dataset=dataset,
    model=model,
    processor=processor,
    output_file="eval_chartinstruct_results.csv"
)

Evaluasi Berjalan...:   0%|          | 0/3678 [00:00<?, ?it/s]


Hasil evaluasi disimpan ke: eval_chartinstruct_results.csv

=== Ringkasan Evaluasi ===
Relaxed Accuracy (per sampel): 0.1941
Relaxed Accuracy (per angka): 0.2479
BLEU: 0.1849
ROUGE-L: 0.4076
CIDEr: 0.5713


## BPS Dataset

In [3]:
# Load dataset
dataset = load_dataset("akunskripsiapillv1/indochart-v2-dataset")

# Filter hanya data BPS untuk evaluasi
def filter_bps(example):
    return example["source"] == "bps"

# Terapkan filter hanya ke split 'test'
test_bps = dataset["test"].filter(filter_bps)

# Define model and adapter paths
base_model_path = "ahmed-masry/ChartInstruct-LLama2"
adapter_path = "akunskripsiapillv1/finetuned-chartinstruct-llama2-bps-v2"

# Load model dan processor
model, processor = load_chartinstruct_model(base_model_path, adapter_path)

# Jalankan evaluasi
df_results = evaluate_model_with_metrics(
    dataset={"test": test_bps},
    model=model,
    processor=processor,
    output_file="eval_chartinstruct_bps_results.csv"
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluasi Berjalan...:   0%|          | 0/1178 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.



Hasil evaluasi disimpan ke: eval_chartinstruct_bps_results.csv

=== Ringkasan Evaluasi ===
Relaxed Accuracy (per sampel): 0.6156
Relaxed Accuracy (per angka): 0.5876
BLEU: 0.3750
ROUGE-L: 0.6466
CIDEr: 1.9627


## Statista Dataset

In [4]:
# Filter hanya data Statista untuk evaluasi
def filter_statista(example):
    return example["source"] == "statista"

# Terapkan filter hanya ke split 'test'
test_statista = dataset["test"].filter(filter_statista)

# Define model and adapter paths
base_model_path = "ahmed-masry/ChartInstruct-LLama2"
adapter_path = "akunskripsiapillv1/finetuned-chartinstruct-llama2-statista-v2"

# Load model dan processor
model, processor = load_chartinstruct_model(base_model_path, adapter_path)

# Jalankan evaluasi
df_results = evaluate_model_with_metrics(
    dataset={"test": test_statista},
    model=model,
    processor=processor,
    output_file="eval_chartinstruct_statista_results.csv"
)

Filter:   0%|          | 0/3678 [00:00<?, ? examples/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluasi Berjalan...:   0%|          | 0/2500 [00:00<?, ?it/s]


Hasil evaluasi disimpan ke: eval_chartinstruct_statista_results.csv

=== Ringkasan Evaluasi ===
Relaxed Accuracy (per sampel): 0.0466
Relaxed Accuracy (per angka): 0.0475
BLEU: 0.1218
ROUGE-L: 0.3042
CIDEr: 0.0392
