# METRIK RELAXED ACCURACY

In [1]:
import pandas as pd
import re

def extract_numbers(text):
    """Ambil semua angka dari teks sebagai list of int"""
    return list(map(int, re.findall(r'\d+', text)))

def compute_relaxed_accuracy(gt_text, pred_text, tolerance=0.05):
    """Bandingkan angka-angka dari gt dan pred dengan toleransi error (bisa beda panjang)"""
    gt_numbers = extract_numbers(gt_text)
    pred_numbers = extract_numbers(pred_text)
    
    matched = 0
    used_indices = set()
    
    for gt in gt_numbers:
        for i, pred in enumerate(pred_numbers):
            if i in used_indices:
                continue
            if abs(gt - pred) / max(gt, 1) <= tolerance:
                matched += 1
                used_indices.add(i)
                break  # Stop after first match

    return matched / len(gt_numbers) if gt_numbers else 1.0  # Avoid division by zero

## UniChart

In [2]:
## UNICHART - INDOCHART

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.7829


In [3]:
## UNICHART - BPS

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_bps_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.9849


In [4]:
## UNICHART - STATISTA

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_statista_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.6971


## ChartInstruct-LLama

In [5]:
## CHARTINSTRUCT-LLAMA - INDOCHART

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.8160


In [6]:
## CHARTINSTRUCT-LLAMA - BPS

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_bps_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.9302


In [7]:
## CHARTINSTRUCT-LLAMA - STATISTA

# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_statista_results.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()

print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")

Relaxed Accuracy (rata-rata): 0.7767


## Skenario testing

In [8]:
# Load data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/skenario-testing-metadata.csv')

# Hitung relaxed accuracy untuk tiap baris
df['relaxed_acc'] = df.apply(lambda row: compute_relaxed_accuracy(row['ans_ref'], row['ans_pred']), axis=1)

# Hitung rata-rata relaxed accuracy
overall_score = df['relaxed_acc'].mean()
print(f"Relaxed Accuracy (rata-rata): {overall_score:.4f}")
print("\n" + "="*40 + "\n")

# Hitung rata-rata relaxed accuracy berdasarkan tipe_skenario dan model
grouped_scores = df.groupby(['tipe_skenario', 'model'])['relaxed_acc'].mean()

# Tampilkan hasil
print("Relaxed Accuracy per kombinasi tipe_skenario dan model:")
print(grouped_scores)

Relaxed Accuracy (rata-rata): 0.6463


Relaxed Accuracy per kombinasi tipe_skenario dan model:
tipe_skenario        model              
gangguan visual      ChartInstruct-Llama    0.745029
                     UniChart               0.704302
penghilangan elemen  ChartInstruct-Llama    0.777778
                     UniChart               0.730117
resolusi piksel      ChartInstruct-Llama    0.776566
                     UniChart               0.676065
tiga dimensi         ChartInstruct-Llama    0.642105
                     UniChart               0.416165
variasi gaya visual  ChartInstruct-Llama    0.494069
                     UniChart               0.500501
Name: relaxed_acc, dtype: float64


# METRIK NLP (BLEU, ROUGE, CIDER)

In [9]:
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q nltk
!pip install -q pycocoevalcap

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86

## UniChart

In [10]:
## UNICHART - INDOCHART

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

2025-07-24 05:15:42.098689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753334142.286914      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753334142.345411      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU Score: {'bleu': 0.3755374400522246, 'precisions': [0.7102885956442442, 0.49000229832222475, 0.3581289371397936, 0.27402985415027425], 'brevity_penalty': 0.8735457968939642, 'length_ratio': 0.8809061396918104, 'translation_length': 186420, 'reference_length': 211623}
ROUGE-1 F1: 0.6590
ROUGE-2 F1: 0.4668
ROUGE-L F1: 0.5780


PTBTokenizer tokenized 214977 tokens at 420064.69 tokens per second.
PTBTokenizer tokenized 189576 tokens at 416869.55 tokens per second.


CIDEr Score: 2.6533427374364056


In [11]:
## UNICHART - BPS

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

BLEU Score: {'bleu': 0.454347134435689, 'precisions': [0.7625135520990994, 0.5362629238647112, 0.3802847923526756, 0.2740418410041841], 'brevity_penalty': 1.0, 'length_ratio': 1.0288809860399364, 'translation_length': 93159, 'reference_length': 90544}
ROUGE-1 F1: 0.7799
ROUGE-2 F1: 0.5550
ROUGE-L F1: 0.6792


PTBTokenizer tokenized 92335 tokens at 330529.22 tokens per second.
PTBTokenizer tokenized 94603 tokens at 331759.42 tokens per second.


CIDEr Score: 3.3871307239884816


In [12]:
## UNICHART - STATISTA

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

BLEU Score: {'bleu': 0.3132090869761287, 'precisions': [0.6282691742580077, 0.42944472745797246, 0.329149188254507, 0.2685689846140349], 'brevity_penalty': 0.7970011268596408, 'length_ratio': 0.8150628928220418, 'translation_length': 98687, 'reference_length': 121079}
ROUGE-1 F1: 0.6035
ROUGE-2 F1: 0.4270
ROUGE-L F1: 0.5357


PTBTokenizer tokenized 122641 tokens at 401010.94 tokens per second.
PTBTokenizer tokenized 100393 tokens at 316868.79 tokens per second.


CIDEr Score: 2.311035222985867


## ChartInstruct-LLama

In [13]:
## CHARTINSTRUCT-LLAMA - INDOCHART

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

BLEU Score: {'bleu': 0.21542551949448152, 'precisions': [0.4042220484753714, 0.2518544727328667, 0.1718547240210738, 0.12309983032653485], 'brevity_penalty': 1.0, 'length_ratio': 1.553247992893022, 'translation_length': 328703, 'reference_length': 211623}
ROUGE-1 F1: 0.4760
ROUGE-2 F1: 0.3065
ROUGE-L F1: 0.4076


PTBTokenizer tokenized 214977 tokens at 464601.68 tokens per second.
Jul 24, 2025 5:16:41 AM edu.stanford.nlp.process.PTBLexer next
PTBTokenizer tokenized 330890 tokens at 529383.21 tokens per second.


CIDEr Score: 0.6556028260215988


In [14]:
## CHARTINSTRUCT-LLAMA - BPS

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

BLEU Score: {'bleu': 0.4338480662062727, 'precisions': [0.723745183934132, 0.5124469574697778, 0.36430357548793557, 0.2622120371487155], 'brevity_penalty': 1.0, 'length_ratio': 1.0462868881427814, 'translation_length': 94735, 'reference_length': 90544}
ROUGE-1 F1: 0.7389
ROUGE-2 F1: 0.5285
ROUGE-L F1: 0.6466


PTBTokenizer tokenized 92335 tokens at 343409.90 tokens per second.
PTBTokenizer tokenized 96376 tokens at 344548.06 tokens per second.


CIDEr Score: 2.196741710974816


In [15]:
## CHARTINSTRUCT-LLAMA - STATISTA

import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# 1. Baca data CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BLEU menggunakan Hugging Face Evaluate
bleu = evaluate.load("bleu")
bleu_inputs = {
    "predictions": preds,
    "references": [[ref] for ref in refs]
}
bleu_score = bleu.compute(**bleu_inputs)
print("BLEU Score:", bleu_score)

# 3. Hitung ROUGE menggunakan Hugging Face Evaluate
try:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_1_f = []
    rouge_2_f = []
    rouge_l_f = []

    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    print(f"ROUGE-1 F1: {rouge_1_avg:.4f}")
    print(f"ROUGE-2 F1: {rouge_2_avg:.4f}")
    print(f"ROUGE-L F1: {rouge_l_avg:.4f}")

except Exception as e:
    print(f"Error ROUGE: {str(e)}")

# 4. Hitung CIDEr menggunakan pycocoevalcap
refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}

tokenizer = PTBTokenizer()
refs_tok = tokenizer.tokenize(refs_dict)
preds_tok = tokenizer.tokenize(preds_dict)

try:
    cider_evaluator = Cider()
    cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    print("CIDEr Score:", cider_score)
except Exception as e:
    print(f"Error CIDEr: {str(e)}")

BLEU Score: {'bleu': 0.1366935489095235, 'precisions': [0.27724437258724816, 0.15384817085695057, 0.10486340318924722, 0.0780572148528078], 'brevity_penalty': 1.0, 'length_ratio': 1.9105129708702582, 'translation_length': 231323, 'reference_length': 121079}
ROUGE-1 F1: 0.3544
ROUGE-2 F1: 0.2085
ROUGE-L F1: 0.3042


PTBTokenizer tokenized 122641 tokens at 389863.41 tokens per second.
PTBTokenizer tokenized 232342 tokens at 558157.18 tokens per second.


CIDEr Score: 0.04393761635328175


## Skenario testing

In [16]:
import pandas as pd
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from rouge_score import rouge_scorer

# Load dataset
df = pd.read_csv('/kaggle/input/inference-unichart-output/skenario-testing-metadata.csv')

# Inisialisasi metric
bleu = evaluate.load("bleu")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
tokenizer = PTBTokenizer()
cider_evaluator = Cider()

# Simpan hasil skor per grup
results = []

# Grouping berdasarkan tipe_skenario
for (skenario, model), group_df in df.groupby(['tipe_skenario', 'model']):
    refs = group_df['ans_ref'].tolist()
    preds = group_df['ans_pred'].tolist()

    # --- BLEU ---
    bleu_inputs = {
        "predictions": preds,
        "references": [[ref] for ref in refs]
    }
    bleu_score = bleu.compute(**bleu_inputs)["bleu"]

    # --- ROUGE ---
    rouge_1_f, rouge_2_f, rouge_l_f = [], [], []
    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge_1_f.append(scores['rouge1'].fmeasure)
        rouge_2_f.append(scores['rouge2'].fmeasure)
        rouge_l_f.append(scores['rougeL'].fmeasure)

    rouge_1_avg = sum(rouge_1_f) / len(rouge_1_f)
    rouge_2_avg = sum(rouge_2_f) / len(rouge_2_f)
    rouge_l_avg = sum(rouge_l_f) / len(rouge_l_f)

    # --- CIDEr ---
    refs_dict = {i: [{"caption": ref}] for i, ref in enumerate(refs)}
    preds_dict = {i: [{"caption": pred}] for i, pred in enumerate(preds)}
    refs_tok = tokenizer.tokenize(refs_dict)
    preds_tok = tokenizer.tokenize(preds_dict)

    try:
        cider_score, _ = cider_evaluator.compute_score(refs_tok, preds_tok)
    except Exception as e:
        cider_score = None

    # Simpan skor untuk grup ini
    results.append({
        "tipe_skenario": skenario,
        "model": model,
        "bleu": bleu_score,
        "rouge1": rouge_1_avg,
        "rouge2": rouge_2_avg,
        "rougeL": rouge_l_avg,
        "cider": cider_score
    })

# Konversi hasil ke DataFrame
result_df = pd.DataFrame(results)

# Tampilkan hasil akhir
result_df

PTBTokenizer tokenized 911 tokens at 10734.02 tokens per second.
PTBTokenizer tokenized 877 tokens at 10199.21 tokens per second.
PTBTokenizer tokenized 911 tokens at 10011.43 tokens per second.
PTBTokenizer tokenized 896 tokens at 10258.16 tokens per second.
PTBTokenizer tokenized 911 tokens at 9830.40 tokens per second.
PTBTokenizer tokenized 924 tokens at 12517.00 tokens per second.
PTBTokenizer tokenized 911 tokens at 9941.28 tokens per second.
PTBTokenizer tokenized 835 tokens at 10105.00 tokens per second.
PTBTokenizer tokenized 911 tokens at 12388.58 tokens per second.
PTBTokenizer tokenized 932 tokens at 9816.63 tokens per second.
PTBTokenizer tokenized 911 tokens at 12264.86 tokens per second.
PTBTokenizer tokenized 831 tokens at 11052.75 tokens per second.
PTBTokenizer tokenized 911 tokens at 11847.21 tokens per second.
PTBTokenizer tokenized 869 tokens at 10311.87 tokens per second.
PTBTokenizer tokenized 911 tokens at 10710.31 tokens per second.
PTBTokenizer tokenized 902 t

Unnamed: 0,tipe_skenario,model,bleu,rouge1,rouge2,rougeL,cider
0,gangguan visual,ChartInstruct-Llama,0.33695,0.644093,0.425754,0.560338,1.270819
1,gangguan visual,UniChart,0.355512,0.670645,0.434221,0.555657,2.070795
2,penghilangan elemen,ChartInstruct-Llama,0.320552,0.670965,0.444011,0.54908,1.20661
3,penghilangan elemen,UniChart,0.287704,0.633523,0.382187,0.548619,1.330638
4,resolusi piksel,ChartInstruct-Llama,0.353171,0.67315,0.457066,0.58474,1.923758
5,resolusi piksel,UniChart,0.279903,0.598988,0.366087,0.488308,1.38949
6,tiga dimensi,ChartInstruct-Llama,0.367793,0.682635,0.476363,0.587745,1.600058
7,tiga dimensi,UniChart,0.33752,0.624972,0.399131,0.535167,1.333788
8,variasi gaya visual,ChartInstruct-Llama,0.277366,0.564613,0.329362,0.476728,1.032988
9,variasi gaya visual,UniChart,0.305122,0.594214,0.359155,0.507413,1.598982


# METRIK BERT SCORE

In [17]:
!pip install -q bert_score

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency r

## ChartInstruct-LLama

In [18]:
## CHARTINSTRUCT-LLAMA BPS
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/37 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/19 [00:00<?, ?it/s]

done in 12.07 seconds, 97.62 sentences/sec
BERTScore Precision: 0.8896
BERTScore Recall:    0.8831
BERTScore F1:        0.8862


In [19]:
## CHARTINSTRUCT-LLAMA STATISTA
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

calculating scores...
computing bert embedding.


  0%|          | 0/79 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/40 [00:00<?, ?it/s]

done in 21.98 seconds, 113.74 sentences/sec
BERTScore Precision: 0.7281
BERTScore Recall:    0.7665
BERTScore F1:        0.7459


In [20]:
## CHARTINSTRUCT-LLAMA INDOCHART
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

calculating scores...
computing bert embedding.


  0%|          | 0/115 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/58 [00:00<?, ?it/s]

done in 33.94 seconds, 108.37 sentences/sec
BERTScore Precision: 0.7774
BERTScore Recall:    0.8032
BERTScore F1:        0.7892


## UniChart

In [21]:
## UNICHART BPS
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

calculating scores...
computing bert embedding.


  0%|          | 0/36 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/19 [00:00<?, ?it/s]

done in 10.77 seconds, 109.40 sentences/sec
BERTScore Precision: 0.8993
BERTScore Recall:    0.8969
BERTScore F1:        0.8980


In [22]:
## UNICHART INDOCHART
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

calculating scores...
computing bert embedding.


  0%|          | 0/113 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/58 [00:00<?, ?it/s]

done in 25.27 seconds, 145.58 sentences/sec
BERTScore Precision: 0.8787
BERTScore Recall:    0.8493
BERTScore F1:        0.8632


In [23]:
## UNICHART STATISTA
from bert_score import score

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung BERTScore dengan model default
P, R, F1 = score(preds, refs, lang="id", verbose=True)

# 3. Print hasil rata-rata
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

calculating scores...
computing bert embedding.


  0%|          | 0/77 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/40 [00:00<?, ?it/s]

done in 13.11 seconds, 190.64 sentences/sec
BERTScore Precision: 0.8669
BERTScore Recall:    0.8298
BERTScore F1:        0.8473


## Skenario testing

In [24]:
from bert_score import score
import pandas as pd

# 1. Baca data
df = pd.read_csv('/kaggle/input/inference-unichart-output/skenario-testing-metadata.csv')

# 2. Simpan hasil per grup
results = []

# 3. Group by `tipe_skenario`
for (skenario, model), group_df in df.groupby(['tipe_skenario', 'model']):
    refs = group_df['ans_ref'].tolist()
    preds = group_df['ans_pred'].tolist()

    # 4. Hitung BERTScore untuk grup ini
    P, R, F1 = score(preds, refs, lang="id", verbose=False)

    # 5. Ambil rata-rata dan simpan
    results.append({
        "tipe_skenario": skenario,
        "model": model,
        "BERTScore_P": P.mean().item(),
        "BERTScore_R": R.mean().item(),
        "BERTScore_F1": F1.mean().item()
    })

# 6. Konversi ke DataFrame
result_df = pd.DataFrame(results)

# 7. Tampilkan hasil
result_df

Unnamed: 0,tipe_skenario,model,BERTScore_P,BERTScore_R,BERTScore_F1
0,gangguan visual,ChartInstruct-Llama,0.878481,0.868727,0.873448
1,gangguan visual,UniChart,0.873903,0.861031,0.867122
2,penghilangan elemen,ChartInstruct-Llama,0.87141,0.863295,0.867163
3,penghilangan elemen,UniChart,0.870913,0.846425,0.85827
4,resolusi piksel,ChartInstruct-Llama,0.891846,0.884907,0.888309
5,resolusi piksel,UniChart,0.84653,0.828523,0.837214
6,tiga dimensi,ChartInstruct-Llama,0.892801,0.87887,0.885668
7,tiga dimensi,UniChart,0.864576,0.851843,0.857994
8,variasi gaya visual,ChartInstruct-Llama,0.849192,0.844037,0.846452
9,variasi gaya visual,UniChart,0.854221,0.833375,0.843297


# METRIK INDOBERT SCORE

In [25]:
!pip install -q scipy

In [26]:
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from tqdm import tqdm
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fungsi untuk load model dan tokenizer IndoBERT (bisa ganti model lain juga)
def load_model(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)  # Pastikan model berjalan di device (GPU/CPU)
    return tokenizer, model

# Fungsi untuk mengubah teks jadi embeddings
def get_embeddings(texts, tokenizer, model, device, max_length=512):
    # Tokenisasi teks dan batasi panjangnya
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Pindahkan ke device
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Ambil mean dari semua token embeddings
    return embeddings

# Fungsi untuk menghitung cosine similarity antar dua embeddings
def cosine_sim(embeddings1, embeddings2):
    # Flatten tensor menjadi vektor 1-D
    embeddings1 = embeddings1.flatten().cpu().numpy()  # Convert tensor ke numpy dan flatten
    embeddings2 = embeddings2.flatten().cpu().numpy()  # Convert tensor ke numpy dan flatten
    sim = 1 - cosine(embeddings1, embeddings2)  # Cosine similarity, makin dekat 1 makin mirip
    return sim

# Fungsi untuk menghitung BERTScore secara manual (Precision, Recall, F1)
def bertscore_custom(refs, preds, model_name="indobenchmark/indobert-base-p1", batch_size=16):
    # Tentukan device (GPU atau CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer, model = load_model(model_name, device)
    precision, recall, f1 = [], [], []

    # Proses dalam batch dengan tqdm untuk progress bar
    for i in tqdm(range(0, len(refs), batch_size), desc="Processing Batches", ncols=100, unit="batch"):
        batch_refs = refs[i:i+batch_size]
        batch_preds = preds[i:i+batch_size]

        # Dapatkan embeddings untuk referensi dan prediksi dalam satu batch
        ref_embs = get_embeddings(batch_refs, tokenizer, model, device)
        pred_embs = get_embeddings(batch_preds, tokenizer, model, device)

        # Hitung cosine similarity untuk setiap pasangan referensi dan prediksi dalam batch
        for ref_emb, pred_emb in zip(ref_embs, pred_embs):
            sim = cosine_sim(ref_emb, pred_emb)
            precision.append(sim)  # Ganti logika precision sesuai keinginan (misal ambil threshold)
            recall.append(sim)     # Sama halnya dengan recall
            f1.append(2 * (precision[-1] * recall[-1]) / (precision[-1] + recall[-1]) if precision[-1] + recall[-1] > 0 else 0)

    return {
        "precision": sum(precision) / len(precision),
        "recall": sum(recall) / len(recall),
        "f1": sum(f1) / len(f1)
    }

## ChartInstruct-LLama

In [27]:
## CHARTINSTRUCT-LLAMA BPS

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Processing Batches:   0%|                                                 | 0/74 [00:00<?, ?batch/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Processing Batches: 100%|████████████████████████████████████████| 74/74 [00:09<00:00,  7.92batch/s]

INDOBERTScore Precision: 0.9339
INDOBERTScore Recall:    0.9339
INDOBERTScore F1:        0.9339





In [28]:
## CHARTINSTRUCT-LLAMA INDOCHART

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

Processing Batches: 100%|██████████████████████████████████████| 230/230 [00:28<00:00,  7.97batch/s]

INDOBERTScore Precision: 0.7860
INDOBERTScore Recall:    0.7860
INDOBERTScore F1:        0.7860





In [29]:
## CHARTINSTRUCT-LLAMA STATISTA

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_chartinstruct_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

Processing Batches: 100%|██████████████████████████████████████| 157/157 [00:18<00:00,  8.60batch/s]

INDOBERTScore Precision: 0.7318
INDOBERTScore Recall:    0.7318
INDOBERTScore F1:        0.7318





## UniChart

In [30]:
## UNICHART BPS

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_bps_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

Processing Batches: 100%|████████████████████████████████████████| 74/74 [00:09<00:00,  7.49batch/s]

INDOBERTScore Precision: 0.9574
INDOBERTScore Recall:    0.9574
INDOBERTScore F1:        0.9574





In [31]:
## UNICHART STATISTA

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_statista_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

Processing Batches: 100%|██████████████████████████████████████| 157/157 [00:15<00:00, 10.01batch/s]

INDOBERTScore Precision: 0.8395
INDOBERTScore Recall:    0.8395
INDOBERTScore F1:        0.8395





In [32]:
## UNICHART INDOCHART

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/eval_unichart_results.csv')
refs = df['ans_ref'].tolist()
preds = df['ans_pred'].tolist()

# 2. Hitung INDOBERTScore Custom
result = bertscore_custom(refs, preds)

# 3. Print hasil INDOBERTScore
print(f"INDOBERTScore Precision: {result['precision']:.4f}")
print(f"INDOBERTScore Recall:    {result['recall']:.4f}")
print(f"INDOBERTScore F1:        {result['f1']:.4f}")

Processing Batches: 100%|██████████████████████████████████████| 230/230 [00:28<00:00,  7.97batch/s]

INDOBERTScore Precision: 0.8790
INDOBERTScore Recall:    0.8790
INDOBERTScore F1:        0.8790





## Skenario testing

In [33]:
import pandas as pd

# 1. Baca data dari CSV
df = pd.read_csv('/kaggle/input/inference-unichart-output/skenario-testing-metadata.csv')

# 2. Inisialisasi list untuk hasil
results = []

# 3. Grouping dan evaluasi INDOBERTScore per grup
for (skenario, model), group_df in df.groupby(['tipe_skenario', 'model']):
    refs = group_df['ans_ref'].tolist()
    preds = group_df['ans_pred'].tolist()

    # 4. Hitung skor dengan fungsi custom kamu
    result = bertscore_custom(refs, preds)

    # 5. Simpan hasil ke dalam list
    results.append({
        "tipe_skenario": skenario,
        "model": model,
        "INDOBERTScore_P": result['precision'],
        "INDOBERTScore_R": result['recall'],
        "INDOBERTScore_F1": result['f1']
    })

# 6. Ubah menjadi DataFrame
result_df = pd.DataFrame(results)

# 7. Print hasil akhir
result_df

Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 11.40batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.48batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 11.40batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.83batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.50batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.49batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 11.25batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.86batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 11.36batch/s]
Processing Batches: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10.5

Unnamed: 0,tipe_skenario,model,INDOBERTScore_P,INDOBERTScore_R,INDOBERTScore_F1
0,gangguan visual,ChartInstruct-Llama,0.925775,0.925775,0.925775
1,gangguan visual,UniChart,0.92201,0.92201,0.92201
2,penghilangan elemen,ChartInstruct-Llama,0.925,0.925,0.925
3,penghilangan elemen,UniChart,0.905976,0.905976,0.905976
4,resolusi piksel,ChartInstruct-Llama,0.945946,0.945946,0.945946
5,resolusi piksel,UniChart,0.880908,0.880908,0.880908
6,tiga dimensi,ChartInstruct-Llama,0.936715,0.936715,0.936715
7,tiga dimensi,UniChart,0.907918,0.907918,0.907918
8,variasi gaya visual,ChartInstruct-Llama,0.906323,0.906323,0.906323
9,variasi gaya visual,UniChart,0.882331,0.882331,0.882331
