In [1]:
import pandas as pd

In [2]:
# Inputs: the ground-truth annotations and the predictions that get scored
# against them (both are loaded with pd.read_csv further down).
ground_truth_csv_path = "./test_dataset/own_dataset_trans_v1.csv"
result_csv_path = "./result/result_v2.csv"

# Output: where the per-row scores are written, and later re-read for the report.
output_score_csv_path = "./result/eval_v2.csv"

In [3]:
# Predictions to evaluate; the scoring cell reads the columns
# "original_query", "video_id", "start" and "end" from this frame.
result_data = pd.read_csv(filepath_or_buffer=result_csv_path)

# Ground-truth annotations; the scoring cell reads the columns
# "index", "query", "video", "start" and "end" from this frame.
ground_truth_data = pd.read_csv(filepath_or_buffer=ground_truth_csv_path)

In [4]:
# Column layout of the per-row score file. Passing it explicitly to
# pd.DataFrame keeps the CSV header present even when there are no rows,
# so the saved file can always be read back with pd.read_csv.
SCORE_COLUMNS = ["index", "is_correct", "is_video_id_match"]


def score_predictions(ground_truth, results):
    """Score every ground-truth row against the first prediction for its query.

    A prediction is looked up by exact equality between the ground-truth
    "query" and the prediction's "original_query"; when several predictions
    share a query, the first one in ``results`` is used.

    Parameters
    ----------
    ground_truth : pd.DataFrame
        Must provide the columns "index", "query", "video", "start", "end".
    results : pd.DataFrame
        Must provide the columns "original_query", "video_id", "start", "end".

    Returns
    -------
    list of dict
        One record per ground-truth row with the keys "index",
        "is_correct" and "is_video_id_match". Each flag is 0 or 1:
        "is_video_id_match" is 1 when the predicted video id equals the
        ground-truth video id (after dropping one leading "-" from the
        ground-truth id); "is_correct" is 1 when, in addition, both the
        predicted start and the predicted end lie inside the closed
        ground-truth interval [start, end].
    """
    # Build the query -> first prediction lookup once instead of filtering
    # the whole results frame for every ground-truth row. Rows whose query is
    # missing are skipped: a missing value never compares equal to anything,
    # so such rows could not have matched under an equality filter either.
    prediction_columns = ["original_query", "video_id", "start", "end"]
    usable_predictions = results[prediction_columns].dropna(subset=["original_query"])
    first_prediction_by_query = {}
    for prediction in usable_predictions.itertuples(index=False):
        # setdefault keeps the earliest row for a repeated query.
        first_prediction_by_query.setdefault(prediction.original_query, prediction)

    records = []
    for _, row in ground_truth.iterrows():
        ground_truth_video_id = row["video"]
        ground_truth_start = row["start"]
        ground_truth_end = row["end"]

        # Ground-truth ids may carry a leading "-"; drop exactly one so the
        # id is compared without it.
        if str(ground_truth_video_id).startswith("-"):
            ground_truth_video_id = str(ground_truth_video_id)[1:]

        is_correct = 0
        is_video_id_match = 0

        prediction = first_prediction_by_query.get(row["query"])
        if prediction is not None and prediction.video_id == ground_truth_video_id:
            is_video_id_match = 1
            # Correct only if the predicted segment is fully contained in
            # the ground-truth segment (both bounds inclusive).
            if (
                ground_truth_start <= prediction.start <= ground_truth_end
                and ground_truth_start <= prediction.end <= ground_truth_end
            ):
                is_correct = 1

        records.append({
            "index": row["index"],
            "is_correct": is_correct,
            "is_video_id_match": is_video_id_match
        })

    return records


scoring_results = score_predictions(ground_truth_data, result_data)

scoring_results_df = pd.DataFrame(scoring_results, columns=SCORE_COLUMNS)

scoring_results_df.to_csv(output_score_csv_path, index=False, encoding="utf-8-sig")

---

In [5]:
# Re-read the saved per-row scores and the ground truth from disk.
scoring_results_df = pd.read_csv(output_score_csv_path)

ground_truth_data = pd.read_csv(ground_truth_csv_path)

# Keep one row per "index" on both sides so the inner merge below cannot
# multiply rows.
scoring_results_df = scoring_results_df.drop_duplicates(subset=["index"])
ground_truth_data = ground_truth_data.drop_duplicates(subset=["index"])

merged_data = ground_truth_data.merge(scoring_results_df, on="index", how="inner")

total_samples = len(merged_data)
video_id_match_count = merged_data["is_video_id_match"].sum()
correct_count = merged_data["is_correct"].sum()

# Guard the overall percentages against an empty merge (no scored rows),
# where dividing by the sample count would fail or yield NaN.
if total_samples:
    video_id_match_ratio = video_id_match_count / total_samples * 100
    correct_ratio = correct_count / total_samples * 100
else:
    video_id_match_ratio = 0.0
    correct_ratio = 0.0

print("=== 전체 결과 ===")
print(f"전체 데이터셋 크기: {total_samples}")
print(f"Video ID 맞춘 개수: {video_id_match_count} ({video_id_match_ratio:.2f}%)")
print(f"정답 맞춘 개수: {correct_count} ({correct_ratio:.2f}%)")

# Per-"type" breakdown: group size plus the number of video-id matches and
# fully correct predictions in each group.
type_stats = merged_data.groupby("type").agg(
    total=("type", "size"),
    video_id_match_count=("is_video_id_match", "sum"),
    correct_count=("is_correct", "sum")
).reset_index()

# Every group produced by groupby has at least one row, so "total" is never 0.
type_stats["video_id_match_ratio"] = (type_stats["video_id_match_count"] / type_stats["total"]) * 100
type_stats["correct_ratio"] = (type_stats["correct_count"] / type_stats["total"]) * 100

print("\n=== Type별 결과 ===")
# itertuples keeps each column's own dtype. iterrows would pack every row
# into a single Series and upcast the integer counts to float, which made
# the counts print as "44.0" instead of "44".
for stats in type_stats.itertuples(index=False):
    print(f"\nType: {stats.type}")
    print(f"  총 데이터: {stats.total}")
    print(f"  Video ID 맞춘 개수: {stats.video_id_match_count} ({stats.video_id_match_ratio:.2f}%)")
    print(f"  정답 맞춘 개수: {stats.correct_count} ({stats.correct_ratio:.2f}%)")

=== 전체 결과 ===
전체 데이터셋 크기: 175
Video ID 맞춘 개수: 93 (53.14%)
정답 맞춘 개수: 32 (18.29%)

=== Type별 결과 ===

Type: 1.0
  총 데이터: 44.0
  Video ID 맞춘 개수: 31.0 (70.45%)
  정답 맞춘 개수: 10.0 (22.73%)

Type: 2.0
  총 데이터: 44.0
  Video ID 맞춘 개수: 22.0 (50.00%)
  정답 맞춘 개수: 7.0 (15.91%)

Type: 3.0
  총 데이터: 43.0
  Video ID 맞춘 개수: 13.0 (30.23%)
  정답 맞춘 개수: 2.0 (4.65%)

Type: 5.0
  총 데이터: 44.0
  Video ID 맞춘 개수: 27.0 (61.36%)
  정답 맞춘 개수: 13.0 (29.55%)
