# 출처 및 주제 태깅

KMMLU, MMMLU, KLUE-MRC에서 가져온 학습 데이터를 식별해 `from`/`subject` 태그를 붙이는 정리된 노트북입니다.\
실행 전후로 `source_restoration.ipynb`를 건드릴 필요 없도록 전체 파이프라인을 한 번에 정돈했습니다.

*작업을 수행하고 나면 `problems` 컬럼이 question/answer/choices로 평탄화된 상태로 저장됩니다.

## 노트북 사용 가이드

- `../data/train.csv`를 로드해 `problems` 컬럼을 펼친 뒤 라벨링합니다.
- KMMLU(한국사)는 문단 기준 유사도 + 포함 검색을 활용합니다.
- MMMLU는 화이트리스트된 과목만 사용해 앵커 기반 주제 매핑을 수행합니다.
- KLUE-MRC는 허용된 뉴스 카테고리만 대상으로 앵커 매핑을 수행합니다.
- 모든 단계가 끝나면 `../data/train_source_labeled.csv`를 생성합니다(정상 실행 시에만 저장).


In [None]:
from pathlib import Path
import ast
import re

import pandas as pd
from datasets import load_dataset

# Locations of the input training data and of the labeled output file.
DATA_DIR = Path("..") / "data"
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
OUTPUT_PATH = DATA_DIR.joinpath("train_source_labeled.csv")


## 유틸 함수 모음
- 텍스트 정규화: 공백/특수문자 제거 후 비교 안정화
- 3-앵커 추출: 앞/중간/뒤 고정 길이 토막으로 매칭 강건성 확보
- 키 후보 생성 및 빈도 기반 주제 매핑: 오탐을 줄이기 위해 최소 길이·화이트리스트 적용


In [None]:
_ws = re.compile(r"\s+")
_keep = re.compile(r"[^0-9A-Za-z가-힣\s]")  # allowed: Hangul / Latin letters / digits / whitespace

def norm_text(x, remove_all_space: bool = True) -> str:
    '''Normalize a value into a string that is stable to compare.

    Every character other than Hangul syllables, ASCII letters, digits and
    whitespace is replaced by a space. Whitespace is then either removed
    entirely (``remove_all_space=True``) or collapsed to single spaces and
    trimmed (``remove_all_space=False``).

    Missing scalars (``None``, float NaN, ``pd.NA``, ``pd.NaT``, ...) yield
    ``""`` instead of being stringified to "NA"/"NaT"/"nan". Any other
    non-string value is converted with ``str()`` first.
    '''
    if not isinstance(x, str):
        # is_scalar guards pd.isna against list-likes, for which it would
        # return an array rather than a single bool.
        if x is None or x is pd.NA or (pd.api.types.is_scalar(x) and pd.isna(x)):
            return ""
        x = str(x)
    # _keep never matches whitespace, so whitespace handling can be done in
    # a single pass afterwards.
    x = _keep.sub(" ", x)
    if remove_all_space:
        return _ws.sub("", x)
    return _ws.sub(" ", x).strip()

def split_question_paragraph(text: str):
    '''Split text at its first "?" into a (question, paragraph) pair.

    The question keeps the question mark; both parts are stripped of
    surrounding whitespace. Text without a "?" is returned whole as the
    question with an empty paragraph, and a non-string input gives
    ("", "").
    '''
    if not isinstance(text, str):
        return "", ""
    question, mark, rest = text.strip().partition("?")
    if not mark:
        # No question mark at all: everything counts as the question.
        return question, ""
    return (question + mark).strip(), rest.strip()

def build_anchor_views(series: pd.Series, anchor_len: int):
    '''Cut every string into head / middle / tail anchors of fixed length.

    Returns three Series (head, mid, tail). ``mid`` carries the input index
    and "string" dtype; the middle anchor is centred on the string and its
    start is clamped to 0 for strings shorter than the anchor. Missing
    values are treated as empty strings.
    '''
    texts = series.astype("string").fillna("")
    half_anchor = anchor_len // 2

    def _middle(txt):
        # Start half an anchor before the centre, never before position 0.
        start = max(0, len(txt) // 2 - half_anchor)
        return txt[start:start + anchor_len]

    mid = pd.Series(
        [_middle(txt) for txt in texts.tolist()],
        index=texts.index,
        dtype="string",
    )
    head = texts.str.slice(0, anchor_len)
    tail = texts.str.slice(-anchor_len, None)
    return head, mid, tail

def make_keys(s: pd.Series, min_len: int = 2) -> pd.Series:
    '''Build the search-key candidates from a Series of strings.

    Missing values become empty strings, entries shorter than ``min_len``
    are dropped, duplicates are removed, and the survivors are ordered from
    longest to shortest.
    '''
    candidates = s.astype("string").fillna("")
    long_enough = candidates.str.len() >= min_len
    unique_keys = candidates[long_enough].drop_duplicates()
    return unique_keys.sort_values(key=lambda col: col.str.len(), ascending=False)

def collect_found_keys(corpus: pd.Series, keys: pd.Series, chunk_size: int = 3000) -> set:
    '''Return the subset of ``keys`` that literally occurs in ``corpus``.

    Keys are regex-escaped and searched in batches of ``chunk_size`` so that
    each alternation pattern stays a manageable size. Missing corpus
    entries are treated as empty strings.
    '''
    texts = corpus.astype("string").fillna("")
    key_list = keys.astype("string").fillna("").tolist()
    found = set()
    for start in range(0, len(key_list), chunk_size):
        batch = key_list[start:start + chunk_size]
        pattern = re.compile("|".join(re.escape(key) for key in batch))
        hits_per_row = texts.str.findall(pattern)
        found.update(key for hits in hits_per_row if hits for key in hits)
    return found

def build_top_subject_map(df: pd.DataFrame, keys: pd.Series, q_col: str, subj_col: str, chunk_size: int = 2000) -> dict:
    '''Map every key found in ``df[q_col]`` to its most frequent subject.

    Each occurrence of a key in a row's text counts once for that row's
    ``subj_col`` value. Keys are regex-escaped and searched in batches of
    ``chunk_size``. On a tie the subject that was counted first wins. Keys
    that never occur are absent from the result.
    '''
    texts = df[q_col].astype("string").fillna("")
    subjects = df[subj_col].astype("string").fillna("").tolist()
    key_list = keys.astype("string").fillna("").tolist()

    tally = {}  # key -> {subject: occurrence count}
    for start in range(0, len(key_list), chunk_size):
        batch = key_list[start:start + chunk_size]
        pattern = re.compile("|".join(re.escape(key) for key in batch))
        hits_per_row = texts.str.findall(pattern).tolist()
        for subject, hits in zip(subjects, hits_per_row):
            if not hits:
                continue
            for key in hits:
                per_subject = tally.setdefault(key, {})
                per_subject[subject] = per_subject.get(subject, 0) + 1
    # max() keeps the first maximal entry, i.e. the earliest-seen subject.
    return {key: max(per_subject, key=per_subject.get) for key, per_subject in tally.items()}

def propagate_labels(df: pd.DataFrame, norm_col: str = "para_norm") -> pd.DataFrame:
    '''Copy existing labels to unlabeled rows that share the same normalized text.

    For every non-empty value of ``norm_col`` that already has a "from"
    label, the first such row's "from"/"subject" values are written into
    rows with the same ``norm_col`` value whose "from"/"subject" is missing.

    Rows whose normalized text is empty or missing never take part, neither
    as a source nor as a target: an empty string is not evidence that two
    rows come from the same passage, so using it as a key would spread one
    label over every empty-text row.

    ``df`` is modified in place and also returned.
    '''
    # Plain bool mask of rows that have a usable (non-empty) key.
    has_key = df[norm_col].astype("string").fillna("").str.len().gt(0).astype(bool)

    labeled = (
        df.loc[df["from"].notna() & has_key, [norm_col, "from", "subject"]]
        .drop_duplicates(subset=[norm_col])  # first labeled row per text wins
        .set_index(norm_col)
    )
    for col in ["from", "subject"]:
        target = df[col].isna() & has_key
        df.loc[target, col] = df.loc[target, norm_col].map(labeled[col])
    return df


## 학습 데이터 로드 및 기본 전처리
- `problems` 컬럼을 안전하게 dict로 변환해 question/answer/choices로 분리합니다.
- 비교용 `para_norm`을 한 번 만들어 이후 단계에서 재사용합니다.


In [None]:
# Load the training set and flatten the serialized `problems` column.
train = pd.read_csv(TRAIN_PATH)
# `problems` holds Python-literal dicts; literal_eval parses them without
# executing arbitrary code.
parsed = train["problems"].map(ast.literal_eval)
train["question"] = parsed.map(lambda x: x["question"])
train["answer"] = parsed.map(lambda x: x["answer"])
train["choices"] = parsed.map(lambda x: x["choices"])
# Fill missing paragraphs BEFORE casting: astype(str) alone turns NaN into
# the literal text "nan", which hides missing rows from the check below and
# carries "nan" into `para_norm` and the exported file.
train["paragraph"] = train["paragraph"].fillna("").astype(str)

# Untouched reference copy plus the normalized paragraph used for matching.
train_origin = train.copy()
train_origin["para_norm"] = train_origin["paragraph"].map(lambda x: norm_text(x, remove_all_space=True))

# Working copy that receives the `from` / `subject` labels.
train_labeled = train_origin.copy()
train_labeled["from"] = pd.NA
train_labeled["subject"] = pd.NA

print("train missing paragraphs:", train_origin["paragraph"].eq("").sum())


## KMMLU (한국사) 태깅
1) KMMLU 전체(split 병합)를 question/paragraph로 분리 후 정규화합니다.
2) train 문단의 앞쪽 앵커/짧은 문장 키가 KMMLU 지문에 등장하는지 검색합니다.
3) train 문단이 KMMLU 지문/질문을 포함하는 경우까지 보완해 누락을 최소화합니다.


In [None]:
# 1) Load the dataset and preprocess it
ds_kmmlu = load_dataset("HAERAE-HUB/KMMLU", "Korean-History")
kmmlu_raw = pd.concat([ds_kmmlu[split].to_pandas() for split in ("train", "dev", "test")])

# Each KMMLU "question" field is split at its first "?" into the question
# proper and the text that follows it (used here as the paragraph).
split_pairs = [split_question_paragraph(text) for text in kmmlu_raw["question"]]
kmmlu_proc = pd.DataFrame(
    {
        "question": [question for question, _ in split_pairs],
        "paragraph": [paragraph for _, paragraph in split_pairs],
        "choices": kmmlu_raw[["A", "B", "C", "D"]].values.tolist(),
        "answer": kmmlu_raw["answer"].to_numpy(),
    }
)
kmmlu_proc["para_norm"] = kmmlu_proc["paragraph"].map(norm_text)


In [None]:
# 2) Anchor-based matching (train -> KMMLU)
ANCHOR_LEN = 50
LONG_MIN, SHORT_MIN, SHORT_MAX = 30, 8, 29

remain = train_labeled["from"].isna()
train_key = train_origin["para_norm"].astype("string").fillna("")
train_anchor = train_key.str.slice(0, ANCHOR_LEN)
key_len = train_key.str.len()

# Long paragraphs are searched by their leading anchor, short ones whole.
is_long = remain & (key_len >= LONG_MIN)
is_short = remain & key_len.between(SHORT_MIN, SHORT_MAX)
long_keys = make_keys(train_anchor[is_long], min_len=LONG_MIN)
short_keys = make_keys(train_key[is_short], min_len=SHORT_MIN)

kmmlu_corpus = kmmlu_proc["para_norm"]
found_long = collect_found_keys(kmmlu_corpus, long_keys, chunk_size=1000)
found_short = collect_found_keys(kmmlu_corpus, short_keys, chunk_size=1000)

anchor_hit = train_anchor.isin(found_long)
whole_hit = train_key.isin(found_short)
m = remain & (anchor_hit | whole_hit)
train_labeled.loc[m, "from"] = "KMMLU"
train_labeled.loc[m, "subject"] = "korean_history"
train_labeled = propagate_labels(train_labeled)

print("KMMLU(anchor) newly labeled:", int(m.sum()))
print("Total labeled so far:", int(train_labeled["from"].notna().sum()))


In [None]:
# 3) Reverse containment search (KMMLU question/passage -> train paragraph)
remain_idx = train_labeled.index[train_labeled["from"].isna()]
train_text = train_origin.loc[remain_idx, "para_norm"].astype("string").fillna("")

MIN_KEY_LEN = 20
# make_keys yields the unique keys of at least MIN_KEY_LEN characters,
# longest first.
km_para_keys = make_keys(kmmlu_proc["para_norm"], min_len=MIN_KEY_LEN)

km_q_norm = kmmlu_proc["question"].map(norm_text).astype("string").fillna("")
km_q_keys = make_keys(km_q_norm, min_len=MIN_KEY_LEN)

# Passage keys are tried before question keys; the first key contained in
# the train paragraph is recorded for that row.
candidate_keys = km_para_keys.tolist() + km_q_keys.tolist()
matched = {}
for idx, txt in zip(remain_idx.tolist(), train_text.tolist()):
    hit = next((key for key in candidate_keys if key in txt), None)
    if hit is not None:
        matched[idx] = hit

km_mask = train_labeled.index.isin(list(matched))
train_labeled.loc[km_mask, "from"] = "KMMLU"
train_labeled.loc[km_mask, "subject"] = "korean_history"
train_labeled = propagate_labels(train_labeled)

print("KMMLU(containment) newly labeled:", int(km_mask.sum()))
print("Total labeled so far:", int(train_labeled["from"].notna().sum()))


## MMMLU 태깅 (화이트리스트 과목)
- KO_KR 설정(config)의 모든 split을 합쳐 사용합니다.
- 화이트리스트에 포함된 과목만 대상으로 앵커 기반 주제 매핑을 수행합니다.
- 질문 텍스트 기반 보정 단계까지 수행해 누락된 히스토리를 채웁니다.


In [None]:
# Only these MMMLU subjects are considered as label candidates.
WHITE_LIST = {
    "high_school_european_history",
    "high_school_us_history",
    "high_school_world_history",
    "high_school_macroeconomics",
    "high_school_microeconomics",
    "high_school_government_and_politics",
    "high_school_geography",
    "high_school_psychology",
}

ds_mmmlu = load_dataset("openai/MMMLU", "KO_KR")
mmmlu_raw = pd.concat([ds_mmmlu[split].to_pandas() for split in ds_mmmlu], ignore_index=True)

# Restrict to the whitelisted subjects, then add the normalized question text.
subject = mmmlu_raw["Subject"].astype("string").fillna("")
mmmlu_proc = mmmlu_raw.assign(Subject=subject)
mmmlu_proc = mmmlu_proc[mmmlu_proc["Subject"].isin(WHITE_LIST)].copy()
mmmlu_proc["q_norm"] = mmmlu_proc["Question"].map(norm_text).astype("string").fillna("")


In [None]:
# 1) Paragraph-based anchor mapping
LONG_MIN, SHORT_MIN, SHORT_MAX = 20, 8, 29

remain = train_labeled["from"].isna()
s = train_origin["para_norm"].astype("string").fillna("")
a_head, a_mid, a_tail = build_anchor_views(s, anchor_len=40)
s_len = s.str.len()

# Long paragraphs contribute their three anchors, short ones the whole text.
use_long = remain & (s_len >= LONG_MIN)
use_short = remain & s_len.between(SHORT_MIN, SHORT_MAX)
long_candidates = pd.concat(
    [view[use_long] for view in (a_head, a_mid, a_tail)],
    ignore_index=True,
)
long_keys = make_keys(long_candidates.drop_duplicates(), min_len=LONG_MIN)
short_keys = make_keys(s[use_short].drop_duplicates(), min_len=SHORT_MIN)

top_long = build_top_subject_map(mmmlu_proc, long_keys, q_col="q_norm", subj_col="Subject", chunk_size=1000)
top_short = build_top_subject_map(mmmlu_proc, short_keys, q_col="q_norm", subj_col="Subject", chunk_size=500)

# A row matches when any of its anchors, or its whole short text, was found.
any_anchor_hit = a_head.isin(top_long)
for view in (a_mid, a_tail):
    any_anchor_hit = any_anchor_hit | view.isin(top_long)
m_long = remain & any_anchor_hit
m_short = remain & s.isin(top_short)
m = m_long | m_short

# Subject precedence: head anchor, then middle, then tail, then short text.
subj_long = a_head.map(top_long)
for view in (a_mid, a_tail):
    subj_long = subj_long.fillna(view.map(top_long))
subj_short = s.map(top_short)

train_labeled.loc[m, "from"] = "MMMLU"
train_labeled.loc[m, "subject"] = subj_long.fillna(subj_short)
train_labeled = propagate_labels(train_labeled)

print("MMMLU(paragraph) newly labeled:", int(m.sum()))
print("Total labeled so far:", int(train_labeled["from"].notna().sum()))


In [None]:
# 2) Question-text based pass over the rows that are still unlabeled
LONG_MIN, SHORT_MIN, SHORT_MAX = 20, 8, 29

remain = train_labeled["from"].isna()
train_q_norm = train_origin["question"].map(norm_text).astype("string").fillna("")
train_labeled["question_norm"] = train_q_norm

q_head, q_mid, q_tail = build_anchor_views(train_q_norm, anchor_len=40)
q_len = train_q_norm.str.len()

# Long questions contribute their three anchors, short ones the whole text.
use_long = remain & (q_len >= LONG_MIN)
use_short = remain & q_len.between(SHORT_MIN, SHORT_MAX)
q_long_candidates = pd.concat(
    [view[use_long] for view in (q_head, q_mid, q_tail)],
    ignore_index=True,
)
q_long_keys = make_keys(q_long_candidates.drop_duplicates(), min_len=LONG_MIN)
q_short_keys = make_keys(train_q_norm[use_short].drop_duplicates(), min_len=SHORT_MIN)

top_q_long = build_top_subject_map(mmmlu_proc, q_long_keys, q_col="q_norm", subj_col="Subject", chunk_size=1000)
top_q_short = build_top_subject_map(mmmlu_proc, q_short_keys, q_col="q_norm", subj_col="Subject", chunk_size=500)

# A row matches when any of its anchors, or its whole short text, was found.
any_anchor_hit = q_head.isin(top_q_long)
for view in (q_mid, q_tail):
    any_anchor_hit = any_anchor_hit | view.isin(top_q_long)
m_long = remain & any_anchor_hit
m_short = remain & train_q_norm.isin(top_q_short)
m = m_long | m_short

# Subject precedence: head anchor, then middle, then tail, then short text.
subj_long = q_head.map(top_q_long)
for view in (q_mid, q_tail):
    subj_long = subj_long.fillna(view.map(top_q_long))
subj_short = train_q_norm.map(top_q_short)

train_labeled.loc[m, "from"] = "MMMLU"
train_labeled.loc[m, "subject"] = subj_long.fillna(subj_short)
train_labeled = propagate_labels(train_labeled)

print("MMMLU(question) newly labeled:", int(m.sum()))
print("Total labeled so far:", int(train_labeled["from"].notna().sum()))


## KLUE-MRC 태깅
- KLUE-MRC 전 split을 합친 뒤 허용 카테고리만 사용합니다.
- 문단 앵커 기반 매핑으로 뉴스 카테고리를 subject로 설정합니다.


In [None]:
# News categories that may be used as a subject.
ALLOWED = {"경제", "교육산업", "국제", "부동산", "사회", "생활", "책마을"}

ds_klue = load_dataset("klue", "mrc")
klue_raw = pd.concat([ds_klue[split].to_pandas() for split in ds_klue], ignore_index=True)

# Trim the category and turn placeholder spellings of "missing" into real NA
# so they cannot pass the ALLOWED filter.
category = klue_raw["news_category"].astype("string").str.strip()
category = category.replace({"null": pd.NA, "NULL": pd.NA, "None": pd.NA, "": pd.NA})

klue_proc = klue_raw.assign(news_category=category)
klue_proc = klue_proc[klue_proc["news_category"].isin(ALLOWED)].copy()
klue_proc["context_norm"] = klue_proc["context"].map(norm_text).astype("string").fillna("")


In [None]:
# Paragraph-based anchor mapping against the KLUE-MRC contexts
LONG_MIN, SHORT_MIN, SHORT_MAX = 20, 8, 29

remain = train_labeled["from"].isna()
s = train_origin["para_norm"].astype("string").fillna("")
a_head, a_mid, a_tail = build_anchor_views(s, anchor_len=40)
s_len = s.str.len()

# Long paragraphs contribute their three anchors, short ones the whole text.
use_long = remain & (s_len >= LONG_MIN)
use_short = remain & s_len.between(SHORT_MIN, SHORT_MAX)
long_candidates = pd.concat(
    [view[use_long] for view in (a_head, a_mid, a_tail)],
    ignore_index=True,
)
long_keys = make_keys(long_candidates.drop_duplicates(), min_len=LONG_MIN)
short_keys = make_keys(s[use_short].drop_duplicates(), min_len=SHORT_MIN)

top_long = build_top_subject_map(klue_proc, long_keys, q_col="context_norm", subj_col="news_category", chunk_size=1000)
top_short = build_top_subject_map(klue_proc, short_keys, q_col="context_norm", subj_col="news_category", chunk_size=500)

# A row matches when any of its anchors, or its whole short text, was found.
any_anchor_hit = a_head.isin(top_long)
for view in (a_mid, a_tail):
    any_anchor_hit = any_anchor_hit | view.isin(top_long)
m_long = remain & any_anchor_hit
m_short = remain & s.isin(top_short)
m = m_long | m_short

# Subject precedence: head anchor, then middle, then tail, then short text.
subj_long = a_head.map(top_long)
for view in (a_mid, a_tail):
    subj_long = subj_long.fillna(view.map(top_long))
subj_short = s.map(top_short)

train_labeled.loc[m, "from"] = "klue-mrc"
train_labeled.loc[m, "subject"] = subj_long.fillna(subj_short)
train_labeled = propagate_labels(train_labeled)

print("KLUE-MRC newly labeled:", int(m.sum()))
print("Total labeled so far:", int(train_labeled["from"].notna().sum()))


In [None]:
# Show how many rows were attributed to each source; value_counts() leaves
# out missing values by default, so unlabeled rows are not counted here.
train_labeled['from'].value_counts()

## 남은 데이터 확인 및 저장
- 라벨이 비어 있는 샘플 수를 체크합니다.
- 작업용 정규화 컬럼(`para_norm`, `question_norm`)과 원본 `problems` 컬럼은 저장 전에 제거합니다.


In [None]:
# Report how many rows are still unlabeled.
remaining = train_labeled["from"].isna().sum()
print("Unlabeled samples remaining:", int(remaining))

# Drop the helper columns and the raw `problems` column if they are present,
# then write the labeled table.
WORK_COLUMNS = ["para_norm", "question_norm", "problems"]
export_df = train_labeled.drop(columns=WORK_COLUMNS, errors="ignore")
export_df.to_csv(OUTPUT_PATH, index=False)
print("Saved:", OUTPUT_PATH.resolve())
