In [1]:
import os
import sys
from pathlib import Path

# Put the repository root on the import path so project packages resolve.
# Assumes the kernel's working directory is the notebook's own folder, which
# sits two levels below the project root.
nb_dir = Path.cwd()
project_root = nb_dir.parents[1]

# Insert at the front only once, so re-running the cell does not add duplicates.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print("project_root:", project_root)
print("sys.path[0]:", sys.path[0])

project_root: /data/ephemeral/pro-nlp-generationfornlp-nlp-13
sys.path[0]: /data/ephemeral/pro-nlp-generationfornlp-nlp-13


In [2]:
import pandas as pd
import ast

# Load the generation results for both splits (paths are relative to the
# notebook's working directory).
train_gen = pd.read_csv("../../data/dpo_outputs/train_gen.csv")
valid_gen = pd.read_csv("../../data/dpo_outputs/valid_gen.csv")

# Columns used repeatedly below.
train_is_correct = train_gen["is_correct"]
train_margin = train_gen["digit_margin_top1_minus_top2"]

# Split sizes.
print(f"Train: {len(train_gen)} rows")
print(f"Valid: {len(valid_gen)} rows")

# Fraction of rows flagged correct in each split.
train_accuracy = train_is_correct.mean()
valid_accuracy = valid_gen["is_correct"].mean()
print(f"\nTrain accuracy: {train_accuracy:.4f}")
print(f"Valid accuracy: {valid_accuracy:.4f}")

# Distribution of the margin column over the train split.
print("\nMargin stats (digit_margin_top1_minus_top2):")
print(train_margin.describe())

# Train rows flagged as wrong.
incorrect = train_gen.loc[train_is_correct.eq(False)]
print(f"\nIncorrect samples: {len(incorrect)}")

# "Soft true": train rows flagged correct whose margin does not exceed 0.995.
soft_true_mask = train_is_correct.eq(True) & train_margin.le(0.995)
soft_true = train_gen.loc[soft_true_mask]
print(f"Soft True samples (margin <= 0.995): {len(soft_true)}")


Train: 1827 rows
Valid: 204 rows

Train accuracy: 0.9622
Valid accuracy: 0.9020

Margin stats (digit_margin_top1_minus_top2):
count    1827.000000
mean        0.912304
std         0.200409
min         0.000000
25%         0.941516
50%         0.995122
75%         0.999675
max         0.999974
Name: digit_margin_top1_minus_top2, dtype: float64

Incorrect samples: 69
Soft True samples (margin <= 0.995): 843


In [6]:
# Show the first validation row as a column -> value mapping to inspect the schema.
valid_gen.iloc[0].to_dict()

{'id': 'generation-for-nlp-1672',
 'choices_len': 5,
 'answer': 1,
 'predicted_answer': 1,
 'is_correct': True,
 'generated_text': '<think>\n\n</think>\n\n1',
 'top5_candidates': "[{'rank': 1, 'token_id': 16, 'token': '1', 'logit': 31.859375, 'prob_full_vocab': 0.9998868703842163}, {'rank': 2, 'token_id': 17, 'token': '2', 'logit': 22.5, 'prob_full_vocab': 8.614418766228482e-05}, {'rank': 3, 'token_id': 18, 'token': '3', 'logit': 20.515625, 'prob_full_vocab': 1.1841939340229146e-05}, {'rank': 4, 'token_id': 15, 'token': '0', 'logit': 19.703125, 'prob_full_vocab': 5.254828920442378e-06}, {'rank': 5, 'token_id': 19, 'token': '4', 'logit': 19.6875, 'prob_full_vocab': 5.173360023036366e-06}]",
 'digit_probs_1_to_k': '[0.9998952150344849, 8.614490070613101e-05, 1.184203847515164e-05, 5.1734036787820514e-06, 1.6795581814221805e-06]',
 'digit_margin_top1_minus_top2': 0.9998090863227844,
 'digit_top1': 1,
 'digit_top2': 2,
 'prompt': '<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가**입니다.\n이 문제는

In [8]:
# Describe the `digit_margin_top1_minus_top2` column of the validation frame,
# one row of summary statistics per `is_correct` value.
valid_gen["digit_margin_top1_minus_top2"].groupby(valid_gen["is_correct"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,20.0,0.451076,0.324007,0.015244,0.189589,0.334505,0.722327,0.990546
True,184.0,0.867354,0.259224,0.004419,0.896691,0.996977,0.999669,0.999967


In [9]:
# Describe the `digit_margin_top1_minus_top2` column of the train frame,
# one row of summary statistics per `is_correct` value.
train_gen["digit_margin_top1_minus_top2"].groupby(train_gen["is_correct"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,69.0,0.300058,0.279162,0.0,0.071243,0.213621,0.388363,0.955074
True,1758.0,0.936334,0.153057,0.014795,0.952108,0.996333,0.999708,0.999974


In [11]:
import ast
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.dpo_dataset import build_dpo_dataset, save_jsonl

# Settings for this cell, named once so the local "soft true" filter, its
# printed label and the build_dpo_dataset call always use the same threshold.
MARGIN_THRESHOLD = 0.995
EVAL_FRACTION = 0.1  # share of pairs held out for evaluation
SPLIT_SEED = 42      # fixed seed so the train/eval split is reproducible

# Re-read the generation results so this cell does not depend on frames
# created by earlier cells.
train_gen_df = pd.read_csv("../../data/dpo_outputs/train_gen.csv")
valid_gen_df = pd.read_csv("../../data/dpo_outputs/valid_gen.csv")

# Train rows flagged as wrong (kept as a notebook variable for inspection).
incorrect = train_gen_df[train_gen_df['is_correct'] == False]

# Train rows flagged correct whose margin does not exceed the threshold.
# NOTE(review): this count covers the train frame only. build_dpo_dataset is
# given both frames, and the recorded run shows a second, larger count with
# the same label printed after this one; that one presumably covers both.
soft_true = train_gen_df[
    (train_gen_df['is_correct'] == True)
    & (train_gen_df['digit_margin_top1_minus_top2'] <= MARGIN_THRESHOLD)
]
print(f"Soft True samples (margin <= {MARGIN_THRESHOLD}): {len(soft_true)}")

print(f"Train Gen: {len(train_gen_df)} rows")
print(f"Valid Gen: {len(valid_gen_df)} rows\n")

# Build the preference pairs from both generation frames.
all_pairs = build_dpo_dataset(
    train_gen_df=train_gen_df,
    valid_gen_df=valid_gen_df,
    margin_threshold=MARGIN_THRESHOLD,
)

# Random, seeded hold-out split of the generated pairs.
train_pairs, eval_pairs = train_test_split(
    all_pairs,
    test_size=EVAL_FRACTION,
    random_state=SPLIT_SEED,
)
# Sanity check: every pair lands in exactly one of the two splits.
assert len(train_pairs) + len(eval_pairs) == len(all_pairs)

print(f"\nTrain pairs: {len(train_pairs)}")
print(f"Eval pairs: {len(eval_pairs)}")

# Persist both splits as JSONL next to the other data files.
save_jsonl(train_pairs, "../../data/dpo_train.jsonl")
save_jsonl(eval_pairs, "../../data/dpo_eval.jsonl")

# Show one pair; ensure_ascii=False keeps non-ASCII text readable.
print("\n" + "="*80)
print("Sample DPO pair:")
print("="*80)
print(json.dumps(train_pairs[0], indent=2, ensure_ascii=False))


Soft True samples (margin <= 0.995): 843
Train Gen: 1827 rows
Valid Gen: 204 rows

Incorrect samples: 89
Soft True samples (margin <= 0.995): 929
Total DPO pairs generated: 1119

Train pairs: 1007
Eval pairs: 112
Saved: ../../data/dpo_train.jsonl (1007 samples)
Saved: ../../data/dpo_eval.jsonl (112 samples)

Sample DPO pair:
{
  "prompt": "<|im_start|>system\nYou are a student solving multiple-choice questions. The problem consists of a passage, a question, and choices. Solve the problem step-by-step according to the guidelines below.\n\nGuidelines:\n1. Question Analysis: Define exactly what the question is asking for.\n2. Choice Analysis: Analyze the choices and expand upon the concepts. For example, expand \"World War II\" to a broader concept like \"War.\" If a choice does not appear in the passage, analyze it by including the expanded concept.\n3. Background Knowledge Check: If the problem requires background knowledge, clearly identify what knowledge and specific parts are needed,