In [2]:
!pip install groq python-dotenv numpy tqdm datasets



In [3]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

In [5]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [6]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello there! I'm excited to help you with any math problems you might have. What kind of math are you working on? Do you have a specific problem you're stuck on or a concept you're trying to understand? Let me know and I'll do my best to assist you!


#### GSM8K 데이터셋 확인해보기

In [7]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [8]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [9]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama3-8b-8192",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [10]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [9]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [10]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:02<00:02,  1.95it/s]

Progress: [5/10]
Current Acc.: [20.00%]


100%|██████████| 10/10 [00:05<00:00,  1.72it/s]

Progress: [10/10]
Current Acc.: [30.00%]





In [None]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!
shots = [0, 3, 5]

for shot in shots:
    print(f"{shot}shot 벤치마크 테스트 시작")
    PROMPT = construct_direct_prompt(shot)
    VERBOSE = False

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")

0shot 벤치마크 테스트 시작


 10%|█         | 5/50 [00:03<00:28,  1.60it/s]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:07<00:32,  1.22it/s]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:10<00:23,  1.50it/s]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:13<00:15,  1.89it/s]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [00:24<00:54,  2.16s/it]

Progress: [25/50]
Current Acc.: [20.00%]


 60%|██████    | 30/50 [00:37<00:51,  2.57s/it]

Progress: [30/50]
Current Acc.: [20.00%]


 70%|███████   | 35/50 [00:51<00:41,  2.74s/it]

Progress: [35/50]
Current Acc.: [17.14%]


 80%|████████  | 40/50 [01:05<00:26,  2.68s/it]

Progress: [40/50]
Current Acc.: [15.00%]


 90%|█████████ | 45/50 [01:18<00:13,  2.63s/it]

Progress: [45/50]
Current Acc.: [15.56%]


100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


Progress: [50/50]
Current Acc.: [18.00%]
3shot 벤치마크 테스트 시작


 10%|█         | 5/50 [00:12<01:55,  2.57s/it]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:26<01:54,  2.86s/it]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:39<01:37,  2.79s/it]

Progress: [15/50]
Current Acc.: [20.00%]


 40%|████      | 20/50 [00:53<01:24,  2.82s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:07<01:09,  2.78s/it]

Progress: [25/50]
Current Acc.: [20.00%]


 60%|██████    | 30/50 [01:20<00:53,  2.68s/it]

Progress: [30/50]
Current Acc.: [20.00%]


 70%|███████   | 35/50 [01:33<00:38,  2.56s/it]

Progress: [35/50]
Current Acc.: [17.14%]


 80%|████████  | 40/50 [01:47<00:27,  2.71s/it]

Progress: [40/50]
Current Acc.: [15.00%]


 90%|█████████ | 45/50 [02:01<00:13,  2.62s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [02:14<00:00,  2.69s/it]


Progress: [50/50]
Current Acc.: [20.00%]
5shot 벤치마크 테스트 시작


 10%|█         | 5/50 [00:11<01:51,  2.47s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:29<02:24,  3.62s/it]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:42<01:38,  2.81s/it]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:55<01:20,  2.68s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:07<01:03,  2.53s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [01:21<00:53,  2.65s/it]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [01:34<00:39,  2.62s/it]

Progress: [35/50]
Current Acc.: [20.00%]


 80%|████████  | 40/50 [01:47<00:23,  2.37s/it]

Progress: [40/50]
Current Acc.: [17.50%]


 90%|█████████ | 45/50 [01:58<00:12,  2.44s/it]

Progress: [45/50]
Current Acc.: [15.56%]


100%|██████████| 50/50 [02:12<00:00,  2.65s/it]

Progress: [50/50]
Current Acc.: [16.00%]





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [13]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "Solve step-by-step. Answer after '####'.\n"  # 간결한 instruction

    for i in range(num_examples):
        # 간결한 CoT example 형식
        cur_q = train_dataset['question'][sampled_indices[i]].strip()
        ans = train_dataset['answer'][sampled_indices[i]]
        rationale = ans.split("####")[0].strip().replace("\n", " ")
        final = ans.split("####")[-1].strip()
        
        prompt += f"Q: {cur_q}\nA: {rationale}\n#### {final}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [15]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!


shots = [0, 3, 5]

for shot in shots:
    print(f"{shot}-shot CoT Prompting 테스트 중...")
    PROMPT = construct_CoT_prompt(shot)
    VERBOSE = False

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"CoT_prompting_{shot}.txt")


0-shot CoT Prompting 테스트 중...


 10%|█         | 5/50 [00:04<00:32,  1.38it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:07<00:26,  1.49it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:11<00:27,  1.26it/s]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [00:15<00:24,  1.21it/s]

Progress: [20/50]
Current Acc.: [55.00%]


 50%|█████     | 25/50 [00:17<00:14,  1.72it/s]

Progress: [25/50]
Current Acc.: [52.00%]


 60%|██████    | 30/50 [00:35<01:18,  3.93s/it]

Progress: [30/50]
Current Acc.: [53.33%]


 70%|███████   | 35/50 [00:42<00:24,  1.64s/it]

Progress: [35/50]
Current Acc.: [60.00%]


 80%|████████  | 40/50 [01:00<00:24,  2.46s/it]

Progress: [40/50]
Current Acc.: [55.00%]


 90%|█████████ | 45/50 [01:25<00:22,  4.43s/it]

Progress: [45/50]
Current Acc.: [57.78%]


100%|██████████| 50/50 [01:33<00:00,  1.87s/it]


Progress: [50/50]
Current Acc.: [58.00%]
3-shot CoT Prompting 테스트 중...


 10%|█         | 5/50 [00:37<06:10,  8.24s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:29<06:44, 10.12s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [02:20<06:02, 10.35s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [03:11<05:10, 10.35s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [04:01<04:02,  9.72s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [04:51<03:12,  9.64s/it]

Progress: [30/50]
Current Acc.: [66.67%]


 70%|███████   | 35/50 [05:39<02:22,  9.48s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|████████  | 40/50 [06:30<01:49, 10.97s/it]

Progress: [40/50]
Current Acc.: [65.00%]


 90%|█████████ | 45/50 [07:17<00:49,  9.88s/it]

Progress: [45/50]
Current Acc.: [68.89%]


100%|██████████| 50/50 [08:06<00:00,  9.73s/it]


Progress: [50/50]
Current Acc.: [70.00%]
5-shot CoT Prompting 테스트 중...


 10%|█         | 5/50 [00:55<08:24, 11.21s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:56<07:55, 11.90s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [02:58<07:01, 12.03s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [03:57<05:58, 11.96s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [04:57<04:49, 11.59s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [05:54<03:45, 11.26s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [06:51<02:33, 10.23s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [07:52<02:00, 12.06s/it]

Progress: [40/50]
Current Acc.: [67.50%]


 90%|█████████ | 45/50 [08:49<00:55, 11.10s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [09:49<00:00, 11.79s/it]

Progress: [50/50]
Current Acc.: [70.00%]





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [17]:
from collections import Counter
from tqdm import tqdm
import time
from groq import Groq

def construct_my_prompt(num_examples: int = 5) -> str:
    """
    내 프롬프트 - 쿼터 절약을 위해 최적화
    
    Args:
        num_examples: few-shot example 개수 (0, 3, 5)
    
    Returns:
        프롬프트 문자열
    """
    import random
    train_dataset = gsm8k_train
    sampled_indices = random.sample(range(len(train_dataset["question"])), 
                                   num_examples)

    # 간결한 instruction
    prompt = "Solve step-by-step. Use '→' for steps. Answer after '####'.\n"

    for i, idx in enumerate(sampled_indices):
        question = train_dataset["question"][idx].strip()
        rationale = train_dataset["answer"][idx].split("####")[0].strip().replace('\n', ' ')
        answer = train_dataset["answer"][idx].split("####")[-1].strip()

        # 수식 강조
        rationale = rationale.replace("So", "→").replace(".", ". ")

        # 간결한 Q/A 형식
        prompt += f"Q: {question}\nA: {rationale}\n#### {answer}\n"

    prompt += "Q: {question}\nA:"
    return prompt

def check_quota_usage():
    """
    현재 쿼터 사용량 확인
    """
    try:
        client = Groq()
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": "test"}],
            model="llama3-8b-8192",
            max_tokens=1
        )
        return True
    except Exception as e:
        if "rate_limit" in str(e) or "429" in str(e):
            print(f"⚠️ 쿼터 초과: {str(e)}")
            return False
        else:
            print(f"❌ 다른 에러: {str(e)}")
            return False

def wait_for_quota_reset(wait_minutes=5):
    """
    쿼터 리셋 대기
    """
    print(f"⏳ {wait_minutes}분 대기 중...")
    time.sleep(wait_minutes * 60)
    print("✅ 대기 완료!")

def safe_api_call(func, max_retries=3, wait_minutes=5):
    """
    안전한 API 호출 (재시도 로직 포함)
    """
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if "rate_limit" in str(e) or "429" in str(e):
                print(f"⚠️ 시도 {attempt + 1}/{max_retries}: 쿼터 초과")
                if attempt < max_retries - 1:
                    wait_for_quota_reset(wait_minutes)
                else:
                    raise e
            else:
                raise e

def run_self_consistency_test_with_quota_check(prompt: str, dataset, 
                                              model="llama3-8b-8192", 
                                              num_samples=50, n=3, batch_size=10):
    """
    Self-Consistency + 쿼터 체크 (배치 처리)
    
    Args:
        prompt: 프롬프트
        dataset: 테스트 데이터셋
        model: 모델명
        num_samples: 테스트 샘플 수
        n: Self-Consistency 반복 횟수
        batch_size: 배치 크기
    """
    results = []
    correct = 0
    
    # 배치 단위로 처리
    for batch_start in tqdm(range(0, num_samples, batch_size), desc="Batch Progress"):
        batch_end = min(batch_start + batch_size, num_samples)
        
        # 배치 시작 전 쿼터 체크
        if not check_quota_usage():
            print(f"⏸️ 배치 {batch_start//batch_size + 1} 시작 전 쿼터 체크 실패")
            wait_for_quota_reset(5)
        
        # 배치 처리
        for i in range(batch_start, batch_end):
            q = dataset['question'][i]
            a = dataset['answer'][i].split("####")[-1].strip()
            filled_prompt = prompt.replace("{question}", q)

            answers = []
            for _ in range(n):
                def api_call():
                    return generate_response_using_Llama(filled_prompt, model=model)
                
                response = safe_api_call(api_call)
                final = extract_final_answer(response)
                answers.append(final)

            most_common = Counter(answers).most_common(1)[0][0]

            results.append({
                "question": q,
                "correct_answer": a,
                "predicted_answer": most_common,
                "correct": most_common == a
            })

            if most_common == a:
                correct += 1
        
        # 배치 완료 후 진행상황 출력
        current_acc = correct / len(results) if results else 0
        print(f"📊 배치 {batch_start//batch_size + 1} 완료: [{len(results)}/{num_samples}] 정확도: {current_acc:.2%}")
        
        # 배치 간 대기
        if batch_end < num_samples:
            time.sleep(2)

    accuracy = correct / num_samples
    return results, accuracy

def run_self_consistency_test(prompt: str, dataset, 
                             model="llama3-8b-8192", 
                             num_samples=50, n=3):
    """
    기본 Self-Consistency 테스트 (쿼터 체크 없음)
    """
    results = []
    correct = 0

    for i in tqdm(range(num_samples), desc="Self-Consistency Progress"):
        q = dataset['question'][i]
        a = dataset['answer'][i].split("####")[-1].strip()
        filled_prompt = prompt.replace("{question}", q)

        answers = []
        for _ in range(n):
            response = generate_response_using_Llama(filled_prompt, model=model)
            final = extract_final_answer(response)
            answers.append(final)

        most_common = Counter(answers).most_common(1)[0][0]

        results.append({
            "question": q,
            "correct_answer": a,
            "predicted_answer": most_common,
            "correct": most_common == a
        })

        if most_common == a:
            correct += 1

    accuracy = correct / num_samples
    return results, accuracy

In [18]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

shots = [0, 3, 5]
for shot in shots:
    print(f"Testing My Prompting with {shot}-shot + Self-Consistency...")
    PROMPT = construct_my_prompt(shot)
    
    # 쿼터 체크 버전 사용
    results, accuracy = run_self_consistency_test_with_quota_check(
        prompt=PROMPT,
        dataset=gsm8k_test,
        model="llama3-8b-8192",
        num_samples=50,
        n=3,
        batch_size=10
    )
    
    save_final_result(results, accuracy, f"My_prompting_{shot}.txt")
    print(f"Accuracy: {accuracy * 100:.2f}%\n") 

Testing My Prompting with 0-shot + Self-Consistency...


Batch Progress:   0%|          | 0/5 [00:00<?, ?it/s]

📊 배치 1 완료: [10/50] 정확도: 80.00%


Batch Progress:  20%|██        | 1/5 [00:21<01:25, 21.35s/it]

📊 배치 2 완료: [20/50] 정확도: 70.00%


Batch Progress:  40%|████      | 2/5 [01:53<03:09, 63.13s/it]

📊 배치 3 완료: [30/50] 정확도: 66.67%


Batch Progress:  60%|██████    | 3/5 [03:17<02:25, 72.67s/it]

📊 배치 4 완료: [40/50] 정확도: 67.50%


Batch Progress: 100%|██████████| 5/5 [06:04<00:00, 72.82s/it]


📊 배치 5 완료: [50/50] 정확도: 68.00%
Accuracy: 68.00%

Testing My Prompting with 3-shot + Self-Consistency...


Batch Progress:   0%|          | 0/5 [00:00<?, ?it/s]

📊 배치 1 완료: [10/50] 정확도: 60.00%


Batch Progress:  20%|██        | 1/5 [03:57<15:49, 237.38s/it]

📊 배치 2 완료: [20/50] 정확도: 65.00%


Batch Progress:  40%|████      | 2/5 [08:09<12:17, 245.90s/it]

📊 배치 3 완료: [30/50] 정확도: 70.00%


Batch Progress:  60%|██████    | 3/5 [12:04<08:02, 241.05s/it]

📊 배치 4 완료: [40/50] 정확도: 70.00%


Batch Progress: 100%|██████████| 5/5 [20:11<00:00, 242.40s/it]


📊 배치 5 완료: [50/50] 정확도: 72.00%
Accuracy: 72.00%

Testing My Prompting with 5-shot + Self-Consistency...


Batch Progress:   0%|          | 0/5 [00:00<?, ?it/s]

📊 배치 1 완료: [10/50] 정확도: 70.00%


Batch Progress:  20%|██        | 1/5 [06:15<25:02, 375.75s/it]

📊 배치 2 완료: [20/50] 정확도: 70.00%


Batch Progress:  40%|████      | 2/5 [12:40<19:03, 381.32s/it]

📊 배치 3 완료: [30/50] 정확도: 70.00%


Batch Progress:  60%|██████    | 3/5 [18:52<12:33, 376.99s/it]

📊 배치 4 완료: [40/50] 정확도: 70.00%


Batch Progress: 100%|██████████| 5/5 [31:22<00:00, 376.45s/it]

📊 배치 5 완료: [50/50] 정확도: 72.00%
Accuracy: 72.00%






### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!

## 📊 정답률 비교 (0-shot, 3-shot, 5-shot)
| Prompting 기법     | 0-shot | 3-shot | 5-shot |
| ---------------- | ------ | ------ | ------ |
| Direct Prompting | 18.00% | 20.00% | 16.00% |
| CoT Prompting    | 58.00% | 70.00% | 70.00% |
| My Prompting     | 68.00% | 72.00% | 72.00% |


## 🔍 CoT Prompting이 Direct Prompting보다 뛰어난 이유

Chain of Thought Prompting(CoT)은 문제 해결 과정을 단계적으로 서술하게 만들어, 단순한 정답 추론보다 더 나은 성능을 유도
특히 수학 문제처럼 중간 계산이 필요한 문제에서 다음과 같은 장점을 가짐
- 사고 과정을 분해하여 논리 전개 가능
- 중간 추론 오류를 방지할 수 있음
- 다단계 사고(Multi-hop reasoning)에 적합함

**예시 (CoT Prompting)**

```
Q: If a bag has 3 apples and each apple costs 2 dollars, how much is the total?
A: There are 3 apples. Each costs 2 dollars. So, 3 × 2 = 6.
#### 6
```

**예시 (Direct Prompting)**

```
Q: If a bag has 3 apples and each apple costs 2 dollars, how much is the total?
A: 6
```

## 🧪 My Prompting이 CoT보다 더 나은 이유
1. `→` 기호를 사용해 reasoning 단계 시각화
2. 설명을 수식 위주로 구성해 핵심 정보만 전달
3. 일관된 Q/A 형식 유지로 모델 혼란 최소화
4. Self-Consistency voting과 잘 어울리도록 구성

특히 정답을 하나만 생성하는 대신 같은 문제에 대해 여러 번 응답을 생성하고, 가장 많이 등장한 답을 최종 정답으로 채택하는 방식(Self-Consistency)을 도입함
이를 통해,
- 불확실한 추론에 대한 안정성 증가
- 복잡한 문제에 대한 평규적 사고 능력 강화
를 기대할 수 있다.

**예시 (My Prompting)**

```
Q: If a bag has 3 apples and each apple costs 2 dollars, how much is the total?
A: → Apples: 3  
→ Cost per apple: 2  
→ Total = 3 × 2 = 6  
#### 6
```