### 분포 확인

In [None]:
import pandas as pd
import ast

# Load the reformatted training set; its 'choices' column holds each option list
# as a string, which is parsed further down with ast.literal_eval.
df = pd.read_csv('../raw/train_reformat.csv')

def parse_choices(choices_str):
    """Turn the string form of a choice list into the Python object it spells.

    Args:
        choices_str: Text containing a Python literal, e.g. "['a', 'b']".

    Returns:
        The evaluated literal (a list for the example above).

    Raises:
        Whatever ``ast.literal_eval`` raises for input that is not a valid
        Python literal.
    """
    parsed = ast.literal_eval(choices_str)
    return parsed

# Parse every stringified option list once and record how many options it has.
df['choices_list'] = df['choices'].map(parse_choices)
df['choices_length'] = df['choices_list'].map(len)

# How many questions exist for each number of choices.
length_distribution = df['choices_length'].value_counts().sort_index()
print("선지 개수 별 분포")
print(length_distribution)
print("\n")

# Answer-position distribution among the 4-choice questions.
is_four_choice = df['choices_length'] == 4
print("4지선다 분포")
print(df.loc[is_four_choice, 'answer'].value_counts().sort_index())
print("\n")

# Answer-position distribution among the 5-choice questions.
is_five_choice = df['choices_length'] == 5
print("5지선다 분포")
print(df.loc[is_five_choice, 'answer'].value_counts().sort_index())

선지 개수 별 분포
choices_length
4     791
5    1239
Name: count, dtype: int64


4지선다 분포
answer
1    179
2    185
3    204
4    223
Name: count, dtype: int64


5지선다 분포
answer
1    765
2    265
3    116
4     62
5     31
Name: count, dtype: int64


### 분포 맞추기 ver 1

In [12]:
import pandas as pd
import ast
import random
import numpy as np

# Reload the reformatted training set so this cell starts from the raw,
# unbalanced answer positions.
df = pd.read_csv('../raw/train_reformat.csv')

def parse_choices(choices_str):
    """Evaluate a stringified choice list and return the resulting object.

    Args:
        choices_str: Text containing a Python literal such as "['a', 'b']".

    Returns:
        The object produced by ``ast.literal_eval`` for that text.
    """
    evaluated = ast.literal_eval(choices_str)
    return evaluated

def process_row(row, target_answer):
    """Move the correct option of a question to a new answer position.

    The option at the row's current answer position is swapped with the option
    at ``target_answer``; when both positions coincide the list is unchanged.

    Args:
        row: Record exposing ``row['choices']`` (string form of the option
            list) and ``row['answer']`` (1-based position of the correct
            option).
        target_answer: 1-based position the correct option should end up at.

    Returns:
        A ``pd.Series`` with 'choices' (the re-stringified option list) and
        'answer' (set to ``target_answer``).
    """
    options = parse_choices(row['choices'])

    # Answers are 1-based, list positions are 0-based.
    source_pos = row['answer'] - 1
    dest_pos = target_answer - 1

    if source_pos != dest_pos:
        options[source_pos], options[dest_pos] = options[dest_pos], options[source_pos]

    updated = {'choices': str(options), 'answer': target_answer}
    return pd.Series(updated)

def _move_answer(record, new_answer):
    """Return `record` with its 'choices'/'answer' replaced so the answer is `new_answer`."""
    untouched = record.drop(['choices', 'answer'])
    return pd.concat([untouched, process_row(record, new_answer)])


# Tag every question with its number of options and split by that number.
df['choices_length'] = df['choices'].map(lambda text: len(parse_choices(text)))
df_4 = df.loc[df['choices_length'] == 4].copy()
df_5 = df.loc[df['choices_length'] == 5].copy()

# ---- 4-choice questions ----
# Each answer keeps at most `target_count_4` rows; surplus rows are moved to a
# randomly chosen different answer.
target_count_4 = len(df_4) // 4
processed_4 = []

for answer in range(1, 5):
    current_rows = df_4.loc[df_4['answer'] == answer]

    # When the group is not over the target, the first slice is the whole
    # group and the second slice is empty, so no row gets reassigned.
    keep_rows = current_rows.iloc[:target_count_4]
    extra_rows = current_rows.iloc[target_count_4:]
    other_answers = [candidate for candidate in range(1, 5) if candidate != answer]

    for _, surplus_row in extra_rows.iterrows():
        processed_4.append(_move_answer(surplus_row, random.choice(other_answers)))

    processed_4.extend(kept_row for _, kept_row in keep_rows.iterrows())

# ---- 5-choice questions ----
target_count_5 = len(df_5) // 5

# Step 1: put the correct option of every question at position 5.
processed_5 = [_move_answer(row_5, 5) for _, row_5 in df_5.iterrows()]
df_5_temp = pd.DataFrame(processed_5)

# Step 2: for each of the answers 1-4, pull `target_count_5` random rows out of
# the ones still answering 5 and move their correct option there.
for answer in range(1, 5):
    still_five = df_5_temp.loc[df_5_temp['answer'] == 5].index
    if len(still_five) <= target_count_5:
        continue
    chosen = np.random.choice(still_five, target_count_5, replace=False)
    for row_label in chosen:
        df_5_temp.loc[row_label] = _move_answer(df_5_temp.loc[row_label], answer)

# Merge both subsets and drop the helper column.
df_balanced = pd.concat([pd.DataFrame(processed_4), df_5_temp]).drop(columns='choices_length')

# Report the resulting distributions per question type, then overall.
balanced_lengths = df_balanced['choices'].map(lambda text: len(parse_choices(text)))
for n_options, heading in ((4, "4지선다 분포:"), (5, "\n5지선다 분포:")):
    print(heading)
    print(df_balanced[balanced_lengths == n_options]['answer'].value_counts().sort_index())
print(df_balanced['answer'].value_counts().sort_index())

# Restore the original ordering by id before saving.
df_balanced = df_balanced.sort_values('id')
df_balanced.to_csv('choice_balance_v1.csv', index=False)

4지선다 분포:
answer
1    188
2    196
3    208
4    199
Name: count, dtype: int64

5지선다 분포:
answer
1    247
2    247
3    247
4    247
5    251
Name: count, dtype: int64
answer
1    435
2    443
3    455
4    446
5    251
Name: count, dtype: int64


In [None]:
import pandas as pd
import ast
import random
import numpy as np

def parse_choices(choices_str):
    """Parse the textual representation of a choice list.

    Args:
        choices_str: Text containing a Python literal, e.g. "['x', 'y']".

    Returns:
        The value ``ast.literal_eval`` yields for ``choices_str``.
    """
    result = ast.literal_eval(choices_str)
    return result

def process_row(row, target_answer):
    """Relocate a question's correct option so that it sits at `target_answer`.

    Args:
        row: Record exposing ``row['choices']`` (string form of the option
            list) and ``row['answer']`` (1-based position of the correct
            option).
        target_answer: Desired 1-based position of the correct option.

    Returns:
        A ``pd.Series`` holding the re-stringified option list under 'choices'
        and ``target_answer`` under 'answer'.
    """
    option_list = parse_choices(row['choices'])

    # Convert the 1-based answer numbers into 0-based list positions.
    old_pos = row['answer'] - 1
    new_pos = target_answer - 1

    # Swapping is only needed when the correct option is not already in place.
    if old_pos != new_pos:
        option_list[old_pos], option_list[new_pos] = option_list[new_pos], option_list[old_pos]

    return pd.Series({'choices': str(option_list), 'answer': target_answer})

# Load the data, tag each question with its number of options and split by it.
df = pd.read_csv('../raw/train_reformat.csv')
df['choices_length'] = df['choices'].apply(lambda x: len(parse_choices(x)))
df_4 = df[df['choices_length'] == 4].copy()
df_5 = df[df['choices_length'] == 5].copy()

total_samples = len(df)
# Overall goal: every answer number 1-5 appears this many times in the full set.
target_count_per_answer = total_samples // 5

processed_rows = []

remaining_4_count = len(df_4)
target_per_answer_4 = remaining_4_count // 4

# Running tally of processed rows per answer (position 0 holds answer 1).
# It is updated as rows are appended, so choosing a target for a surplus row
# no longer requires rescanning every row processed so far.
answer_tally = [0] * 5

# ---- 4-choice questions ----
# NOTE(review): a group's kept rows are only tallied after its surplus rows
# were reassigned, and later groups are not tallied yet, so the 4-choice
# result is not perfectly even. This ordering is preserved on purpose so the
# produced file stays the same.
for answer in range(1, 5):
    current_rows = df_4[df_4['answer'] == answer]

    if len(current_rows) > target_per_answer_4:
        keep_rows = current_rows.iloc[:target_per_answer_4]
        extra_rows = current_rows.iloc[target_per_answer_4:]
        available_answers = [i for i in range(1, 5) if i != answer]

        for idx, row in extra_rows.iterrows():
            # Send the surplus row to the other answer with the fewest rows so
            # far; min() returns the first minimum, so ties go to the lowest
            # answer number.
            target_answer = min(available_answers, key=lambda x: answer_tally[x - 1])
            new_row = process_row(row, target_answer)
            processed_rows.append(pd.concat([row.drop(['choices', 'answer']), new_row]))
            answer_tally[target_answer - 1] += 1

        processed_rows.extend([row for _, row in keep_rows.iterrows()])
        answer_tally[answer - 1] += len(keep_rows)
    else:
        processed_rows.extend([row for _, row in current_rows.iterrows()])
        answer_tally[answer - 1] += len(current_rows)

# ---- 5-choice questions ----
remaining_5_count = len(df_5)
current_counts = pd.Series(answer_tally)

# How many more rows each answer still needs to reach the overall target.
target_counts = pd.Series([target_count_per_answer] * 5)
needed_counts = target_counts - current_counts

for _, row in df_5.iterrows():
    # Despite its name, this is the answer with the LARGEST remaining shortfall.
    current_min_answer = needed_counts.idxmax() + 1
    if needed_counts[current_min_answer - 1] > 0:
        new_row = process_row(row, current_min_answer)
        needed_counts[current_min_answer - 1] -= 1
    else:
        # Every answer already met its target: fall back to answer 5.
        new_row = process_row(row, 5)
    processed_rows.append(pd.concat([row.drop(['choices', 'answer']), new_row]))

df_balanced = pd.DataFrame(processed_rows)
df_balanced = df_balanced.drop('choices_length', axis=1)

# Parse the option counts once and reuse them for both per-type reports.
balanced_lengths = df_balanced['choices'].apply(lambda x: len(parse_choices(x)))

print("4지선다 분포:")
print(df_balanced[balanced_lengths == 4]['answer'].value_counts().sort_index())
print("\n5지선다 분포:")
print(df_balanced[balanced_lengths == 5]['answer'].value_counts().sort_index())
print("\n전체 분포:")
print(df_balanced['answer'].value_counts().sort_index())

# Restore the original ordering by id before saving.
df_balanced = df_balanced.sort_values('id')
df_balanced.to_csv('choice_balance_v2.csv', index=False)

4지선다 분포:
answer
1    195
2    195
3    197
4    204
Name: count, dtype: int64

5지선다 분포:
answer
1    211
2    211
3    209
4    202
5    406
Name: count, dtype: int64

전체 분포:
answer
1    406
2    406
3    406
4    406
5    406
Name: count, dtype: int64
