# 02. Preprocessing (전처리)

QC + Big Five + Ideology + Honesty-Humility 점수 계산

In [None]:
%pip install pandas numpy -q

In [None]:
import pandas as pd
import numpy as np
import os

# When the notebook is started from a folder named 'notebooks', move one level
# up so the relative 'data/...' paths used later resolve from the parent folder.
if os.path.split(os.getcwd())[1] == 'notebooks':
    os.chdir(os.pardir)
print(f'작업 폴더: {os.getcwd()}')

In [None]:
# Load the raw response table and the scoring-key table (the key table uses
# its first CSV column as the row index).
data = pd.read_csv('data/raw/sapa_data.csv')
keys = pd.read_csv('data/raw/superKey696.csv', index_col=0)

# Item columns are the response columns whose name starts with 'q_'.
item_cols = [name for name in data.columns if name.startswith('q_')]

print(f"데이터: {len(data):,}명, {len(item_cols)}문항")

## 1. QC (Quality Control)

In [None]:
# Too few responses: respondents with fewer than 10 answered items.
# count(axis=1) gives the number of non-missing item cells per row.
responses = data[item_cols].count(axis=1)
low_response = data.loc[responses < 10, 'RID'].tolist()

# Straight-lining (every answered item has the same value)
def is_straight(row):
    """Return True when a row has at least 10 answers and all are identical.

    Missing values are ignored; rows with fewer than 10 non-missing answers
    are never flagged.
    """
    answered = row[row.notna()]
    if len(answered) < 10:
        return False
    return answered.nunique() == 1

# RIDs of respondents flagged by the row-wise straight-lining check.
straight = data.loc[data[item_cols].apply(is_straight, axis=1), 'RID'].tolist()

# Apply exclusions: drop every respondent flagged by either rule.
exclude = set(low_response).union(straight)
data_clean = data.loc[~data['RID'].isin(exclude)].copy()

print(f"=== QC 결과 ===")
print(f"응답 부족: {len(low_response)}명")
print(f"Straight-lining: {len(straight)}명")
print(f"제외 합계: {len(exclude)}명")
print(f"유효 응답자: {len(data_clean):,}명")

## 2. Big Five 점수 계산

In [None]:
def calc_scale(df, keys, scale, scale_min=1, scale_max=6):
    """Compute a per-row scale score from a scoring key (with reverse scoring).

    Args:
        df: Response table; item responses are looked up by column name.
        keys: Scoring-key table whose index holds item names and whose
            ``scale`` column holds the item weights. Positive means keyed as
            is, negative means reverse-keyed, and zero or missing means the
            item is not part of the scale.
        scale: Name of the column in ``keys`` to score.
        scale_min: Lowest possible response value. With the defaults the
            reversal is ``7 - x``, exactly as before.
        scale_max: Highest possible response value.

    Returns:
        A Series indexed like ``df`` holding the mean of the (reverse-scored
        where needed) items of the scale, ignoring missing responses. Rows
        with no answered item are NaN. If none of the scale's items exist in
        ``df``, the result is all NaN.
    """
    weights = keys[scale]
    # A missing weight would pass a plain `!= 0` test and be scored as a
    # positively keyed item, so it is filtered out explicitly.
    weights = weights[weights.notna() & (weights != 0)]
    available = [q for q in weights.index if q in df.columns]

    if not available:
        return pd.Series(np.nan, index=df.index, dtype=float)

    subset = df[available].copy()
    # Reverse-score every negatively keyed item in one vectorized step.
    reverse = [q for q in available if weights.loc[q] < 0]
    if reverse:
        subset[reverse] = (scale_min + scale_max) - subset[reverse]

    return subset.mean(axis=1, skipna=True)

In [None]:
# Build the scores table. Using .values drops data_clean's filtered index, so
# `scores` gets a fresh positional index.
scores = pd.DataFrame({'RID': data_clean['RID'].values})

# Big Five: one column per scale, assigned positionally.
for scale in ('NEO_O', 'NEO_C', 'NEO_E', 'NEO_A', 'NEO_N'):
    scores[scale] = calc_scale(data_clean, keys, scale).to_numpy()

print("=== Big Five 결과 ===")
for scale in ('NEO_O', 'NEO_C', 'NEO_E', 'NEO_A', 'NEO_N'):
    # n is the number of non-missing scores; m and s are their mean and SD.
    n = scores[scale].count()
    m = scores[scale].mean()
    s = scores[scale].std()
    print(f"{scale}: N={n:,}, M={m:.2f}, SD={s:.2f}")

## 3. Ideology 점수 계산

Ideology = mean(z(MPQtr), z(NEOo6) * -1)

In [None]:
def z_score(series):
    """Standardize a Series: subtract its mean, then divide by its std.

    Uses the Series' own ``mean()`` and ``std()``, so missing values are
    skipped when computing them and stay missing in the result.
    """
    centered = series - series.mean()
    return centered / series.std()

mpq_tr = calc_scale(data_clean, keys, 'MPQtr')
neo_o6 = calc_scale(data_clean, keys, 'NEOo6')

# Ideology is the average of z(MPQtr) and the sign-flipped z(NEOo6).
# A row missing either component comes out NaN, because the arithmetic
# propagates missing values. The result is assigned as a plain array to avoid
# index-alignment problems with `scores`.
scores['Ideology'] = ((z_score(mpq_tr) - z_score(neo_o6)) / 2).to_numpy()

n = scores['Ideology'].count()
print(f"=== Ideology 결과 ===")
print(f"Ideology: N={n:,}, M={scores['Ideology'].mean():.3f}, SD={scores['Ideology'].std():.3f}")

## 4. Honesty-Humility 점수 계산

H-H = mean(z(NEOa2), z(NEOa4), z(HEXACO_H))

In [None]:
neo_a2 = calc_scale(data_clean, keys, 'NEOa2')
neo_a4 = calc_scale(data_clean, keys, 'NEOa4')
hexaco_h = calc_scale(data_clean, keys, 'HEXACO_H')

# Honesty-Humility is the average of the three z-scored components.
# A row missing any component comes out NaN. The result is assigned as a
# plain array to avoid index-alignment problems with `scores`.
scores['Honesty_Humility'] = (
    (z_score(neo_a2) + z_score(neo_a4) + z_score(hexaco_h)) / 3
).to_numpy()

n = scores['Honesty_Humility'].count()
print(f"=== Honesty-Humility 결과 ===")
print(f"H-H: N={n:,}, M={scores['Honesty_Humility'].mean():.3f}, SD={scores['Honesty_Humility'].std():.3f}")

## 5. 저장

In [None]:
# Create the output folder if needed; an existing folder is not an error.
os.makedirs('data/processed', exist_ok=True)

# Write only the listed columns, in this order, without the row index.
cols = ['RID', 'NEO_O', 'NEO_C', 'NEO_E', 'NEO_A', 'NEO_N', 'Ideology', 'Honesty_Humility']
scores.to_csv('data/processed/sapa_scores.csv', columns=cols, index=False)

print(f"\n✅ 저장 완료: data/processed/sapa_scores.csv")
print(f"총 {len(scores):,}명, 7개 척도")