In [1]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy seed so the sampled scores are reproducible.
np.random.seed(1234)

# Draw 100 synthetic scores from a normal distribution (mean 70, std 10)
# and wrap them in a single-column frame.
scores = np.random.normal(loc=70, scale=10, size=100)
df = pd.DataFrame({'score': scores})

# Preview the first rows.
df.head()

Unnamed: 0,score
0,74.714352
1,58.090243
2,84.32707
3,66.873481
4,62.794113


In [2]:
from sklearn.preprocessing import StandardScaler

# Standardize `score` with scikit-learn's StandardScaler and store the result
# in a new `scaled` column.
ss = StandardScaler()
score_frame = df[['score']]  # double brackets keep the selection 2-D (a one-column frame)
df['scaled'] = ss.fit(score_frame).transform(score_frame)
df

Unnamed: 0,score,scaled
0,74.714352,0.438214
1,58.090243,-1.231402
2,84.327070,1.403652
3,66.873481,-0.349271
4,62.794113,-0.758976
...,...,...
95,69.180529,-0.117567
96,66.552340,-0.381525
97,75.282881,0.495313
98,59.310112,-1.108886


In [3]:
# Binning
# scaled < -Z0       : GROUP 0 (bottom 33%)
# -Z0 <= scaled < Z0 : GROUP 1 (between the bottom 33% and the top 33%)
# scaled >= Z0       : GROUP 2 (top 33%)

In [None]:
# Percent point function (PPF): the inverse of the cumulative distribution function

#          |<- 2.5% ->|<------ 95% ------>|<- 2.5% ->|
#          |          |                  |          |
#         -1.96       0                +1.96       ...

# norm.ppf(0.025) returns -1.96, the point with 2.5% of the probability to its left
# norm.ppf(0.975) returns +1.96, the point with 97.5% of the probability to its left

In [11]:
from scipy.stats import norm

# Pick Z0 so that the lower tail of the standard normal holds 33% of the mass:
#   P(Z < -Z0) = 0.33  =>  -Z0 = norm.ppf(0.33)
lower_cutoff = norm.ppf(0.33)  # negative z-value with 33% of the mass below it
Z0 = -lower_cutoff             # flip the sign to get the positive cutoff
Z0

0.4399131656732338

In [12]:
# Label each row by where its z-score falls relative to the cutoffs -Z0 and Z0.
# np.select uses the first matching condition, so rows below -Z0 get 0 before
# the `< Z0` test is considered; rows matching neither condition get 2.
z_scores = df['scaled']
df['group'] = np.select([z_scores < -Z0, z_scores < Z0], [0, 1], default=2)
df['group'].value_counts()

1    40
2    33
0    27
Name: group, dtype: int64

In [13]:
# Lower cutoff: the z-value with 33% of the standard normal mass below it.
Z1 = norm.ppf(q=0.33)
Z1

-0.4399131656732338