In [1]:
import numpy as np
from scipy.stats import *
import pandas as pd

### 분산 (Variance)

In [2]:
x = [1,2,3,4,5]
print(np.var(x, ddof=1)) # dominator : n-1
print(np.array(x).var()) # dominator : n
print(pd.Series(x).var(ddof=0)) # dominator : n

2.5
2.0
2.0


### 표준편차 (Standard Deviation)

In [3]:
print(np.std(x, ddof=1)) # dominator = n-1
print(np.array(x).std(ddof=0)) # dominator = n
print(pd.Series(x).std(ddof=1)) # dominator = n-1

1.5811388300841898
1.4142135623730951
1.5811388300841898


### 변동계수 (Coefficient of Variation)

In [4]:
x1 = np.array([1,2,3,4,5])
x2 = x1 * 10

print(np.std(x1, ddof=1))
print(np.std(x2, ddof=1))

1.5811388300841898
15.811388300841896


In [5]:
print(variation(x1))
print(variation(x2))

0.47140452079103173
0.4714045207910317


In [6]:
print(np.std(x1, ddof=1)/np.mean(x1))
print(np.std(x2, ddof=1)/np.mean(x2))

0.5270462766947299
0.5270462766947299


### 표준화 (Scaling)
데이터의 평균을 0으로 표준편차를 1로 만드는 방법

In [7]:
x1

array([1, 2, 3, 4, 5])

In [8]:
x2

array([10, 20, 30, 40, 50])

In [9]:
# standard scaling - mean : 0, std : 1
z1 = (x1 - x1.mean()) / x1.std()
z2 = (x2 - x2.mean()) / x2.std()

print(z1)
print(z2)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [10]:
# min-max scaling - 0 < data < 1
z1 = (x1 - x1.mean()) / (x1.max() - x1.min())
z2 = (x2 - x2.mean()) / (x2.max() - x2.min())

print(z1)
print(z2)

[-0.5  -0.25  0.    0.25  0.5 ]
[-0.5  -0.25  0.    0.25  0.5 ]


In [11]:
X = pd.DataFrame({'X1' : [1,2,3,4,5],
                  'X2': [10,20,30,40,50]})

X

Unnamed: 0,X1,X2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [12]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
scaler = MinMaxScaler()
Z = scaler.fit_transform(X)
pd.DataFrame(Z, columns=['X1', 'X2'])

Unnamed: 0,X1,X2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


### 4분위 IQR(InterQuartile Range)
산포를 나타내는 직관적 지표

In [15]:
x = np.random.normal(100, 20, size=1000)
x[3]

89.57303656012921

In [16]:
np.max(x)

158.2005152882765

In [17]:
np.min(x)

40.74159077692245

In [18]:
print(np.ptp(x)) # Peak to Peak
print(np.max(x) - np.min(x))

117.45892451135406
117.45892451135406


In [19]:
print(np.quantile(x, 0.75) - np.quantile(x, 0.25))
print(iqr(x))

28.591736317120038
28.591736317120038
