In [84]:
# 정규분포를 따르는 데이터 생성
    # 한국인: 1000명, 육류소비량 53.9kg, 표준편차 5kg
    # 일본인: 1000명, 육류소비량 32.7lg, 표준편차 3kg

In [131]:
# 기본 패키지
import pandas as pd
import numpy as np
# 데이터 전처리 및 분석
import scipy.stats as ss
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [118]:
# 주어진 조건 맞춰서 생성
smaple_nb = 1000
ko_mu, ko_sigma = 53.9, 5
jp_mu, jp_sigma = 32.71, 3

In [119]:
# 방법1
# 표준편차 * 평균0,표준편차1인 값(표본개수만큼) + 평균
ko_meat_csmp = ko_sigma * np.random.randn(smaple_nb) + ko_mu
jp_meat_csmp = jp_sigma * np.random.randn(smaple_nb) + jp_mu

print(len(ko_meat_csmp), ko_meat_csmp[0], ko_meat_csmp.mean(), ko_meat_csmp.std())
print(len(jp_meat_csmp), jp_meat_csmp[0], jp_meat_csmp.mean(), jp_meat_csmp.std())

1000 57.25294095879133 53.83362651154942 4.987996506310356
1000 32.816225159707756 32.741706074973365 3.0280980442696532


In [120]:
# 방법2
# np.random.normal(mu, sigma, sample_nb) 사용
ko_meat_csmp = np.random.normal(ko_mu, ko_sigma, smaple_nb)
jp_meat_csmp = np.random.normal(jp_mu, jp_sigma, smaple_nb)
print(len(ko_meat_csmp), ko_meat_csmp[0], ko_meat_csmp.mean(), ko_meat_csmp.std())
print(len(jp_meat_csmp), jp_meat_csmp[0], jp_meat_csmp.mean(), jp_meat_csmp.std())

1000 55.90015676206148 53.6238842343757 4.89761719467146
1000 27.86524361579997 32.74732769393717 2.8992318136273996


In [121]:
# df 생성
meat_csmp = pd.DataFrame({'Korean': ko_meat_csmp, 'Japanese': jp_meat_csmp})
meat_csmp.head()

Unnamed: 0,Korean,Japanese
0,55.900157,27.865244
1,52.540454,33.289671
2,46.131438,31.013592
3,59.923743,35.693987
4,54.26174,32.29345


In [122]:
# 정규화 진행:  Z-표준화
# 각 데이터 값에서 평균을 뺀 후, 표준편차로 나누어준 값
# 방법1 | numpy > z = (x - mean(x)) / std(x)
# 방법2 | scipy.stats > zscore()
# 방법3 | sklearn.preprocessing > StandardScaler().fit_trandsform()

In [123]:
# 방법1 | numpy > z = (x - mean(x) / std(x))
std1_meat_csmp = (meat_csmp - np.mean(meat_csmp)) / np.std(meat_csmp)
# 기존 df에 열추가해서 넣기
meat_csmp['Korean_Z_std_np'] = std1_meat_csmp['Korean']
meat_csmp['Japanese_Z_std_np'] = std1_meat_csmp['Japanese']

display(meat_csmp.head(3))
print('-'*50)
print(std1_meat_csmp.mean())
print('-'*50)
print(std1_meat_csmp.std())

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np
0,55.900157,27.865244,0.464771,-1.683923
1,52.540454,33.289671,-0.221216,0.187065
2,46.131438,31.013592,-1.529815,-0.597998


--------------------------------------------------
Korean     -1.568523e-15
Japanese   -3.366196e-16
dtype: float64
--------------------------------------------------
Korean      1.0005
Japanese    1.0005
dtype: float64


In [124]:
# 방법2 | scipy.stats > zscore()
# 기존 df에 열 추가해서 넣기
meat_csmp['Korean_Z_std_ss'] = ss.zscore(meat_csmp['Korean'])
meat_csmp['Japanese_Z_std_ss'] = ss.zscore(meat_csmp['Japanese'])

display(meat_csmp.head())
print('-'*50)
print(meat_csmp[['Korean_Z_std_ss', 'Japanese_Z_std_ss']].mean())
print('-'*50)
print(meat_csmp[['Korean_Z_std_ss', 'Japanese_Z_std_ss']].std())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss
0,55.900157,27.865244,0.464771,-1.683923,0.464771,-1.683923
1,52.540454,33.289671,-0.221216,0.187065,-0.221216,0.187065
2,46.131438,31.013592,-1.529815,-0.597998,-1.529815,-0.597998
3,59.923743,35.693987,1.286311,1.016359,1.286311,1.016359
4,54.26174,32.29345,0.130238,-0.156551,0.130238,-0.156551


--------------------------------------------------
Korean_Z_std_ss     -1.566747e-15
Japanese_Z_std_ss   -3.366196e-16
dtype: float64
--------------------------------------------------
Korean_Z_std_ss      1.0005
Japanese_Z_std_ss    1.0005
dtype: float64


In [125]:
# 방법3 | sklearn.preprocessing > StandardScaler().fit_trandsform()
# scaler 이름으로 호출
scaler = StandardScaler()

# dataframe 형태로 넣어야 해서 이중 리스트
meat_csmp['Korean_Z_std_sk'] = scaler.fit_transform(meat_csmp[['Korean']])
meat_csmp['Japanese_Z_std_sk'] = scaler.fit_transform(meat_csmp[['Japanese']])

display(meat_csmp.head())
print('-'*50)
print(meat_csmp[['Korean_Z_std_sk', 'Japanese_Z_std_sk']].mean())
print('-'*50)
print(meat_csmp[['Korean_Z_std_sk', 'Japanese_Z_std_sk']].std())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk
0,55.900157,27.865244,0.464771,-1.683923,0.464771,-1.683923,0.464771,-1.683923
1,52.540454,33.289671,-0.221216,0.187065,-0.221216,0.187065,-0.221216,0.187065
2,46.131438,31.013592,-1.529815,-0.597998,-1.529815,-0.597998,-1.529815,-0.597998
3,59.923743,35.693987,1.286311,1.016359,1.286311,1.016359,1.286311,1.016359
4,54.26174,32.29345,0.130238,-0.156551,0.130238,-0.156551,0.130238,-0.156551


--------------------------------------------------
Korean_Z_std_sk     -1.566747e-15
Japanese_Z_std_sk   -3.366196e-16
dtype: float64
--------------------------------------------------
Korean_Z_std_sk      1.0005
Japanese_Z_std_sk    1.0005
dtype: float64


In [126]:
# 정규화 진행: Min-Max 정규화
# 최대값을 1 최소값을 0 으로 맞추는 방법
# 방법1 | np > (x-min(x)) / (max(x)-min(x))
# 방법2 | sklearn.preprocessing > MinMaxScaler.fit_transform()

In [127]:
# 방법1 | np > (x-min(x)) / (max(x)-min(x))
meat_csmp['Korean_MM_std_np'] = (meat_csmp['Korean'] - np.min(meat_csmp['Korean'])) / (np.max(meat_csmp['Korean']) - np.min(meat_csmp['Korean']))
meat_csmp['Japanese_MM_std_np'] = (meat_csmp['Japanese'] - np.min(meat_csmp['Japanese'])) / (np.max(meat_csmp['Japanese']) - np.min(meat_csmp['Japanese']))

display(meat_csmp.head(3))
print('-'*50)
print(meat_csmp[['Korean_MM_std_np', 'Japanese_MM_std_np']].min())
print('-'*50)
print(meat_csmp[['Korean_MM_std_np', 'Japanese_MM_std_np']].max())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk,Korean_MM_std_np,Japanese_MM_std_np
0,55.900157,27.865244,0.464771,-1.683923,0.464771,-1.683923,0.464771,-1.683923,0.627846,0.198191
1,52.540454,33.289671,-0.221216,0.187065,-0.221216,0.187065,-0.221216,0.187065,0.517462,0.507497
2,46.131438,31.013592,-1.529815,-0.597998,-1.529815,-0.597998,-1.529815,-0.597998,0.306894,0.377713


--------------------------------------------------
Korean_MM_std_np      0.0
Japanese_MM_std_np    0.0
dtype: float64
--------------------------------------------------
Korean_MM_std_np      1.0
Japanese_MM_std_np    1.0
dtype: float64


In [130]:
# 방법2 | sklearn.preprocessing > MinMaxScaler.fit_transform()
scaler = MinMaxScaler()

# df 형태로 넣어야 해서 이중 리스트
meat_csmp['Korean_MM_std_sk'] = scaler.fit_transform(meat_csmp[['Korean']])
meat_csmp['Japanese_MM_std_sk'] = scaler.fit_transform(meat_csmp[['Japanese']])

display(meat_csmp.head(3))
print('-'*50)
print(meat_csmp[['Korean_MM_std_sk', 'Japanese_MM_std_sk']].min())
print('-'*50)
print(meat_csmp[['Korean_MM_std_sk', 'Japanese_MM_std_sk']].max())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk,Korean_MM_std_np,Japanese_MM_std_np,Korean_MM_std_sk,Japanese_MM_std_sk
0,55.900157,27.865244,0.464771,-1.683923,0.464771,-1.683923,0.464771,-1.683923,0.627846,0.198191,0.627846,0.198191
1,52.540454,33.289671,-0.221216,0.187065,-0.221216,0.187065,-0.221216,0.187065,0.517462,0.507497,0.517462,0.507497
2,46.131438,31.013592,-1.529815,-0.597998,-1.529815,-0.597998,-1.529815,-0.597998,0.306894,0.377713,0.306894,0.377713


--------------------------------------------------
Korean_MM_std_sk      0.0
Japanese_MM_std_sk    0.0
dtype: float64
--------------------------------------------------
Korean_MM_std_sk      1.0
Japanese_MM_std_sk    1.0
dtype: float64


In [133]:
# 분포의 왜도 구하기
# 양수일 수록 왼쪽에, 음수일 수록 오른쪽으로 치우치는 모양
df = pd.read_csv('C:/Engineer_Big_Data_Analysis/DATAS/이기적데이터/USJudgeRatings.csv')
print(df.shape)
df.head()

(43, 13)


Unnamed: 0.1,Unnamed: 0,CONT,INTG,DMNR,DILG,CFMG,DECI,PREP,FAMI,ORAL,WRIT,PHYS,RTEN
0,"AARONSON,L.H.",5.7,7.9,7.7,7.3,7.1,7.4,7.1,7.1,7.1,7.0,8.3,7.8
1,"ALEXANDER,J.M.",6.8,8.9,8.8,8.5,7.8,8.1,8.0,8.0,7.8,7.9,8.5,8.7
2,"ARMENTANO,A.J.",7.2,8.1,7.8,7.8,7.5,7.6,7.5,7.5,7.3,7.4,7.9,7.8
3,"BERDON,R.I.",6.8,8.8,8.5,8.8,8.3,8.5,8.7,8.7,8.4,8.5,8.8,8.7
4,"BRACKEN,J.J.",7.3,6.4,4.3,6.5,6.0,6.2,5.7,5.7,5.1,5.3,5.5,4.8


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  43 non-null     object 
 1   CONT        43 non-null     float64
 2   INTG        43 non-null     float64
 3   DMNR        43 non-null     float64
 4   DILG        43 non-null     float64
 5   CFMG        43 non-null     float64
 6   DECI        43 non-null     float64
 7   PREP        43 non-null     float64
 8   FAMI        43 non-null     float64
 9   ORAL        43 non-null     float64
 10  WRIT        43 non-null     float64
 11  PHYS        43 non-null     float64
 12  RTEN        43 non-null     float64
dtypes: float64(12), object(1)
memory usage: 4.5+ KB
