In [1]:
# 정규분포를 따르는 데이터 생성
    # 한국인: 1000명, 육류소비량 53.9kg, 표준편차 5kg
    # 일본인: 1000명, 육류소비량 32.7lg, 표준편차 3kg

In [4]:
# 기본 패키지
import pandas as pd
import numpy as np
# 데이터 전처리 및 분석
import scipy.stats as ss
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
# 주어진 조건 맞춰서 생성
smaple_nb = 1000
ko_mu, ko_sigma = 53.9, 5
jp_mu, jp_sigma = 32.71, 3

In [6]:
# 방법1
# 표준편차 * 평균0,표준편차1인 값(표본개수만큼) + 평균
ko_meat_csmp = ko_sigma * np.random.randn(smaple_nb) + ko_mu
jp_meat_csmp = jp_sigma * np.random.randn(smaple_nb) + jp_mu

print(len(ko_meat_csmp), ko_meat_csmp[0], ko_meat_csmp.mean(), ko_meat_csmp.std())
print(len(jp_meat_csmp), jp_meat_csmp[0], jp_meat_csmp.mean(), jp_meat_csmp.std())

1000 52.89556605514723 54.00431271619144 5.015266140516428
1000 33.93875290473452 32.74272498080108 2.9872721776433493


In [7]:
# 방법2
# np.random.normal(mu, sigma, sample_nb) 사용
ko_meat_csmp = np.random.normal(ko_mu, ko_sigma, smaple_nb)
jp_meat_csmp = np.random.normal(jp_mu, jp_sigma, smaple_nb)
print(len(ko_meat_csmp), ko_meat_csmp[0], ko_meat_csmp.mean(), ko_meat_csmp.std())
print(len(jp_meat_csmp), jp_meat_csmp[0], jp_meat_csmp.mean(), jp_meat_csmp.std())

1000 53.391792484024286 53.818727191092606 4.851088626096836
1000 32.38665473890962 32.68776161762123 2.97252457591566


In [8]:
# df 생성
meat_csmp = pd.DataFrame({'Korean': ko_meat_csmp, 'Japanese': jp_meat_csmp})
meat_csmp.head()

Unnamed: 0,Korean,Japanese
0,53.391792,32.386655
1,50.12717,28.887156
2,45.713893,34.38833
3,44.07342,34.207942
4,58.415256,36.040411


In [9]:
# 정규화 진행:  Z-표준화
# 각 데이터 값에서 평균을 뺀 후, 표준편차로 나누어준 값
# 방법1 | numpy > z = (x - mean(x)) / std(x)
# 방법2 | scipy.stats > zscore()
# 방법3 | sklearn.preprocessing > StandardScaler().fit_trandsform()

In [10]:
# 방법1 | numpy > z = (x - mean(x) / std(x))
std1_meat_csmp = (meat_csmp - np.mean(meat_csmp)) / np.std(meat_csmp)
# 기존 df에 열추가해서 넣기
meat_csmp['Korean_Z_std_np'] = std1_meat_csmp['Korean']
meat_csmp['Japanese_Z_std_np'] = std1_meat_csmp['Japanese']

display(meat_csmp.head(3))
print('-'*50)
print(std1_meat_csmp.mean())
print('-'*50)
print(std1_meat_csmp.std())

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np
0,53.391792,32.386655,2.089953,-3.655677
1,50.12717,28.887156,1.416986,-4.832959
2,45.713893,34.38833,0.507236,-2.982285


--------------------------------------------------
Korean      2.177961
Japanese   -3.554380
dtype: float64
--------------------------------------------------
Korean      1.0005
Japanese    1.0005
dtype: float64


In [11]:
# 방법2 | scipy.stats > zscore()
# 기존 df에 열 추가해서 넣기
meat_csmp['Korean_Z_std_ss'] = ss.zscore(meat_csmp['Korean'])
meat_csmp['Japanese_Z_std_ss'] = ss.zscore(meat_csmp['Japanese'])

display(meat_csmp.head())
print('-'*50)
print(meat_csmp[['Korean_Z_std_ss', 'Japanese_Z_std_ss']].mean())
print('-'*50)
print(meat_csmp[['Korean_Z_std_ss', 'Japanese_Z_std_ss']].std())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss
0,53.391792,32.386655,2.089953,-3.655677,-0.088008,-0.101297
1,50.12717,28.887156,1.416986,-4.832959,-0.760975,-1.278578
2,45.713893,34.38833,0.507236,-2.982285,-1.670725,0.572096
3,44.07342,34.207942,0.16907,-3.04297,-2.008891,0.511411
4,58.415256,36.040411,3.125486,-2.426501,0.947525,1.127879


--------------------------------------------------
Korean_Z_std_ss     -1.101341e-16
Japanese_Z_std_ss    5.506706e-17
dtype: float64
--------------------------------------------------
Korean_Z_std_ss      1.0005
Japanese_Z_std_ss    1.0005
dtype: float64


In [12]:
# 방법3 | sklearn.preprocessing > StandardScaler().fit_trandsform()
# scaler 이름으로 호출
scaler = StandardScaler()

# dataframe 형태로 넣어야 해서 이중 리스트
meat_csmp['Korean_Z_std_sk'] = scaler.fit_transform(meat_csmp[['Korean']])
meat_csmp['Japanese_Z_std_sk'] = scaler.fit_transform(meat_csmp[['Japanese']])

display(meat_csmp.head())
print('-'*50)
print(meat_csmp[['Korean_Z_std_sk', 'Japanese_Z_std_sk']].mean())
print('-'*50)
print(meat_csmp[['Korean_Z_std_sk', 'Japanese_Z_std_sk']].std())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk
0,53.391792,32.386655,2.089953,-3.655677,-0.088008,-0.101297,-0.088008,-0.101297
1,50.12717,28.887156,1.416986,-4.832959,-0.760975,-1.278578,-0.760975,-1.278578
2,45.713893,34.38833,0.507236,-2.982285,-1.670725,0.572096,-1.670725,0.572096
3,44.07342,34.207942,0.16907,-3.04297,-2.008891,0.511411,-2.008891,0.511411
4,58.415256,36.040411,3.125486,-2.426501,0.947525,1.127879,0.947525,1.127879


--------------------------------------------------
Korean_Z_std_sk     -1.101341e-16
Japanese_Z_std_sk    5.506706e-17
dtype: float64
--------------------------------------------------
Korean_Z_std_sk      1.0005
Japanese_Z_std_sk    1.0005
dtype: float64


In [13]:
# 정규화 진행: Min-Max 정규화
# 최대값을 1 최소값을 0 으로 맞추는 방법
# 방법1 | np > (x-min(x)) / (max(x)-min(x))
# 방법2 | sklearn.preprocessing > MinMaxScaler.fit_transform()

In [14]:
# 방법1 | np > (x-min(x)) / (max(x)-min(x))
meat_csmp['Korean_MM_std_np'] = (meat_csmp['Korean'] - np.min(meat_csmp['Korean'])) / (np.max(meat_csmp['Korean']) - np.min(meat_csmp['Korean']))
meat_csmp['Japanese_MM_std_np'] = (meat_csmp['Japanese'] - np.min(meat_csmp['Japanese'])) / (np.max(meat_csmp['Japanese']) - np.min(meat_csmp['Japanese']))

display(meat_csmp.head(3))
print('-'*50)
print(meat_csmp[['Korean_MM_std_np', 'Japanese_MM_std_np']].min())
print('-'*50)
print(meat_csmp[['Korean_MM_std_np', 'Japanese_MM_std_np']].max())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk,Korean_MM_std_np,Japanese_MM_std_np
0,53.391792,32.386655,2.089953,-3.655677,-0.088008,-0.101297,-0.088008,-0.101297,0.492678,0.483078
1,50.12717,28.887156,1.416986,-4.832959,-0.760975,-1.278578,-0.760975,-1.278578,0.37492,0.296786
2,45.713893,34.38833,0.507236,-2.982285,-1.670725,0.572096,-1.670725,0.572096,0.215727,0.589635


--------------------------------------------------
Korean_MM_std_np      0.0
Japanese_MM_std_np    0.0
dtype: float64
--------------------------------------------------
Korean_MM_std_np      1.0
Japanese_MM_std_np    1.0
dtype: float64


In [15]:
# 방법2 | sklearn.preprocessing > MinMaxScaler.fit_transform()
scaler = MinMaxScaler()

# df 형태로 넣어야 해서 이중 리스트
meat_csmp['Korean_MM_std_sk'] = scaler.fit_transform(meat_csmp[['Korean']])
meat_csmp['Japanese_MM_std_sk'] = scaler.fit_transform(meat_csmp[['Japanese']])

display(meat_csmp.head(3))
print('-'*50)
print(meat_csmp[['Korean_MM_std_sk', 'Japanese_MM_std_sk']].min())
print('-'*50)
print(meat_csmp[['Korean_MM_std_sk', 'Japanese_MM_std_sk']].max())

Unnamed: 0,Korean,Japanese,Korean_Z_std_np,Japanese_Z_std_np,Korean_Z_std_ss,Japanese_Z_std_ss,Korean_Z_std_sk,Japanese_Z_std_sk,Korean_MM_std_np,Japanese_MM_std_np,Korean_MM_std_sk,Japanese_MM_std_sk
0,53.391792,32.386655,2.089953,-3.655677,-0.088008,-0.101297,-0.088008,-0.101297,0.492678,0.483078,0.492678,0.483078
1,50.12717,28.887156,1.416986,-4.832959,-0.760975,-1.278578,-0.760975,-1.278578,0.37492,0.296786,0.37492,0.296786
2,45.713893,34.38833,0.507236,-2.982285,-1.670725,0.572096,-1.670725,0.572096,0.215727,0.589635,0.215727,0.589635


--------------------------------------------------
Korean_MM_std_sk      0.0
Japanese_MM_std_sk    0.0
dtype: float64
--------------------------------------------------
Korean_MM_std_sk      1.0
Japanese_MM_std_sk    1.0
dtype: float64


In [17]:
# 분포의 왜도 구하기
# 양수일 수록 왼쪽에, 음수일 수록 오른쪽으로 치우치는 모양
df = pd.read_csv('C:/Engineer_Big_Data_Analysis/DATAS/이기적데이터/USJudgeRatings.csv')
print(df.shape)
df.head()

(43, 13)


Unnamed: 0.1,Unnamed: 0,CONT,INTG,DMNR,DILG,CFMG,DECI,PREP,FAMI,ORAL,WRIT,PHYS,RTEN
0,"AARONSON,L.H.",5.7,7.9,7.7,7.3,7.1,7.4,7.1,7.1,7.1,7.0,8.3,7.8
1,"ALEXANDER,J.M.",6.8,8.9,8.8,8.5,7.8,8.1,8.0,8.0,7.8,7.9,8.5,8.7
2,"ARMENTANO,A.J.",7.2,8.1,7.8,7.8,7.5,7.6,7.5,7.5,7.3,7.4,7.9,7.8
3,"BERDON,R.I.",6.8,8.8,8.5,8.8,8.3,8.5,8.7,8.7,8.4,8.5,8.8,8.7
4,"BRACKEN,J.J.",7.3,6.4,4.3,6.5,6.0,6.2,5.7,5.7,5.1,5.3,5.5,4.8


In [23]:
# 두 개 열의 왜도를 확인
print(ss.skew(df['CONT'])) # 양수는 왼쪽으로 치우침
print(ss.skew(df['PHYS'])) # 음수는 오른쪽으로 치우침

1.0859724796276253
-1.5582154642293153


In [49]:
# 로그를 취해서 왜도를 조정해주자
# log 변환
# 양수일 경우
print(ss.skew(np.log(df['CONT'])))
# 음수일 경우
print(ss.skew(np.log((np.max(df['PHYS']) + 1) - df['PHYS'])))

# log10 변환
print(ss.skew(np.log10(df['CONT'])))
print(ss.skew(np.log10((np.max(df['PHYS']) + 1) - df['PHYS'])))

0.6555571886692603
0.5824357748750443
0.6555571886692441
0.582435774875044


In [70]:
# 범주화, 이산형화 진행
# 데이터 먼저 만들기
data = [["철수",52], ["영희",92], ["미영",84], ["시완",71], ["미경",65], ["영환",81], ["숙경",66], ["부영",77], ["민섭",73], ["보연",74]]
df = pd.DataFrame(data,columns=['이름','수학점수'])
df

Unnamed: 0,이름,수학점수
0,철수,52
1,영희,92
2,미영,84
3,시완,71
4,미경,65
5,영환,81
6,숙경,66
7,부영,77
8,민섭,73
9,보연,74


In [71]:
# 조건을 사용해서 구간을 직접 지정하기
# 등급 컬럼을 만들고 0으로 초기화
df['등급'] = 0 
# 직접 지정
df.loc[df['수학점수'] < 60, '등급'] = 'F'
df.loc[(df['수학점수']>=60) & (df['수학점수']<70), '등급'] = 'D'
df.loc[(df["수학점수"]>=70) & (df["수학점수"]<80), "등급"] = "C"
df.loc[(df["수학점수"]>=80) & (df["수학점수"]<90), "등급"] = "B"
df.loc[(df["수학점수"]>=90) & (df["수학점수"]<=100), "등급"] = "A"
df

  df.loc[df['수학점수'] < 60, '등급'] = 'F'


Unnamed: 0,이름,수학점수,등급
0,철수,52,F
1,영희,92,A
2,미영,84,B
3,시완,71,C
4,미경,65,D
5,영환,81,B
6,숙경,66,D
7,부영,77,C
8,민섭,73,C
9,보연,74,C


In [72]:
# cut() 함수 활용
# bins로 직접 수치를 지정
# pd.cut(x=데이터, bins=[경계값리스트], labels=[bin이름], include_lowest=True)
# include_lowest: 낮음 경겟값 포함 여부(60 <= x < 70 에서 60)
df['등급'] = pd.cut(df['수학점수'], bins=[0, 60, 70, 80, 90, 100], labels=['F', 'D', 'C', 'B', 'A'], include_lowest=True)
df

Unnamed: 0,이름,수학점수,등급
0,철수,52,F
1,영희,92,A
2,미영,84,B
3,시완,71,C
4,미경,65,D
5,영환,81,B
6,숙경,66,D
7,부영,77,C
8,민섭,73,C
9,보연,74,C


In [73]:
# qcut() 함수 사용
# 나누고자 하는 개수를 q로 지정하면 알아서 균등하게 채워짐
df['등급'] = pd.qcut(df['수학점수'], q=5, labels=['F', 'D', 'C', 'B', 'A'])
df

Unnamed: 0,이름,수학점수,등급
0,철수,52,F
1,영희,92,A
2,미영,84,A
3,시완,71,D
4,미경,65,F
5,영환,81,B
6,숙경,66,D
7,부영,77,B
8,민섭,73,C
9,보연,74,C


In [None]:
# 차원축소 PCA 하기