## 7장. 데이터 정제 및 준비

In [1]:
import numpy as np 
import pandas as pd 

### 1. 결측치 제외 

In [2]:
# Load the sample table that contains missing values.
# Note: dropna(), used in the next cell, drops the ROWS that contain NA
# (not the columns).
data = pd.read_csv("examples/NA_data1.csv")
data 

Unnamed: 0,A,B,C
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [3]:
# Keep only the rows that have no missing value at all.
# axis=0 / how="any" are dropna's defaults, spelled out for clarity.
data_clean = data.dropna(axis=0, how="any")
data_clean

Unnamed: 0,A,B,C
0,1.0,6.5,3.0


In [4]:
# Drop a row only when every value in it is NA.
data_clean2 = data.dropna(axis="index", how="all")
data_clean2

Unnamed: 0,A,B,C
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [5]:
# Load a second sample table; in the output shown below, column D is NA in every row.
data2 = pd.read_csv("examples/NA_data2.csv")
data2 

Unnamed: 0,A,B,C,D
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [6]:
# Drop every column whose values are all NA.
data2_clean = data2.dropna(axis="columns", how="all")
data2_clean

Unnamed: 0,A,B,C
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### 2. 결측값 대체 

In [7]:
# Replace every NA with 0.
data2.fillna(value=0)

Unnamed: 0,A,B,C,D
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [8]:
# Forward-fill: replace each NA with the previous non-NA value in its column.
# Column D has no non-NA value to carry forward, so it stays NA.
# ffill() is used instead of fillna(method='ffill') because the `method`
# argument of fillna is deprecated in recent pandas releases.
data2.ffill()

Unnamed: 0,A,B,C,D
0,1.0,6.5,3.0,
1,1.0,6.5,3.0,
2,1.0,6.5,3.0,
3,1.0,6.5,3.0,


### 3. 중복값 제거 

In [9]:
# Load the name/score sample; in the output shown below, row 8 repeats row 2 (Park, 88).
data = pd.read_csv('examples/data_duplicated.csv')
data

Unnamed: 0,name,score
0,Kim,90
1,Lee,80
2,Park,88
3,Jang,76
4,Joe,89
5,Moon,79
6,Hwang,59
7,Lim,94
8,Park,88


In [10]:
# Remove fully duplicated rows, keeping the first occurrence
# (keep="first" is the default): the duplicate at index 8 is dropped.
data.drop_duplicates(keep="first")

Unnamed: 0,name,score
0,Kim,90
1,Lee,80
2,Park,88
3,Jang,76
4,Joe,89
5,Moon,79
6,Hwang,59
7,Lim,94


### 4. 데이터 변형

In [11]:
# Load the food/ounces sample; the food names shown below mix upper and
# lower case (e.g. 'Pastrami', 'Bacon').
data = pd.read_csv('examples/data_meat.csv')
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [12]:
# Lookup table: lower-case food name -> animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [13]:
# Convert every string in the 'food' column to lower case so the values
# match the lower-case keys of meat_to_animal.
data_lower = data['food'].str.lower()
data_lower

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [14]:
# Look up each lower-cased food name in meat_to_animal and store the
# result in a new 'animal' column of data.
data['animal'] = data_lower.map(meat_to_animal)
data 

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [15]:
# Same result as the two-step version above, done in one step: the lambda
# lower-cases each food name and looks it up in meat_to_animal.
# The mapped Series is assigned to the 'animal' column; without the
# assignment the result of map() is thrown away and `data` is merely
# redisplayed unchanged.
# Note: meat_to_animal[...] raises KeyError for a food that is not in the dict.
data['animal'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [16]:
# Turning specific values into NA.
# In this sample, 9999 is a sentinel that stands for a missing value.
data = pd.Series([1.1, 9999.0, 2.0, 9999.0, -100.0, 3.4])
data

0       1.1
1    9999.0
2       2.0
3    9999.0
4    -100.0
5       3.4
dtype: float64

In [17]:
# Replace the 9999 sentinel with NaN.
data.replace(to_replace=9999, value=np.nan)

0      1.1
1      NaN
2      2.0
3      NaN
4   -100.0
5      3.4
dtype: float64

In [18]:
# Replace both 9999 and -100 with NaN.
# Only the listed values are replaced; -100 is the one negative value in this sample.
data.replace(to_replace=[9999, -100], value=np.nan)

0    1.1
1    NaN
2    2.0
3    NaN
4    NaN
5    3.4
dtype: float64

In [19]:
# Pairwise replacement: 9999 -> NaN and -100 -> 0.
data.replace(to_replace=[9999, -100], value=[np.nan, 0])

0    1.1
1    NaN
2    2.0
3    NaN
4    0.0
5    3.4
dtype: float64

In [20]:
# Use a dict to map several values to different replacements at once.
data.replace(to_replace={9999: np.nan, -100: 0})

0    1.1
1    NaN
2    2.0
3    NaN
4    0.0
5    3.4
dtype: float64

In [21]:
# Sample 3x4 frame used to demonstrate changing the case of index and column labels.
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'New York'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [22]:
# Upper-case the index labels and lower-case the column labels.
# rename() returns a new frame, so df itself is unchanged.
df.rename(str.upper, axis="index").rename(str.lower, axis="columns")

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [23]:
# Title-case the index labels and upper-case the column labels.
# str.title capitalizes the first letter of every word; the index labels
# are already in that form, so they display unchanged.
df.rename(str.title, axis="index").rename(str.upper, axis="columns")

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [24]:
# Rename only selected labels by passing dict mappings to rename();
# labels that are not keys of the dict are left as they are.
df.rename(
    columns={
        'one': '1/4분기',
        'two': '2/4분기',
        'three': '3/4분기',
        'four': '4/4분기',
    },
    index={'New York': 'NY'},
)

Unnamed: 0,1/4분기,2/4분기,3/4분기,4/4분기
Ohio,0,1,2,3
Colorado,4,5,6,7
NY,8,9,10,11


In [25]:
# Categorizing age data: load the ages sample and preview its first rows.
df_age = pd.read_csv('examples/data_ages.csv')
df_age.head()

Unnamed: 0,ages
0,34
1,47
2,62
3,27
4,22


In [26]:
# One label per interval between consecutive bin edges. With right=True
# (the pd.cut default) each interval includes its right edge:
# (0, 18], (18, 25], (25, 35], (35, 60], (60, 100].
age_cat = ['0.baby', '1.Youth', '2.Young Adult', '3.Middle Age', '4.Senior']
age_bin = [0, 18, 25, 35, 60, 100]

df_age_new = pd.cut(x=df_age['ages'], bins=age_bin, right=True, labels=age_cat)
df_age_new.head(n=10)

0    2.Young Adult
1     3.Middle Age
2         4.Senior
3    2.Young Adult
4          1.Youth
5           0.baby
6    2.Young Adult
7    2.Young Adult
8           0.baby
9          1.Youth
Name: ages, dtype: category
Categories (5, object): [0.baby < 1.Youth < 2.Young Adult < 3.Middle Age < 4.Senior]

In [27]:
# Removing outliers.
# 1) Draw 4,000 standard-normal random numbers as a 1000-row x 4-column frame.
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.003519,-0.034217,0.012381,-0.020179
std,1.005541,1.012377,1.018047,1.022032
min,-3.125491,-3.274537,-3.011499,-3.179852
25%,-0.662084,-0.735592,-0.675314,-0.687771
50%,-0.00875,-0.066333,0.065606,-0.001281
75%,0.710279,0.633623,0.716682,0.64126
max,3.447996,3.30086,3.111104,3.014596


In [29]:
# 2) Select the rows in which at least one value has an absolute value above 3.
# `axis` is passed by keyword: the positional form any(1) is deprecated and
# is rejected by pandas >= 2.0.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
75,0.878641,-3.062946,-0.380687,0.508486
130,-3.125491,-0.13172,0.99562,0.468487
146,-1.120821,3.30086,-0.164788,0.544968
206,3.447996,-0.396115,1.666177,0.668314
331,-3.00817,-0.76203,-0.615217,0.655419
368,-0.099932,-3.274537,-0.515099,-1.423129
568,0.374343,0.452329,3.111104,0.390866
693,-0.224636,1.388838,-0.974791,-3.179852
714,3.398888,-1.133208,1.333054,-0.049644
754,-0.629826,2.253864,-3.011499,1.216468


In [30]:
# Random sampling: draw 3 rows from data.
data.sample(n=3)

Unnamed: 0,0,1,2,3
735,0.269212,-2.300335,2.009274,0.723153
373,-0.488548,-1.1718,1.116928,2.073572
797,-0.288486,-0.526827,0.103652,2.18958


In [35]:
# Sample frame whose 'key' column will be converted to dummy (indicator) columns.
df = pd.DataFrame(
    data={
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    },
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [33]:
# One indicator column per distinct value of the 'key' column.
# NOTE(review): pandas >= 2.0 returns boolean columns by default; passing
# dtype=int would reproduce the 0/1 display shown below. Confirm the
# target pandas version.
pd.get_dummies(data=df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [None]:
## 문자열 다루기, 정규표현식 skip 