Chapter 5<br/>
< Pandas #1 >
===============================

[[실행 코드]](https://github.com/alstn2468/Python_For_Machine_Learning/blob/master/Chapter.5/2.ipynb)


### Pandas
- 구조화된 데이터의 처리를 지원하는 Python 라이브러리
- Python계의 엑셀
- Numpy와 통합하여, 강력한 스프레드시트 처리 기능 제공
- 인덱싱, 연산용 함수, 전처리 함수 등을 제공

### 데이터 로딩

In [1]:
import pandas as pd

In [2]:
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# The file is whitespace-delimited and has no header row. The separator is a
# regular expression, so it is written as a raw string: '\s' inside a normal
# string literal is an invalid escape sequence and triggers a warning.
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [3]:
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
df_data.columns = [
    'CRIM','ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO' ,'B', 'LSTAT', 'MEDV'] 
df_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
type(df_data.values)

numpy.ndarray

### Pandas의 구성
- Series : DataFrame 중 하나의 Column에 해당하는 데이터의 모음 Object
- DataFrame : Data Table 전체를 포함하는 Object

In [6]:
from pandas import Series, DataFrame
import pandas as pd

In [7]:
example_obj = Series()

In [8]:
list_data = [1, 2, 3, 4, 5]
example_obj = Series(data = list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Series
- ndarray-like : wraps a numpy.ndarray (a subclass of numpy.ndarray before pandas 0.13)
- Data : any type
- Index labels need not be ordered
- Duplicates are possible(but result in reduced functionality)


#### index 이름 지정 

In [9]:
list_data = [1, 2, 3, 4, 5]
list_name = ['a', 'b', 'c', 'd', 'e']
example_obj = Series(data = list_data, index = list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [10]:
import numpy as np

dict_data = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
example_obj = Series(dict_data, dtype = np.float32, name = 'example_data')
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

#### data index에 접근하기

In [11]:
example_obj['a']

1.0

#### data index에 값 할당하기

In [12]:
example_obj['a'] = 3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

#### 값 리스트만 가져오기

In [13]:
example_obj.values

array([ 3.20000005,  2.        ,  3.        ,  4.        ,  5.        ], dtype=float32)

#### index 리스트만 가져오기

In [14]:
example_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

#### Data에 대한 정보를 저장하기

In [15]:
example_obj.name = 'number'
example_obj.index.name = 'alphabet'
example_obj

alphabet
a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32

#### index 값을 기준으로 데이터 생성

In [16]:
dict_data_1 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
indexes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
series_obj_1 = Series(dict_data_1, index = indexes)
series_obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64

### Dataframe
- Numpy array-like
- Each column can have a different type
- Row and column index
- Size mutable : insert and delete columns

#### Series를 모아서 만든 Data Table = 기본 2차원

In [17]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [18]:
raw_data = {'first_name' : ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'last_name' : ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
            'age' : [42, 52, 36, 24, 73],
            'city' : ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city'])
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


#### column 선택

In [19]:
DataFrame(raw_data, columns = ['age', 'city'])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


#### 새로운 column 추가

In [20]:
DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city', 'debt'])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


#### column 선택 - series 추출

In [21]:
df = DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city', 'debt'])
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [22]:
df['first_name']

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

#### loc - index location (index label 기준)

In [23]:
df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

#### iloc - index position

In [24]:
df['age'].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [25]:
s = pd.Series(np.nan, index = [49, 48, 47, 46, 1, 2, 3, 4, 5])
s

49   NaN
48   NaN
47   NaN
46   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [26]:
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [27]:
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

#### DataFrame - Column에 새로운 데이터 할당

In [28]:
df.debt = df.age > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


#### Transpose

In [29]:
df.T

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


#### 값 출력

In [30]:
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

### csv 변환

In [31]:
df.to_csv()

',first_name,last_name,age,city,debt\n0,Jason,Miller,42,San Francisco,True\n1,Molly,Jacobson,52,Baltimore,True\n2,Tina,Ali,36,Miami,False\n3,Jake,Milner,24,Douglas,False\n4,Amy,Cooze,73,Boston,True\n'

#### DataFrame - Column을 삭제

In [32]:
del df['debt']
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [33]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


### Selection

- Selection with column names

In [34]:
df = pd.read_excel("./data/excel-comp-data.xlsx")
df.head()

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


#### 한개의 column 선택시

In [35]:
df["account"].head(2)

0    211829
1    320563
Name: account, dtype: int64

#### 1개 이상의 column 선택

In [36]:
df[["account", "street", "state"]].head(3)

Unnamed: 0,account,street,state
0,211829,34456 Sean Highway,Texas
1,320563,1311 Alvis Tunnel,NorthCarolina
2,648336,62184 Schamberger Underpass Apt. 231,Iowa


#### column 이름 없이 사용하는 index number는 row 기준 표시

In [37]:
df[:3]

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000


#### column 이름과 함께 row index 사용시, 해당 column만

In [38]:
df["account"][:3]

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [39]:
account_serires = df["account"]
account_serires[:3]

0    211829
1    320563
2    648336
Name: account, dtype: int64

#### 1개 이상의 index

In [40]:
account_serires[[0, 1, 2]]

0    211829
1    320563
2    648336
Name: account, dtype: int64

#### Boolean index

In [41]:
account_serires[account_serires < 250000]

0     211829
3     109996
4     121213
5     132971
6     145068
7     205217
8     209744
9     212303
10    214098
11    231907
12    242368
Name: account, dtype: int64

#### index 변경

In [42]:
df.index = df['account']
del df['account']
df.head()

Unnamed: 0_level_0,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


#### Basic, loc, iloc selection

- Column 과 index number

In [43]:
df[['name', 'street']][:2]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


- Column 과 index name

In [44]:
df.loc[[211829, 320563], ['name', 'street']]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


- Column number 와 index number

In [45]:
df.iloc[:2, :2]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


#### index 재설정

In [46]:
df.index = list(range(0, 15))
df.head()

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


#### Data drop
- 실제 데이터가 날아가는 것은 아니다. (drop은 새로운 객체를 반환하며, 원본 df는 바뀌지 않는다.)
- index number로 drop

df.drop(1)

- 1개 이상의 index number로 drop

In [47]:
df.drop([0, 1, 2, 3])

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000
10,"Goodwin, Homenick and Jerde",649 Cierra Forks Apt. 078,Rosaberg,Tenessee,47743,45000,120000,55000
11,Hahn-Moore,18115 Olivine Throughway,Norbertomouth,NorthDakota,31415,150000,10000,162000
12,"Frami, Anderson and Donnelly",182 Bertie Road,East Davian,Iowa,72686,162000,120000,35000
13,Walsh-Haley,2624 Beatty Parkways,Goodwinmouth,RhodeIsland,31919,55000,120000,35000


- axis 지정으로 축을 기준으로 drop<br/>
>column 중에 'city'

In [48]:
df.drop('city', axis = 1)

Unnamed: 0,name,street,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,Idaho,46308,70000,120000,35000


### Series addition operation
- index 기준으로 연산
- 겹치는 index가 없을 경우 NaN값으로 반환

In [49]:
s1 = Series(range(1, 6), index = list("abced"))
s1

a    1
b    2
c    3
e    4
d    5
dtype: int64

In [50]:
s2 = Series(range(5, 11), index = list("bcedef"))
s2

b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64

In [51]:
s1 + s2

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

In [52]:
s1.add(s2)

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

### DataFrame operation
- df는 column과 index를 모두 고려
- add operation에 fill_value = 0을 지정하면 한쪽에만 값이 있는 경우 NaN을 0으로 채워 연산 (양쪽 모두 값이 없으면 NaN)
- Operation types : add, sub, div, mul

In [53]:
df1 = DataFrame(np.arange(9).reshape(3, 3),
                columns = list("abc"))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [54]:
df2 = DataFrame(np.arange(16).reshape(4, 4), 
                index = list("abcd"))
df2

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [55]:
df1 + df2

Unnamed: 0,a,b,c,0,1,2,3
0,,,,,,,
1,,,,,,,
2,,,,,,,
a,,,,,,,
b,,,,,,,
c,,,,,,,
d,,,,,,,


In [56]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,0,1,2,3
0,0.0,1.0,2.0,,,,
1,3.0,4.0,5.0,,,,
2,6.0,7.0,8.0,,,,
a,,,,0.0,1.0,2.0,3.0
b,,,,4.0,5.0,6.0,7.0
c,,,,8.0,9.0,10.0,11.0
d,,,,12.0,13.0,14.0,15.0


- column을 기준으로 broadcasting 발생

In [57]:
df = DataFrame(np.arange(16).reshape(4, 4),
               columns = list('abcd'))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [58]:
s = Series(np.arange(10, 14),
           index = list('abcd'))
s

a    10
b    11
c    12
d    13
dtype: int64

In [59]:
df + s

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


### Series + DataFrame
- axis를 기준으로 row broadcasting 실행

In [60]:
df = DataFrame(np.arange(16).reshape(4, 4),
               columns = list('abcd'))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [61]:
s2 = Series(np.arange(10, 14))
s2

0    10
1    11
2    12
3    13
dtype: int64

In [62]:
df + s2

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [63]:
df.add(s2, axis = 0)

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


### lambda
- 한 줄로 함수를 표현하는 익명 함수 표기법
- Lisp 언어에서 시작된 기법
- 오늘날 현대언어에 많이 사용

```python
lambda argument : expression
```

#### 하나의 argument만 처리하는 lambda 함수

In [64]:
f = lambda x : x / 2
f(3)

1.5

In [65]:
f = lambda x : x ** 2
f(3)

9

#### 이름을 할당하지 않는 lambda 함수

In [66]:
(lambda x : x + 1)(5)

6

### map
- 함수와 sequence형 데이터를 인자로 받는다.
- 각 element마다 입력받은 함수를 적용 (Python 3에서는 iterator를 반환하므로 list()로 변환하여 사용)
- 일반적으로 함수를 lambda형태로 표현

```python
map(function, sequence)
```

- 2개 이상의 argument가 있을 때는 두 개의 sequence형을 써야 한다.

In [67]:
ex = [1, 2, 3, 4, 5]

f = lambda x, y : x + y
list(map(f, ex, ex))

[2, 4, 6, 8, 10]

In [68]:
list(map(lambda x : x + x, ex))

[2, 4, 6, 8, 10]

### map for Series
- Pandas의 Series type의 데이터에도 map함수 사용가능
- function 대신 dict, sequence형 자료형으로 대체 가능

In [69]:
s1 = Series(np.arange(10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [70]:
s1.map(lambda x : x ** 2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

- dict type으로 데이터 교체, 없는 값은 NaN

In [71]:
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z).head(5)

0    NaN
1      A
2      B
3      C
4    NaN
dtype: object

- s1의 값을 s2의 index에서 찾아, 해당 위치의 s2 데이터로 전환

In [72]:
s2 = Series(np.arange(10, 20))
s1.map(s2).head(5)

0    10
1    11
2    12
3    13
4    14
dtype: int64

### Map for Series

In [73]:
df = pd.read_csv("./data/wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [74]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [75]:
df["sex_code"] = df.sex.map({"male": 0, "female": 1})
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


### Replace function
- Map 함수의 기능 중 데이터 변환 기능만 담당
- 데이터 변환시 많이 사용하는 함수

In [76]:
df.sex.replace({'male': 0, 'female': 1}).head()

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

### apply for DataFrame
- Map과 달리, Series 전체(column)에 해당하는 함수를 적용
- 입력값을 Series 데이터로 받아 handling 가능
- 내장 연산 함수를 사용해도 똑같은 효과를 거둘 수 있다.
- mean, std등 사용가능
- scalar 값 이외에 Series값의 반환도 가능

In [77]:
df_info = df[["earn", "height", "age"]]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [78]:
f = lambda x : x.max() - x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

In [79]:
df_info.apply(sum)

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [80]:
df_info.apply(sum)

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [81]:
def f(x):
    """Return the min, max and mean of ``x`` as a Series labelled by statistic name."""
    labels = ["min", "max", "mean"]
    # Each label doubles as the name of the reduction method to call on x.
    stats = [getattr(x, label)() for label in labels]
    return Series(stats, index=labels)
df_info.apply(f)

Unnamed: 0,earn,height,age
min,-98.580489,57.34,22.0
max,317949.127955,77.21,95.0
mean,32446.292622,66.59264,45.328499


In [82]:
f = lambda x : -x
df_info.applymap(f).head(5)

Unnamed: 0,earn,height,age
0,-79571.299011,-73.89,-49
1,-96396.988643,-66.23,-62
2,-48710.666947,-63.77,-33
3,-80478.096153,-63.22,-95
4,-82089.345498,-63.08,-43


In [83]:
f = lambda x : -x
df_info["earn"].apply(f).head(5)

0   -79571.299011
1   -96396.988643
2   -48710.666947
3   -80478.096153
4   -82089.345498
Name: earn, dtype: float64

### describe
- Numeric type데이터의 요약 정보를 보여준다.

In [84]:
df.describe()

Unnamed: 0,earn,height,ed,age,sex_code
count,1379.0,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499,0.622915
std,31257.070006,3.818108,2.438741,15.789715,0.484832
min,-98.580489,57.34,3.0,22.0,0.0
25%,10538.790721,63.72,12.0,33.0,0.0
50%,26877.870178,66.05,13.0,42.0,1.0
75%,44506.215336,69.315,15.0,55.0,1.0
max,317949.127955,77.21,18.0,95.0,1.0


### unique
- Series data의 유일한 값을 array(list 형태)로 반환

In [85]:
df.race.unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [86]:
dict(enumerate(sorted(df["race"].unique())))

{0: 'black', 1: 'hispanic', 2: 'other', 3: 'white'}

In [87]:
# Unique race labels in order of first appearance.
races = df["race"].unique()
# Integer codes 0..n-1, one per unique label.
value = list(range(len(races)))
# The matching labels as plain Python strings.
key = [str(race) for race in races]

value, key

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

### sum
- 기본적인 column 또는 row 값의 연산을 지원
- sum, mean, min, max, count, median, mad, var 등 (mad는 pandas 2.0에서 제거됨)

In [88]:
df.sum(axis = 0)

earn                                              4.47434e+07
height                                                91831.3
sex         malefemalefemalefemalefemalefemalefemalemalema...
race        whitewhitewhiteotherwhitewhitewhitewhitehispan...
ed                                                      18416
age                                                     62508
sex_code                                                  859
dtype: object

In [89]:
df.sum(axis = 1)

0        79710.189011
1        96542.218643
2        48824.436947
3        80653.316153
4        82213.425498
5        15423.882901
6        47231.711821
7        51100.344282
8         3324.889556
9        43111.037884
10       10483.838843
11        1134.457155
12       47714.929864
13       19130.622299
14       20196.856639
15        1096.892346
16       36094.181123
17       27072.613964
18       64718.223972
19       70124.713070
20        1102.298306
21       12268.022115
22       84348.157919
23        9080.644935
24       23381.363278
25        8856.809185
26       64717.549805
27       54212.504945
28       17010.860044
29           8.749721
            ...      
1349     64753.641872
1350     16059.423155
1351     25480.928379
1352     18611.227184
1353     26561.776457
1354     28757.820098
1355     20175.062736
1356      6463.932127
1357     95549.526798
1358     68564.843058
1359     50419.021292
1360     80647.389376
1361     44036.074343
1362     47859.290935
1363     1

### isnull
- 각 값이 NaN(null)인지 여부를 True / False로 표시한 DataFrame을 반환

In [90]:
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False


In [91]:
df.isnull().sum(0)

earn        0
height      0
sex         0
race        0
ed          0
age         0
sex_code    0
dtype: int64

### sort_values
- column 값을 기준으로 데이터를 정렬

In [92]:
df.sort_values(["age", "earn"], ascending = False).head(10)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
3,80478.096153,63.22,female,other,16,95,1
809,42963.362005,72.94,male,white,12,95,0
331,39169.750135,64.79,female,white,12,95,1
102,39751.19403,67.14,male,white,12,93,0
993,32809.632677,59.61,female,other,16,92,1
1017,8942.806716,62.97,female,white,10,91,1
1192,39757.94721,64.79,male,white,16,90,0
952,8162.682672,58.09,female,white,5,89,1
827,55712.348432,70.13,male,white,9,88,0
939,40744.874765,59.15,female,white,15,87,1


### Correlation & Covariance
- 상관계수와 공분산을 구하는 함수
- corr, cov, corrwith

In [93]:
df.age.corr(df.earn)

0.074003491778360561

In [94]:
df.age.cov(df.earn)

36523.699210408893

In [95]:
df.corr()

Unnamed: 0,earn,height,ed,age,sex_code
earn,1.0,0.2916,0.350374,0.074003,-0.337328
height,0.2916,1.0,0.114047,-0.133727,-0.703672
ed,0.350374,0.114047,1.0,-0.129802,-0.061747
age,0.074003,-0.133727,-0.129802,1.0,0.070036
sex_code,-0.337328,-0.703672,-0.061747,0.070036,1.0
