# Pandas <br>
* 구조화된 데이터의 처리를 지원하는 Python 라이브러리
* Python 계의 엑셀
* 고성능 Array 계산 라이브러리 Numpy와 통합하여, 
* 강력한 "스프레드시트"처리 기능을 제공
* 인덱싱, 연산용 함수, 전처리 함수 등을 제공

In [39]:
import pandas as pd #라이브러리 호출

In [40]:
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' #Data URL
# data_url = './housing.data' #Data URL
df_data = pd.read_csv(data_url, sep='\s+', header = None) #csv 타입 데이터 로드, separate는 빈공간으로 지정하고, Column은 없음

In [41]:
df_data.head() # 첫 5줄 출력

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [42]:
# Column Header 이름 설정
df_data.columns = [
    'CRIM','ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO' ,'B', 'LSTAT', 'MEDV'] 
df_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [43]:
type(df_data.values)

numpy.ndarray

## Series <Br>
* Pandas의 구성에는
* Series : DataFrame 중 하나의 Column에 해당하는 데이터의 모음 Object
* 즉, Column Vector를 표현하는 object
* DataFrme : Data Table 전체를 포함하는 Object

In [44]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [45]:
list_data = [1, 2, 3, 4, 5]
example_obj = Series(data = list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [46]:
list_data = [1,2,3,4,5]
list_name = ["a","b","c","d","e"]
example_obj = Series(data = list_data, index=list_name) # index 이름을 지정
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [47]:
# data type 설정 및 series 이름 설정
dict_data = {"a":1, "b":2, "c":3, "d":4, "e":5} # Data와 index 이름을 지정
example_obj = Series(dict_data, dtype=np.float32, name="example_data")
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [48]:
# data index에 접근하기
example_obj["a"]

1.0

In [49]:
# data index에 값 할당하기
example_obj["a"] = 3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [50]:
# 값 리스트만 추출
example_obj.values

array([3.2, 2. , 3. , 4. , 5. ], dtype=float32)

In [51]:
# index 리스트만 추출
example_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [52]:
# Data에 대한 정보를 저장
example_obj.name = "number"
example_obj.index.nae = "alphabet"
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32

In [53]:
# index 값을 기준으로 series 생성
dict_data_1 = {"a":1, "b":2, "c":3, "d":4, "e":5}
indexes = {"a", "b", "c", "d", "e", "f", "g", "h"}
series_obj_1 = Series(dict_data_1, index=indexes)
series_obj_1

d    4.0
g    NaN
f    NaN
h    NaN
c    3.0
b    2.0
e    5.0
a    1.0
dtype: float64

## Dataframe <Br>
    * Data Table 전체를 포함하는 Object
    * 각 컬럼은 다른 타입을 가질 수 있음
    * 행과 열 인덱스
    * 열을 삽입 및 삭제 가능
    * Series를 모아서 만든 Data Table = 기본 2차원

In [54]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [55]:
# Example from - https://chrisalbon.com/python/pandas_map_values_to_values.html
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
           'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
           'age': [42, 52, 36, 24, 73],
           'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}

df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city'])
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [56]:
# column 선택
DataFrame(raw_data, columns = ["age", "city"])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [57]:
# 새로운 column 추가
DataFrame(raw_data,
         columns = ["first_name", "last_name", "age", "city", "debt"])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [58]:
# column 선택 - series 추출
df = DataFrame(raw_data, columns = ["first_name", "last_name", "age", "city", "debt"])
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [59]:
# column 선택 - series 추출
df["first_name"]

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [60]:
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [61]:
# loc = index location 
# 인덱스의 이름으로 접근하기
df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [62]:
# iloc = index position
# 인덱스의 위치로 접근하기
# 시리즈 데이터로 넘파이처럼 반영
df["age"].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [63]:
# Example from - https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [64]:
# index가 3일 때까지 찍음
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [65]:
# 3행까지 찍음
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

In [66]:
df.age > 40

0     True
1     True
2    False
3    False
4     True
Name: age, dtype: bool

In [67]:
# DataFrame은 Numpy의 sub class
# Column에 새로운 데이터 할당
df.debt = df.age > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [68]:
values = Series(data=["M", "F", "F"], index=[0, 1, 3])
values

0    M
1    F
3    F
dtype: object

In [69]:
df["sex"] = values
df

Unnamed: 0,first_name,last_name,age,city,debt,sex
0,Jason,Miller,42,San Francisco,True,M
1,Molly,Jacobson,52,Baltimore,True,F
2,Tina,Ali,36,Miami,False,
3,Jake,Milner,24,Douglas,False,F
4,Amy,Cooze,73,Boston,True,


In [70]:
# Trasnpose (전치)
df.T

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True
sex,M,F,,F,


In [71]:
# 값 출력
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True, 'M'],
       ['Molly', 'Jacobson', 52, 'Baltimore', True, 'F'],
       ['Tina', 'Ali', 36, 'Miami', False, nan],
       ['Jake', 'Milner', 24, 'Douglas', False, 'F'],
       ['Amy', 'Cooze', 73, 'Boston', True, nan]], dtype=object)

In [72]:
# csv 변환
df.to_csv()

',first_name,last_name,age,city,debt,sex\r\n0,Jason,Miller,42,San Francisco,True,M\r\n1,Molly,Jacobson,52,Baltimore,True,F\r\n2,Tina,Ali,36,Miami,False,\r\n3,Jake,Milner,24,Douglas,False,F\r\n4,Amy,Cooze,73,Boston,True,\r\n'

In [73]:
# Column을 삭제
del df["debt"]

In [74]:
df

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,F
4,Amy,Cooze,73,Boston,


In [75]:
# Example from Python for data analyis
pop = {'Nevada' : {2001: 2.4, 2002: 2.9},
      'Ohio' : {2000: 1.5, 2001: 1.7, 2002: 3.6}}

DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


## Selection & Drop

### Selection with column names

In [76]:
import pandas as pd

#### Data loading
* xlrd 모듈이 없을 경우, conda install xlrd

In [77]:
# !conda install --y xlrd

In [78]:
import numpy as np
df = pd.read_excel("data/excel-comp-data.xlsx")
df.head()

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


In [79]:
# 한 개의 column 선택시
# Series를 가져옴
df["account"].head(3)

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [80]:
# 한 개 이상의 column 선택
# 여러 개의 컬럼은 [[]] 써야함
# DataFrame을 가져옴
df[["account", "street", "state"]].head(3)

Unnamed: 0,account,street,state
0,211829,34456 Sean Highway,Texas
1,320563,1311 Alvis Tunnel,NorthCarolina
2,648336,62184 Schamberger Underpass Apt. 231,Iowa


### Series Selection

In [81]:
# column 이름 없이 사용하는 index number는 row 기준 표시
# 일관성이 없는 단점
df[:3]

Unnamed: 0,account,name,street,city,state,postal-code,Jan,Feb,Mar
0,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000


In [82]:
# column 이름과 함께 row index 사용시, 해당 column만 추출
df["account"][:3]

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [83]:
account_serires = df["account"]
account_serires[:3]

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [84]:
# 1개 이상의 index
# index값을 기준으로 뽑아줌
account_serires[[0,1,2]]

0    211829
1    320563
2    648336
Name: account, dtype: int64

In [85]:
# Boolean index
account_serires[account_serires<250000]

0     211829
3     109996
4     121213
5     132971
6     145068
7     205217
8     209744
9     212303
10    214098
11    231907
12    242368
Name: account, dtype: int64

### index 변경

In [86]:
# 값을 인덱스로
df.index = df["account"]

In [87]:
df.head()

Unnamed: 0_level_0,account,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
211829,211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
320563,320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
648336,648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
109996,109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
121213,121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


In [88]:
del df["account"]
df.head()

Unnamed: 0_level_0,name,street,city,state,postal-code,Jan,Feb,Mar
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
320563,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
648336,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
109996,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
121213,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


### Basic, loc, iloc selection

In [89]:
# Column과 index number
df[["name", "street"]][:2]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


In [90]:
# Column과 index name
df.loc[[211829, 320563], ["name", "street"]]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


In [91]:
# Column number와 index number
df.iloc[:2, :2]

Unnamed: 0_level_0,name,street
account,Unnamed: 1_level_1,Unnamed: 2_level_1
211829,"Kerluke, Koepp and Hilpert",34456 Sean Highway
320563,Walter-Trantow,1311 Alvis Tunnel


### index 재설정

In [92]:
# merge가 없을 경우 편함
df.index = list(range(0,15))
df.head()

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000


### Data drop

In [93]:
# index number로 drop
# row 단위로 drop
df.drop(1)

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000
10,"Goodwin, Homenick and Jerde",649 Cierra Forks Apt. 078,Rosaberg,Tenessee,47743,45000,120000,55000


In [94]:
# 한 개 이상의 index number로 drop
df.drop([0,1,2,3])

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000
10,"Goodwin, Homenick and Jerde",649 Cierra Forks Apt. 078,Rosaberg,Tenessee,47743,45000,120000,55000
11,Hahn-Moore,18115 Olivine Throughway,Norbertomouth,NorthDakota,31415,150000,10000,162000
12,"Frami, Anderson and Donnelly",182 Bertie Road,East Davian,Iowa,72686,162000,120000,35000
13,Walsh-Haley,2624 Beatty Parkways,Goodwinmouth,RhodeIsland,31919,55000,120000,35000


In [95]:
# axis 지정으로 축(column)을 기준으로 drop
# column 중에 "city"
# df.drop(["city","state", axis=1])
df.drop("city", axis=1)

Unnamed: 0,name,street,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,Idaho,46308,70000,120000,35000


In [96]:
# 하지만 df에 city는 남아있음
# Pandas는 기본적으로 원본데이터 삭제를 쉽게 안함
# inplace = True 를 지정해야 DataFrame 원본이 바뀜
df

Unnamed: 0,name,street,city,state,postal-code,Jan,Feb,Mar
0,"Kerluke, Koepp and Hilpert",34456 Sean Highway,New Jaycob,Texas,28752,10000,62000,35000
1,Walter-Trantow,1311 Alvis Tunnel,Port Khadijah,NorthCarolina,38365,95000,45000,35000
2,"Bashirian, Kunde and Price",62184 Schamberger Underpass Apt. 231,New Lilianland,Iowa,76517,91000,120000,35000
3,"D'Amore, Gleichner and Bode",155 Fadel Crescent Apt. 144,Hyattburgh,Maine,46021,45000,120000,10000
4,Bauch-Goldner,7274 Marissa Common,Shanahanchester,California,49681,162000,120000,35000
5,"Williamson, Schumm and Hettinger",89403 Casimer Spring,Jeremieburgh,Arkansas,62785,150000,120000,35000
6,Casper LLC,340 Consuela Bridge Apt. 400,Lake Gabriellaton,Mississipi,18008,62000,120000,70000
7,Kovacek-Johnston,91971 Cronin Vista Suite 601,Deronville,RhodeIsland,53461,145000,95000,35000
8,Champlin-Morar,26739 Grant Lock,Lake Juliannton,Pennsylvania,64415,70000,95000,35000
9,Gerhold-Maggio,366 Maggio Grove Apt. 998,North Ras,Idaho,46308,70000,120000,35000


## DataFrame operation

### Series operation

In [97]:
# index를 기준으로 연산 수행
# 겹치는 index가 없을 경우 NaN값으로 변환
s1 = Series(range(1,6), index=list("abced"))
s1

a    1
b    2
c    3
e    4
d    5
dtype: int64

In [98]:
s2 = Series(range(5,11), index=list("bcedef"))
s2

b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64

In [99]:
s1.add(s2)

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

In [100]:
s1+s2

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

### DataFrame operation

In [101]:
# df는 colum과 index를 모두 고려
# add operation을 쓰면 NaN값 0으로 변환
# Operation types : add, sub, div, mul
df1 = DataFrame(
np.arange(9).reshape(3,3),
columns=list("abc"))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [102]:
df2 = DataFrame(
np.arange(16).reshape(4,4),
columns=list("abcd"))
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [103]:
df1 + df2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [104]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


### Series + Dataframe

In [105]:
df = DataFrame(
    np.arange(16).reshape(4,4),
    columns=list("abcd"))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [106]:
s = Series(np.arange(10,14),
          index=list("abcd"))
s

a    10
b    11
c    12
d    13
dtype: int32

In [107]:
# column을 기준으로 broadcasting이 발생
df + s

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


In [108]:
# axis를 기준으로 row broadcasting 실행
df = DataFrame( np.arange(16).reshape(4,4),
              columns=list("abcd"))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [109]:
s2 = Series(np.arange(10,14))
s2

0    10
1    11
2    12
3    13
dtype: int32

In [110]:
df + s2

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [111]:
df.add(s2, axis=0)

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


## lambda, map, apply

#### Lambda 함수
* 한 줄로 함수를 표현하는 익명 함수 기법
* Lisp 언어에서 시작된 기법으로 오늘날 현대언어에 많이 사용
* lambda argument : expression
* Ex) lambda x,y: x + y

In [112]:
def f(x,y):
    return x + y
f(1,4)

5

In [113]:
f = lambda x,y: x+y
f(1,4)

5

In [114]:
# 하나의 argument만 처리하는 lambda함수
f = lambda x: x / 2
f(3)

1.5

In [115]:
f = lambda x: x ** 2
f(3)

9

In [116]:
# 이름을 할당하지 않는 lambda 함수
( lambda x: x + 1)(5)

6

#### map 함수

* 함수와 sequence형 데이터를 인자로 받아
* 각 element마다 입력받은 함수를 적용하여 list로 반환
* 일반적으로 lambda 현태로 표현함
* map(function, sequence)

In [117]:
ex = [1, 2, 3, 4, 5]
f = lambda x: x ** 2
list(map(f,ex))

[1, 4, 9, 16, 25]

In [118]:
# 두 개 이상의 argument가 있을 때는 두 개의 sequence형을 써야함
f = lambda x, y: x + y
list(map(f, ex, ex))

[2, 4, 6, 8, 10]

In [119]:
# 익명 함수 그대로 사용 가능
# Python3 부터는 list를 꼭 붙여야함
list(map(lambda x : x+x, ex))

[2, 4, 6, 8, 10]

#### map for series

* Pandas의 series type의 데이터에도 map 함수 사용가능
* function 대신 dict, sequence형 자료 등으로 대체 가능

In [120]:
s1 = Series(np.arange(10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [121]:
s1.map(lambda x: x**2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [122]:
# dict type으로 데이터 교체
# 없는 값은 NaN
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z).head(5)

0    NaN
1      A
2      B
3      C
4    NaN
dtype: object

In [123]:
s2 = Series(np.arange(10,20))
s1.map(s2).head(5)

0    10
1    11
2    12
3    13
4    14
dtype: int32

In [124]:
# Example - map for series
df = pd.read_csv("data/wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [125]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [126]:
# 성별 str -> 성별 code
df["sex_code"] = df.sex.map({"male":0, "female":1})
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


#### Replace function <br>
* Map 함수의 기능중 데이터 변환 기능만 담당
* 데이터 변환시 많이 사용하는 함수

In [127]:
# dict type 적용
df.sex.replace(
    {"male":0, "female":1}
).head()

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

In [128]:
# Target list Conversion list
# inplace -> 데이터 변환결과를 적용
df.sex.replace(
    ["male", "female"],
    [0,1], inplace=True)
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,0,white,16,49,0
1,96396.988643,66.23,1,white,16,62,1
2,48710.666947,63.77,1,white,16,33,1
3,80478.096153,63.22,1,other,16,95,1
4,82089.345498,63.08,1,white,17,43,1


#### apply for dataframe <br>
* map과 달리, series 전체(column)에 해당 함수를 적용
* 입력값이 series 데이터로 입력받아 handling 가능

In [129]:
df_info = df[["earn", "height", "age"]]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [130]:
# 각 column 별로 결과값 변환
f = lambda x : x.max() - x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

* 내장 연산 함수를 사용할 대도 똑같은 효과 가능
* mean, std 등 사용 가능

In [131]:
df_info.sum()

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [132]:
df_info.apply(sum)

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

* scalar 값 이외에 series값의 변환도 가능

In [133]:
def f(x):
    return Series([x.min(), x.max()], index=["min", "max"])

df_info.apply(f)

Unnamed: 0,earn,height,age
min,-98.580489,57.34,22
max,317949.127955,77.21,95


#### applymap for dataframe <br>
* series 단위가 아닌 element 단위로 함수를 적용
* series 단위에 apply를 적용시킬 때와 같은 효과

In [134]:
f = lambda x: -x
df_info.applymap(f).head(5)

Unnamed: 0,earn,height,age
0,-79571.299011,-73.89,-49
1,-96396.988643,-66.23,-62
2,-48710.666947,-63.77,-33
3,-80478.096153,-63.22,-95
4,-82089.345498,-63.08,-43


In [135]:
f = lambda x: -x
df_info["earn"].apply(f).head(5)

0   -79571.299011
1   -96396.988643
2   -48710.666947
3   -80478.096153
4   -82089.345498
Name: earn, dtype: float64

## Pandas Built-in functinos

### describe <br>
* Numeric type 데이터의 요약 정보를 보여줌

In [158]:
df = pd.read_csv("data/wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [159]:
df.describe()

Unnamed: 0,earn,height,ed,age
count,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499
std,31257.070006,3.818108,2.438741,15.789715
min,-98.580489,57.34,3.0,22.0
25%,10538.790721,63.72,12.0,33.0
50%,26877.870178,66.05,13.0,42.0
75%,44506.215336,69.315,15.0,55.0
max,317949.127955,77.21,18.0,95.0


### unique <br>
* series data의 유일한 값을 list로 반환

In [160]:
df.race.unique() # 유일한 인종의 값 list

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [161]:
np.array(dict(enumerate(df["race"].unique()))) # dict type으로 index

array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)

In [162]:
# label index값과 label값 각각 추출
value = list(map(int, np.array(list(enumerate(df["race"].unique())))[:, 0].tolist()))
key = np.array(list(enumerate(df["race"].unique())), dtype=str)[:, 1].tolist()

value, key

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

In [163]:
# label str -> index 값으로 변환
df["race"].replace(to_replace = key, value = value, inplace=True)

In [164]:
# 성별에 대해서도 동일하게 적용
value = list(map(int, np.array(list(enumerate(df["sex"].unique())))[:, 0].tolist()))
key = np.array(list(enumerate(df["sex"].unique())), dtype=str)[:, 1].tolist()

value, key

([0, 1], ['male', 'female'])

In [165]:
# "sex"와 "race" column의 index labeling
df["sex"].replace(to_replace=key, value=value, inplace=True)
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,0,0,16,49
1,96396.988643,66.23,1,0,16,62
2,48710.666947,63.77,1,0,16,33
3,80478.096153,63.22,1,1,16,95
4,82089.345498,63.08,1,0,17,43


#### sum <br>
* 기본적인 column 또는 row 값의 연산을 지원
* sub, mean, min, max, count, median, mad, var 등

In [166]:
# column 별
df.sum(axis=0)

earn      4.474344e+07
height    9.183125e+04
sex       8.590000e+02
race      5.610000e+02
ed        1.841600e+04
age       6.250800e+04
dtype: float64

In [167]:
# row 별
df.sum(axis=1)

0       79710.189011
1       96542.218643
2       48824.436947
3       80654.316153
4       82213.425498
            ...     
1374    30290.060363
1375    25019.829514
1376    13824.311312
1377    95563.664410
1378     9686.681857
Length: 1379, dtype: float64

#### isnull <br>
* column 또는 row 값의 NaN (null) 값의 index를 반환

In [168]:
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1374,False,False,False,False,False,False
1375,False,False,False,False,False,False
1376,False,False,False,False,False,False
1377,False,False,False,False,False,False


In [169]:
# Null인 값의 합
df.isnull().sum()

earn      0
height    0
sex       0
race      0
ed        0
age       0
dtype: int64

#### sort_values <br>
* column 값을 기준으로 데이터를 sorting

In [171]:
# 오름차순
df.sort_values(["age", "earn"], ascending=True).head(10)

Unnamed: 0,earn,height,sex,race,ed,age
1038,-56.321979,67.81,0,2,10,22
800,-27.876819,72.29,0,0,12,22
963,-25.65526,68.9,0,0,12,22
1105,988.56507,64.71,1,0,12,22
801,1000.221504,64.09,1,0,12,22
862,1002.023843,66.59,1,0,12,22
933,1007.994941,68.26,1,0,12,22
988,1578.542814,64.53,0,0,12,22
522,1955.168187,69.87,1,3,12,22
765,2581.870402,64.79,1,0,12,22


#### Correlation & Covariance <br>
* 상관계수와 공분산을 구하는 함수
* corr, cov, corrwith

In [172]:
df.age.corr(df.earn)

0.07400349177836055

In [173]:
df.age.cov(df.earn)

36523.6992104089

In [174]:
df.corrwith(df.earn)

earn      1.000000
height    0.291600
sex      -0.337328
race     -0.063977
ed        0.350374
age       0.074003
dtype: float64

In [175]:
df.corr()

Unnamed: 0,earn,height,sex,race,ed,age
earn,1.0,0.2916,-0.337328,-0.063977,0.350374,0.074003
height,0.2916,1.0,-0.703672,-0.045974,0.114047,-0.133727
sex,-0.337328,-0.703672,1.0,0.000858,-0.061747,0.070036
race,-0.063977,-0.045974,0.000858,1.0,-0.049487,-0.056879
ed,0.350374,0.114047,-0.061747,-0.049487,1.0,-0.129802
age,0.074003,-0.133727,0.070036,-0.056879,-0.129802,1.0
