# Pandas 한번에 제대로 배우기
- 관계 또는 레이블링 데이터로 쉽고 직관적으로 작업할 수 있도록 고안된 빠르고 유연하며 표련력이 뛰어난 데이터 구졸르 제공하는 Python 패키지



---



In [1]:
import numpy as np
import pandas as pd
pd. __version__

'1.4.2'

**굵은 텍스트**## Pandas 객체


### 전체 함수



- Series 객체와 인덱싱
    - .unique()
    - .value_counts()
    - .isin()

### Series 객체와 인덱싱

In [3]:
s = pd.Series([0, 0.25, 0.5, 0.75, 1.0])
s

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [4]:
s.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [5]:
print(s.index)

RangeIndex(start=0, stop=5, step=1)


In [6]:
print(s[1])
print(s[1:4])

0.25
1    0.25
2    0.50
3    0.75
dtype: float64


In [7]:
s = pd.Series([0, 0.25, 0.5, 0.75, 1.0],
            index = ['a','b','c','d','e'])
print(s)

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64


In [8]:
s['c']

0.5

In [9]:
s[['c','d','e']]

c    0.50
d    0.75
e    1.00
dtype: float64

In [10]:
'b' is s

  'b' is s


False

In [11]:
s = pd.Series([0, 0.25, 0.5, 0.75, 1.0],
              index=[2,4,6,8,10])
print(s)

2     0.00
4     0.25
6     0.50
8     0.75
10    1.00
dtype: float64


In [12]:
s[4]

0.25

In [13]:
s[2:]

6     0.50
8     0.75
10    1.00
dtype: float64

- .unique()

In [14]:
s.unique()

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

- .value_counts()

In [15]:
s.value_counts()

0.00    1
0.25    1
0.50    1
0.75    1
1.00    1
dtype: int64

- .isin()

In [16]:
s.isin([0.25,0.75])

2     False
4      True
6     False
8      True
10    False
dtype: bool

- 시리즈 만들기

In [17]:
pop_tuple = {'서울특별시':234587,
             '울산광역시':325234,
             '부산광역시':324565,
             '인천광역시':324354,
             '대구광역시':343245}
population = pd.Series(pop_tuple)
population

서울특별시    234587
울산광역시    325234
부산광역시    324565
인천광역시    324354
대구광역시    343245
dtype: int64

In [18]:
population['울산광역시']

325234

In [19]:
population['울산광역시':'인천광역시']

울산광역시    325234
부산광역시    324565
인천광역시    324354
dtype: int64

### DataFrame 객체

In [20]:
pd.DataFrame([{'A':2, 'B':5, 'D':3}, {'A':1, 'B':7, 'C':9}])

Unnamed: 0,A,B,D,C
0,2,5,3.0,
1,1,7,,9.0


In [21]:
pd.DataFrame(np.random.rand(5,5),
             columns=['A','B','C','D','E'],
             index = [1,6,3,8,4])

Unnamed: 0,A,B,C,D,E
1,0.758817,0.223363,0.722356,0.09545,0.885921
6,0.527964,0.989852,0.303666,0.251376,0.218127
3,0.691827,0.17672,0.139459,0.316938,0.526394
8,0.795664,0.162233,0.031126,0.033324,0.489398
4,0.284786,0.606865,0.149633,0.168378,0.800684


In [22]:
male_tuple = {'서울특별시':2343425,
             '울산광역시':234534,
             '부산광역시':3234565,
             '인천광역시':324354,
             '대구광역시':34324545}
male = pd.Series(male_tuple)
male

서울특별시     2343425
울산광역시      234534
부산광역시     3234565
인천광역시      324354
대구광역시    34324545
dtype: int64

In [23]:
female_tuple = {'서울특별시':3542334,
             '울산광역시':1234565,
             '부산광역시':7563422,
             '인천광역시':1234524,
             '대구광역시':25364756}
female = pd.Series(female_tuple)
female

서울특별시     3542334
울산광역시     1234565
부산광역시     7563422
인천광역시     1234524
대구광역시    25364756
dtype: int64

In [24]:
korea_df = pd.DataFrame({'인구수':population,
                        '남자수':male,
                        '여자수':female})
korea_df

Unnamed: 0,인구수,남자수,여자수
서울특별시,234587,2343425,3542334
울산광역시,325234,234534,1234565
부산광역시,324565,3234565,7563422
인천광역시,324354,324354,1234524
대구광역시,343245,34324545,25364756


In [25]:
korea_df.index

Index(['서울특별시', '울산광역시', '부산광역시', '인천광역시', '대구광역시'], dtype='object')

In [26]:
korea_df.columns

Index(['인구수', '남자수', '여자수'], dtype='object')

In [27]:
korea_df['여자수']

서울특별시     3542334
울산광역시     1234565
부산광역시     7563422
인천광역시     1234524
대구광역시    25364756
Name: 여자수, dtype: int64

In [28]:
korea_df['서울특별시':'인천광역시']

Unnamed: 0,인구수,남자수,여자수
서울특별시,234587,2343425,3542334
울산광역시,325234,234534,1234565
부산광역시,324565,3234565,7563422
인천광역시,324354,324354,1234524


### Index 객체


|클래스|설명|
|---|---|
|index|일반적인 index객체이며, Numpy 배열 형식으로 축의 이름 표현|
|int64index|정수 값을 위한 index|
|multiindex|단일 축에 여러 단계 색인을 표현하는 계층적 index 객체(튜플의 배열과 유사)|
|Dateimeindex|NumPy의 datetime64 타입으로 타임스탬프 저장|
|periodinex|기간 데이터를 위한 index|

In [29]:
idx = pd.Index([2,4,6,8,10])
idx

Int64Index([2, 4, 6, 8, 10], dtype='int64')

In [30]:
idx[1]

4

In [31]:
idx[1:5:2]

Int64Index([4, 8], dtype='int64')

In [32]:
idx[-1::]

Int64Index([10], dtype='int64')

In [33]:
idx[::2]

Int64Index([2, 6, 10], dtype='int64')

In [34]:
print('idx')
print(idx)
print('\n', 'idx.size')
print(idx.size)
print('\n', 'idx.shape')
print(idx.shape)
print('\n', 'idx.ndim')
print(idx.ndim)
print('\n', 'idx.dtype')
print(idx.dtype)

idx
Int64Index([2, 4, 6, 8, 10], dtype='int64')

 idx.size
5

 idx.shape
(5,)

 idx.ndim
1

 idx.dtype
int64


#### Index 연산

|연산자|메소드|설명|
|---|---|---|
||append|색인 객체를 추가한 새로운 색인 변환|
||difference|색인의 차집합 반환|
|&|intersection|색인의 교집합 반환|
|ㅣ|union|색인의 합집합 반환|
||isin|색인이 존재하는지 여부를 불리언 배열로 반환|
||delete|색인이 삭제된 새로운 색인 반환|
||drop|값이 삭제된 새로운 색인 반환|
||insert|색인이 추가된 새로운 색인 반환|
||is_monotonic|색인이 단조성을 가지면 True|
||is_unique|중복되는 색인이 없다면 True|
||unique|색인에서 중복되는 요소를 제거하고 유일한 값만 반환|

In [35]:
idx1 = pd.Index([1,2,4,6,7])
idx2 = pd.Index([2,4,5,6,7])

print('.append(합집합)')
print(idx1.append(idx2))

print('\n', '.difference(차집합)')
print(idx1.difference(idx2))
print('\n', '-(빼기)')
print(idx1 - idx2)

print('\n', '.intersection(교집합)')
print(idx1.intersection(idx2))
print(idx1 & idx2)

print('\n', '.union(중복제거)')
print(idx1.union(idx2))
print(idx1 | idx2)

print('\n', '.delete(제거)')
print(idx1.delete(0))

print('\n', '.drop(제거)')
print(idx1.drop(1))

print('\n', '^(여집합)')
print(idx1 ^ idx2)

.append(합집합)
Int64Index([1, 2, 4, 6, 7, 2, 4, 5, 6, 7], dtype='int64')

 .difference(차집합)
Int64Index([1], dtype='int64')

 -(빼기)
Int64Index([-1, -2, -1, 0, 0], dtype='int64')

 .intersection(교집합)
Int64Index([2, 4, 6, 7], dtype='int64')
Int64Index([2, 4, 6, 7], dtype='int64')

 .union(중복제거)
Int64Index([1, 2, 4, 5, 6, 7], dtype='int64')
Int64Index([1, 2, 4, 5, 6, 7], dtype='int64')

 .delete(제거)
Int64Index([2, 4, 6, 7], dtype='int64')

 .drop(제거)
Int64Index([2, 4, 6, 7], dtype='int64')

 ^(여집합)
Int64Index([1, 5], dtype='int64')


  print(idx1 & idx2)
  print(idx1 | idx2)
  print(idx1 ^ idx2)




---



## 인덱싱(Indexing)

In [36]:
s = pd.Series([0, 0.5, 0.25, 0.75, 1.0],
              index=['a','b','c','d','e'])
print(s)

a    0.00
b    0.50
c    0.25
d    0.75
e    1.00
dtype: float64


In [37]:
s['b']

0.5

In [38]:
'b' is s

  'b' is s


False

In [39]:
s.keys()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [40]:
list(s.items())

[('a', 0.0), ('b', 0.5), ('c', 0.25), ('d', 0.75), ('e', 1.0)]

In [41]:
s['f'] = 1.25
print(s)

a    0.00
b    0.50
c    0.25
d    0.75
e    1.00
f    1.25
dtype: float64


In [42]:
s['a':'d']

a    0.00
b    0.50
c    0.25
d    0.75
dtype: float64

In [43]:
s[0:4]

a    0.00
b    0.50
c    0.25
d    0.75
dtype: float64

In [44]:
s[(s > 0.4) & (s < 0.8)]

b    0.50
d    0.75
dtype: float64

In [45]:
s[['a','c','d']]

a    0.00
c    0.25
d    0.75
dtype: float64

### Series 인덱싱

In [46]:
s = pd.Series(['a', 'b', 'c', 'd', 'e'],
              index=[1,3,5,7,9])
print(s)

1    a
3    b
5    c
7    d
9    e
dtype: object


In [47]:
s[1]

'a'

In [48]:
s[2:4]

5    c
7    d
dtype: object

In [49]:
s.iloc[1]

'b'

In [50]:
s.iloc[2:4]

5    c
7    d
dtype: object

In [51]:
s.reindex(range(10))

0    NaN
1      a
2    NaN
3      b
4    NaN
5      c
6    NaN
7      d
8    NaN
9      e
dtype: object

In [52]:
s.reindex(range(10), method='bfill')

0    a
1    a
2    b
3    b
4    c
5    c
6    d
7    d
8    e
9    e
dtype: object

### DataFrame 인덱싱


|사용방법|설명|
|---|---|
|df[val]|하나의 컬럼 또는 여러 컬럼을 선택|
|df.loc[val]|라벨값으로 로우의 부분집합 선택|
|df.loc[val1,val2]|라벨값으로 컬럼의 부분집합 선택|
|df.iloc[where]|정수 색인으로 로우의 부분집합 선택|
|df.iloc[:,where|정수 색인으로 컬럼의 부분집합 선택|
|df.iloc[whter_i,whtere_j|정수 색인으로 로우와 컬럼의 부분집합 선택|
|df.at[lavbel_i, label_j]|로우와 컬럼의 라벨로 단일 값 선택|
|df.iat[i,j]|로우와 컬럼의 정수 색인으로 단일 값 선택|
|reindex|하나 이상의 축을 새로운 색으로 재색인|
|get_value, set_value|로우와 컬럼의 이름으로 값 선택|


In [53]:
korea_df

Unnamed: 0,인구수,남자수,여자수
서울특별시,234587,2343425,3542334
울산광역시,325234,234534,1234565
부산광역시,324565,3234565,7563422
인천광역시,324354,324354,1234524
대구광역시,343245,34324545,25364756


In [54]:
korea_df['남자수']

서울특별시     2343425
울산광역시      234534
부산광역시     3234565
인천광역시      324354
대구광역시    34324545
Name: 남자수, dtype: int64

In [55]:
korea_df.남자수

서울특별시     2343425
울산광역시      234534
부산광역시     3234565
인천광역시      324354
대구광역시    34324545
Name: 남자수, dtype: int64

In [56]:
korea_df.여자수

서울특별시     3542334
울산광역시     1234565
부산광역시     7563422
인천광역시     1234524
대구광역시    25364756
Name: 여자수, dtype: int64

In [57]:
korea_df['남여비율'] = (korea_df['남자수'] * 100 / korea_df['여자수'])

korea_df.남여비율

서울특별시     66.154829
울산광역시     18.997299
부산광역시     42.765894
인천광역시     26.273608
대구광역시    135.323774
Name: 남여비율, dtype: float64

In [58]:
korea_df.values

array([[2.34587000e+05, 2.34342500e+06, 3.54233400e+06, 6.61548290e+01],
       [3.25234000e+05, 2.34534000e+05, 1.23456500e+06, 1.89972986e+01],
       [3.24565000e+05, 3.23456500e+06, 7.56342200e+06, 4.27658935e+01],
       [3.24354000e+05, 3.24354000e+05, 1.23452400e+06, 2.62736083e+01],
       [3.43245000e+05, 3.43245450e+07, 2.53647560e+07, 1.35323774e+02]])

In [59]:
korea_df.T

Unnamed: 0,서울특별시,울산광역시,부산광역시,인천광역시,대구광역시
인구수,234587.0,325234.0,324565.0,324354.0,343245.0
남자수,2343425.0,234534.0,3234565.0,324354.0,34324540.0
여자수,3542334.0,1234565.0,7563422.0,1234524.0,25364760.0
남여비율,66.15483,18.9973,42.76589,26.27361,135.3238


In [60]:
korea_df.values[0]

array([2.3458700e+05, 2.3434250e+06, 3.5423340e+06, 6.6154829e+01])

In [61]:
korea_df['인구수']

서울특별시    234587
울산광역시    325234
부산광역시    324565
인천광역시    324354
대구광역시    343245
Name: 인구수, dtype: int64

In [62]:
korea_df.loc[:'인천광역시', :'남자수']

Unnamed: 0,인구수,남자수
서울특별시,234587,2343425
울산광역시,325234,234534
부산광역시,324565,3234565
인천광역시,324354,324354


In [63]:
korea_df.loc[(korea_df.여자수 > 10000000)]

Unnamed: 0,인구수,남자수,여자수,남여비율
대구광역시,343245,34324545,25364756,135.323774


In [64]:
korea_df.loc[(korea_df.인구수 < 1000000)]

Unnamed: 0,인구수,남자수,여자수,남여비율
서울특별시,234587,2343425,3542334,66.154829
울산광역시,325234,234534,1234565,18.997299
부산광역시,324565,3234565,7563422,42.765894
인천광역시,324354,324354,1234524,26.273608
대구광역시,343245,34324545,25364756,135.323774


In [65]:
korea_df.loc[(korea_df.인구수 > 250000)]

Unnamed: 0,인구수,남자수,여자수,남여비율
울산광역시,325234,234534,1234565,18.997299
부산광역시,324565,3234565,7563422,42.765894
인천광역시,324354,324354,1234524,26.273608
대구광역시,343245,34324545,25364756,135.323774


In [66]:
korea_df.loc[korea_df.남여비율 > 100]

Unnamed: 0,인구수,남자수,여자수,남여비율
대구광역시,343245,34324545,25364756,135.323774


In [67]:
korea_df.loc[(korea_df.인구수 > 250000) & (korea_df.남여비율 > 40)]

Unnamed: 0,인구수,남자수,여자수,남여비율
부산광역시,324565,3234565,7563422,42.765894
대구광역시,343245,34324545,25364756,135.323774


In [68]:
korea_df.iloc[:3, :2]

Unnamed: 0,인구수,남자수
서울특별시,234587,2343425
울산광역시,325234,234534
부산광역시,324565,3234565


### 다중 인덱싱(Multi Indexing)

* 1차원의 Series와 2차원의 DataFrame 객체를 넘어 3차원, 4차원 이상의 고차원 데이터 처리
* 단일 인덱스 내에 여러 인덱스를 포함하는 다중 인덱싱

#### 다중 인덱스 Series

In [69]:
korea_df

Unnamed: 0,인구수,남자수,여자수,남여비율
서울특별시,234587,2343425,3542334,66.154829
울산광역시,325234,234534,1234565,18.997299
부산광역시,324565,3234565,7563422,42.765894
인천광역시,324354,324354,1234524,26.273608
대구광역시,343245,34324545,25364756,135.323774


In [70]:
idx_tuples = [('서울특별시', 2010), ('서울특별시', 2020),
            ('울산광역시', 2010), ('울산광역시', 2020),
            ('부산광역시', 2010), ('부산광역시', 2020),
            ('인천광역시', 2010), ('인천광역시', 2020),
            ('대구광역시', 2010), ('대구광역시', 2020),
            ('광주광역시', 2010), ('광주광역시', 2020),
            ('대전광역시', 2010), ('대전광역시', 2020)]
idx_tuples

[('서울특별시', 2010),
 ('서울특별시', 2020),
 ('울산광역시', 2010),
 ('울산광역시', 2020),
 ('부산광역시', 2010),
 ('부산광역시', 2020),
 ('인천광역시', 2010),
 ('인천광역시', 2020),
 ('대구광역시', 2010),
 ('대구광역시', 2020),
 ('광주광역시', 2010),
 ('광주광역시', 2020),
 ('대전광역시', 2010),
 ('대전광역시', 2020)]

In [71]:
pop_tuples = [12432354,132435434,
            213454322,325466434,
            243523453,124356435,
            123454342,635456343,
            342567763,564335643,
            234564324,523645633,
            324553564,342565234]
population = pd.Series(pop_tuples, index = idx_tuples)
population

(서울특별시, 2010)     12432354
(서울특별시, 2020)    132435434
(울산광역시, 2010)    213454322
(울산광역시, 2020)    325466434
(부산광역시, 2010)    243523453
(부산광역시, 2020)    124356435
(인천광역시, 2010)    123454342
(인천광역시, 2020)    635456343
(대구광역시, 2010)    342567763
(대구광역시, 2020)    564335643
(광주광역시, 2010)    234564324
(광주광역시, 2020)    523645633
(대전광역시, 2010)    324553564
(대전광역시, 2020)    342565234
dtype: int64

In [72]:
midx = pd.MultiIndex.from_tuples(idx_tuples)
midx

MultiIndex([('서울특별시', 2010),
            ('서울특별시', 2020),
            ('울산광역시', 2010),
            ('울산광역시', 2020),
            ('부산광역시', 2010),
            ('부산광역시', 2020),
            ('인천광역시', 2010),
            ('인천광역시', 2020),
            ('대구광역시', 2010),
            ('대구광역시', 2020),
            ('광주광역시', 2010),
            ('광주광역시', 2020),
            ('대전광역시', 2010),
            ('대전광역시', 2020)],
           )

In [73]:
population = population.reindex(midx)
population

서울특별시  2010     12432354
       2020    132435434
울산광역시  2010    213454322
       2020    325466434
부산광역시  2010    243523453
       2020    124356435
인천광역시  2010    123454342
       2020    635456343
대구광역시  2010    342567763
       2020    564335643
광주광역시  2010    234564324
       2020    523645633
대전광역시  2010    324553564
       2020    342565234
dtype: int64

In [74]:
population[:,2010]

서울특별시     12432354
울산광역시    213454322
부산광역시    243523453
인천광역시    123454342
대구광역시    342567763
광주광역시    234564324
대전광역시    324553564
dtype: int64

In [75]:
population['대전광역시',:]

2010    324553564
2020    342565234
dtype: int64

In [76]:
korea_mdf = population.unstack()
korea_mdf

Unnamed: 0,2010,2020
광주광역시,234564324,523645633
대구광역시,342567763,564335643
대전광역시,324553564,342565234
부산광역시,243523453,124356435
서울특별시,12432354,132435434
울산광역시,213454322,325466434
인천광역시,123454342,635456343


In [77]:
korea_mdf.stack()

광주광역시  2010    234564324
       2020    523645633
대구광역시  2010    342567763
       2020    564335643
대전광역시  2010    324553564
       2020    342565234
부산광역시  2010    243523453
       2020    124356435
서울특별시  2010     12432354
       2020    132435434
울산광역시  2010    213454322
       2020    325466434
인천광역시  2010    123454342
       2020    635456343
dtype: int64

In [78]:
male_tuples = [45234,34253,
            123423,12341,
            123412,12343,
            543321,12341,
            643534,23456,
            643564,23456,
            132434,67564,]
male_tuples

[45234,
 34253,
 123423,
 12341,
 123412,
 12343,
 543321,
 12341,
 643534,
 23456,
 643564,
 23456,
 132434,
 67564]

In [79]:
korea_mdf = pd.DataFrame({'총인구수':population,
                        '남자수':male_tuples})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자수
서울특별시,2010,12432354,45234
서울특별시,2020,132435434,34253
울산광역시,2010,213454322,123423
울산광역시,2020,325466434,12341
부산광역시,2010,243523453,123412
부산광역시,2020,124356435,12343
인천광역시,2010,123454342,543321
인천광역시,2020,635456343,12341
대구광역시,2010,342567763,643534
대구광역시,2020,564335643,23456


In [80]:
female_tuples = [32435422,213433212,
                 32145422,234223424,
                 45673223,124535343,
                 12343543,213454633,
                 12453654,432543123,
                 12435463,123454343,
                 13243543,243553432,]
female_tuples

[32435422,
 213433212,
 32145422,
 234223424,
 45673223,
 124535343,
 12343543,
 213454633,
 12453654,
 432543123,
 12435463,
 123454343,
 13243543,
 243553432]

In [81]:
korea_mdf = pd.DataFrame({'총인구수':population,
                          '총남자수':male_tuples,
                          '총여자수':female_tuples})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,총남자수,총여자수
서울특별시,2010,12432354,45234,32435422
서울특별시,2020,132435434,34253,213433212
울산광역시,2010,213454322,123423,32145422
울산광역시,2020,325466434,12341,234223424
부산광역시,2010,243523453,123412,45673223
부산광역시,2020,124356435,12343,124535343
인천광역시,2010,123454342,543321,12343543
인천광역시,2020,635456343,12341,213454633
대구광역시,2010,342567763,643534,12453654
대구광역시,2020,564335643,23456,432543123


In [82]:
ratio = korea_mdf['총남자수'] * 100 / korea_mdf['총여자수']
ratio

서울특별시  2010    0.139459
       2020    0.016049
울산광역시  2010    0.383952
       2020    0.005269
부산광역시  2010    0.270206
       2020    0.009911
인천광역시  2010    4.401662
       2020    0.005782
대구광역시  2010    5.167431
       2020    0.005423
광주광역시  2010    5.175232
       2020    0.019000
대전광역시  2010    0.999989
       2020    0.027741
dtype: float64

In [83]:
ratio.unstack()

Unnamed: 0,2010,2020
광주광역시,5.175232,0.019
대구광역시,5.167431,0.005423
대전광역시,0.999989,0.027741
부산광역시,0.270206,0.009911
서울특별시,0.139459,0.016049
울산광역시,0.383952,0.005269
인천광역시,4.401662,0.005782


In [84]:
korea_mdf = pd.DataFrame({'총인구수':population,
                          '총남자수':male_tuples,
                          '총여자수':female_tuples,
                          '남녀비율':ratio})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,총남자수,총여자수,남녀비율
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049
울산광역시,2010,213454322,123423,32145422,0.383952
울산광역시,2020,325466434,12341,234223424,0.005269
부산광역시,2010,243523453,123412,45673223,0.270206
부산광역시,2020,124356435,12343,124535343,0.009911
인천광역시,2010,123454342,543321,12343543,4.401662
인천광역시,2020,635456343,12341,213454633,0.005782
대구광역시,2010,342567763,643534,12453654,5.167431
대구광역시,2020,564335643,23456,432543123,0.005423


#### 다중 인덱스 생성

In [85]:
df = pd.DataFrame(np.random.rand(6, 3),
                index=[['a', 'a','b','b','c','c'], [1,2,1,2,1,2]],
                columns= ['c1','c2','c3'])
df

Unnamed: 0,Unnamed: 1,c1,c2,c3
a,1,0.920992,0.732939,0.72623
a,2,0.64791,0.509063,0.254607
b,1,0.038468,0.632008,0.131748
b,2,0.42377,0.831317,0.072003
c,1,0.75936,0.179103,0.345191
c,2,0.674975,0.573042,0.520343


In [86]:
pd.MultiIndex.from_arrays([['a','a','b', 'b','c','c'],[1,2,1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [87]:
pd.MultiIndex.from_tuples([('a',1), ('a', 2), ('b',1), ('b',2), ('c',1), ('c',2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [88]:
pd.MultiIndex.from_product([['a','b','c'], [1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [89]:
pd.MultiIndex(levels=[['a','b','c'],[1,2]],
            codes = [[0,0,1,1,2,2], [0,1,0,1,0,1]] )

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [90]:
population

서울특별시  2010     12432354
       2020    132435434
울산광역시  2010    213454322
       2020    325466434
부산광역시  2010    243523453
       2020    124356435
인천광역시  2010    123454342
       2020    635456343
대구광역시  2010    342567763
       2020    564335643
광주광역시  2010    234564324
       2020    523645633
대전광역시  2010    324553564
       2020    342565234
dtype: int64

In [91]:
population.index.names = ['행정구역', '년도']
population

행정구역   년도  
서울특별시  2010     12432354
       2020    132435434
울산광역시  2010    213454322
       2020    325466434
부산광역시  2010    243523453
       2020    124356435
인천광역시  2010    123454342
       2020    635456343
대구광역시  2010    342567763
       2020    564335643
광주광역시  2010    234564324
       2020    523645633
대전광역시  2010    324553564
       2020    342565234
dtype: int64

In [92]:
idx =pd.MultiIndex.from_product([['a','b','c'], [1,2]],
                                names = ['name1', 'name2'])
cols = pd.MultiIndex.from_product([['c1', 'c2', 'c3'], [1,2]],
                                names = ['col_name1', 'col_name2'])

data = np.round(np.random.rand(6,6), 2)
mdf = pd.DataFrame(data, index=idx, columns= cols)
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,0.32,0.24,0.69,0.13,0.92,0.86
a,2,0.49,0.9,0.14,0.1,0.02,0.9
b,1,0.99,0.33,0.98,0.86,0.87,0.68
b,2,0.8,0.03,0.75,0.01,0.05,0.37
c,1,0.63,0.6,0.25,0.67,0.45,0.69
c,2,0.58,0.53,0.38,0.04,0.44,0.35


In [93]:

mdf['c2']

Unnamed: 0_level_0,col_name2,1,2
name1,name2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.69,0.13
a,2,0.14,0.1
b,1,0.98,0.86
b,2,0.75,0.01
c,1,0.25,0.67
c,2,0.38,0.04


#### 인덱싱 및 슬라이싱

In [94]:
population

행정구역   년도  
서울특별시  2010     12432354
       2020    132435434
울산광역시  2010    213454322
       2020    325466434
부산광역시  2010    243523453
       2020    124356435
인천광역시  2010    123454342
       2020    635456343
대구광역시  2010    342567763
       2020    564335643
광주광역시  2010    234564324
       2020    523645633
대전광역시  2010    324553564
       2020    342565234
dtype: int64

In [95]:
population['인천광역시', 2010]

123454342

In [96]:
population[:, 2010]

행정구역
서울특별시     12432354
울산광역시    213454322
부산광역시    243523453
인천광역시    123454342
대구광역시    342567763
광주광역시    234564324
대전광역시    324553564
dtype: int64

In [97]:
population[population > 500000000]

행정구역   년도  
인천광역시  2020    635456343
대구광역시  2020    564335643
광주광역시  2020    523645633
dtype: int64

In [98]:
population[['대구광역시', '대전광역시']]

행정구역   년도  
대구광역시  2010    342567763
       2020    564335643
대전광역시  2010    324553564
       2020    342565234
dtype: int64

In [99]:
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,0.32,0.24,0.69,0.13,0.92,0.86
a,2,0.49,0.9,0.14,0.1,0.02,0.9
b,1,0.99,0.33,0.98,0.86,0.87,0.68
b,2,0.8,0.03,0.75,0.01,0.05,0.37
c,1,0.63,0.6,0.25,0.67,0.45,0.69
c,2,0.58,0.53,0.38,0.04,0.44,0.35


In [100]:
mdf['c2', 1]

name1  name2
a      1        0.69
       2        0.14
b      1        0.98
       2        0.75
c      1        0.25
       2        0.38
Name: (c2, 1), dtype: float64

In [101]:
mdf.iloc[3,:4]

col_name1  col_name2
c1         1            0.80
           2            0.03
c2         1            0.75
           2            0.01
Name: (b, 2), dtype: float64

In [102]:
mdf.loc[:, ('c2',1)]

name1  name2
a      1        0.69
       2        0.14
b      1        0.98
       2        0.75
c      1        0.25
       2        0.38
Name: (c2, 1), dtype: float64

In [103]:
idx_slice = pd.IndexSlice
mdf.loc[idx_slice[:,2], idx_slice[:,2]]

Unnamed: 0_level_0,col_name1,c1,c2,c3
Unnamed: 0_level_1,col_name2,2,2,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,2,0.9,0.1,0.9
b,2,0.03,0.01,0.37
c,2,0.53,0.04,0.35


#### 다중 인덱스 재정렬

In [104]:
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           names=['name1', 'name2'])

In [105]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,총남자수,총여자수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049
울산광역시,2010,213454322,123423,32145422,0.383952
울산광역시,2020,325466434,12341,234223424,0.005269
부산광역시,2010,243523453,123412,45673223,0.270206
부산광역시,2020,124356435,12343,124535343,0.009911
인천광역시,2010,123454342,543321,12343543,4.401662
인천광역시,2020,635456343,12341,213454633,0.005782
대구광역시,2010,342567763,643534,12453654,5.167431
대구광역시,2020,564335643,23456,432543123,0.005423


In [106]:
# korea_mdf['서울특별시':'인천광역시']  # 정렬이 되지 않아서 아래의 코드를 사용 안하고 하면 오류가 발생한다.

korea_mdf = korea_mdf.sort_index()
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,총남자수,총여자수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,234564324,643564,12435463,5.175232
광주광역시,2020,523645633,23456,123454343,0.019
대구광역시,2010,342567763,643534,12453654,5.167431
대구광역시,2020,564335643,23456,432543123,0.005423
대전광역시,2010,324553564,132434,13243543,0.999989
대전광역시,2020,342565234,67564,243553432,0.027741
부산광역시,2010,243523453,123412,45673223,0.270206
부산광역시,2020,124356435,12343,124535343,0.009911
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049


In [107]:
korea_mdf['서울특별시':'인천광역시']


Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,총남자수,총여자수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049
울산광역시,2010,213454322,123423,32145422,0.383952
울산광역시,2020,325466434,12341,234223424,0.005269
인천광역시,2010,123454342,543321,12343543,4.401662
인천광역시,2020,635456343,12341,213454633,0.005782


In [108]:
korea_mdf.unstack(level=0)

Unnamed: 0_level_0,총인구수,총인구수,총인구수,총인구수,총인구수,총인구수,총인구수,총남자수,총남자수,총남자수,...,총여자수,총여자수,총여자수,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율
행정구역,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,울산광역시,인천광역시,광주광역시,대구광역시,대전광역시,...,서울특별시,울산광역시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,울산광역시,인천광역시
년도,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,234564324,342567763,324553564,243523453,12432354,213454322,123454342,643564,643534,132434,...,32435422,32145422,12343543,5.175232,5.167431,0.999989,0.270206,0.139459,0.383952,4.401662
2020,523645633,564335643,342565234,124356435,132435434,325466434,635456343,23456,23456,67564,...,213433212,234223424,213454633,0.019,0.005423,0.027741,0.009911,0.016049,0.005269,0.005782


In [109]:
korea_mdf.unstack(level=1)

Unnamed: 0_level_0,총인구수,총인구수,총남자수,총남자수,총여자수,총여자수,남녀비율,남녀비율
년도,2010,2020,2010,2020,2010,2020,2010,2020
행정구역,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
광주광역시,234564324,523645633,643564,23456,12435463,123454343,5.175232,0.019
대구광역시,342567763,564335643,643534,23456,12453654,432543123,5.167431,0.005423
대전광역시,324553564,342565234,132434,67564,13243543,243553432,0.999989,0.027741
부산광역시,243523453,124356435,123412,12343,45673223,124535343,0.270206,0.009911
서울특별시,12432354,132435434,45234,34253,32435422,213433212,0.139459,0.016049
울산광역시,213454322,325466434,123423,12341,32145422,234223424,0.383952,0.005269
인천광역시,123454342,635456343,543321,12341,12343543,213454633,4.401662,0.005782


In [110]:
korea_mdf.stack()

행정구역   년도        
광주광역시  2010  총인구수    2.345643e+08
             총남자수    6.435640e+05
             총여자수    1.243546e+07
             남녀비율    5.175232e+00
       2020  총인구수    5.236456e+08
             총남자수    2.345600e+04
             총여자수    1.234543e+08
             남녀비율    1.899974e-02
대구광역시  2010  총인구수    3.425678e+08
             총남자수    6.435340e+05
             총여자수    1.245365e+07
             남녀비율    5.167431e+00
       2020  총인구수    5.643356e+08
             총남자수    2.345600e+04
             총여자수    4.325431e+08
             남녀비율    5.422812e-03
대전광역시  2010  총인구수    3.245536e+08
             총남자수    1.324340e+05
             총여자수    1.324354e+07
             남녀비율    9.999892e-01
       2020  총인구수    3.425652e+08
             총남자수    6.756400e+04
             총여자수    2.435534e+08
             남녀비율    2.774094e-02
부산광역시  2010  총인구수    2.435235e+08
             총남자수    1.234120e+05
             총여자수    4.567322e+07
             남녀비율    2.702065e-01
       2020  총인구수    1.243564e

In [111]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,총남자수,총여자수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,234564324,643564,12435463,5.175232
광주광역시,2020,523645633,23456,123454343,0.019
대구광역시,2010,342567763,643534,12453654,5.167431
대구광역시,2020,564335643,23456,432543123,0.005423
대전광역시,2010,324553564,132434,13243543,0.999989
대전광역시,2020,342565234,67564,243553432,0.027741
부산광역시,2010,243523453,123412,45673223,0.270206
부산광역시,2020,124356435,12343,124535343,0.009911
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049


In [112]:
idx_flat = korea_mdf.reset_index(level=0)
idx_flat

Unnamed: 0_level_0,행정구역,총인구수,총남자수,총여자수,남녀비율
년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,광주광역시,234564324,643564,12435463,5.175232
2020,광주광역시,523645633,23456,123454343,0.019
2010,대구광역시,342567763,643534,12453654,5.167431
2020,대구광역시,564335643,23456,432543123,0.005423
2010,대전광역시,324553564,132434,13243543,0.999989
2020,대전광역시,342565234,67564,243553432,0.027741
2010,부산광역시,243523453,123412,45673223,0.270206
2020,부산광역시,124356435,12343,124535343,0.009911
2010,서울특별시,12432354,45234,32435422,0.139459
2020,서울특별시,132435434,34253,213433212,0.016049


In [113]:
idx_flat = korea_mdf.reset_index(level = (0,1))
idx_flat

Unnamed: 0,행정구역,년도,총인구수,총남자수,총여자수,남녀비율
0,광주광역시,2010,234564324,643564,12435463,5.175232
1,광주광역시,2020,523645633,23456,123454343,0.019
2,대구광역시,2010,342567763,643534,12453654,5.167431
3,대구광역시,2020,564335643,23456,432543123,0.005423
4,대전광역시,2010,324553564,132434,13243543,0.999989
5,대전광역시,2020,342565234,67564,243553432,0.027741
6,부산광역시,2010,243523453,123412,45673223,0.270206
7,부산광역시,2020,124356435,12343,124535343,0.009911
8,서울특별시,2010,12432354,45234,32435422,0.139459
9,서울특별시,2020,132435434,34253,213433212,0.016049


In [114]:
idx_flat.set_index(['행정구역', '년도'])

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,총남자수,총여자수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,234564324,643564,12435463,5.175232
광주광역시,2020,523645633,23456,123454343,0.019
대구광역시,2010,342567763,643534,12453654,5.167431
대구광역시,2020,564335643,23456,432543123,0.005423
대전광역시,2010,324553564,132434,13243543,0.999989
대전광역시,2020,342565234,67564,243553432,0.027741
부산광역시,2010,243523453,123412,45673223,0.270206
부산광역시,2020,124356435,12343,124535343,0.009911
서울특별시,2010,12432354,45234,32435422,0.139459
서울특별시,2020,132435434,34253,213433212,0.016049


## 데이터 연산

In [115]:
s = pd.Series(np.random.randint(0, 10, 5))
s

0    4
1    7
2    3
3    7
4    3
dtype: int64

In [116]:
df = pd.DataFrame(np.random.randint(0, 10, (3,3)),
                columns = ['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1,1,8
1,6,1,3
2,5,6,6


In [117]:
np.exp(s)

0      54.598150
1    1096.633158
2      20.085537
3    1096.633158
4      20.085537
dtype: float64

In [118]:
np.cos(df * np.pi /4)

Unnamed: 0,A,B,C
0,0.7071068,0.7071068,1.0
1,-1.83697e-16,0.7071068,-0.7071068
2,-0.7071068,-1.83697e-16,-1.83697e-16


In [119]:
s1 = pd.Series([1,3,5,7,9], index = [0,1,2,3,4])
s2 = pd.Series([2,4,6,8,10], index = [1,2,3,4,5])

s1 +s2

0     NaN
1     5.0
2     9.0
3    13.0
4    17.0
5     NaN
dtype: float64

In [120]:
s1.add(s2, fill_value=0)

0     1.0
1     5.0
2     9.0
3    13.0
4    17.0
5    10.0
dtype: float64

In [121]:
df1 = pd.DataFrame(np.random.randint(0,20,(3,3)),
                columns = list('ACD'))

df1

Unnamed: 0,A,C,D
0,17,18,12
1,1,13,3
2,10,6,10


In [122]:
df2 = pd.DataFrame(np.random.randint(0,20,(5,5)),
                columns = list('BAECD'))

df2

Unnamed: 0,B,A,E,C,D
0,5,7,16,15,3
1,13,16,16,4,10
2,9,2,7,16,18
3,18,15,0,5,5
4,2,13,12,10,19


In [123]:
df1 + df2

Unnamed: 0,A,B,C,D,E
0,24.0,,33.0,15.0,
1,17.0,,17.0,13.0,
2,12.0,,22.0,28.0,
3,,,,,
4,,,,,


In [124]:
fvalue = df1.stack().mean()
df1.add(df2, fill_value=fvalue)

Unnamed: 0,A,B,C,D,E
0,24.0,15.0,33.0,15.0,26.0
1,17.0,23.0,17.0,13.0,26.0
2,12.0,19.0,22.0,28.0,17.0
3,25.0,28.0,15.0,15.0,10.0
4,23.0,12.0,20.0,29.0,22.0


### 연산자 범용 함수


|Python 연산자|Pandas 매소드|
|---|---|
|+|add, radd|
|-|sub, rsub, subtract|
|*|mul, rmul, multipy|
|/|truediv, div, rdiv, divide|
|//|floordiv, rfloordiv|
|%|mod|
|**|pow, rpow|


#### add()

In [125]:
a = np.random.randint(1, 10, size = (3,3))
a

array([[5, 6, 4],
       [8, 5, 3],
       [4, 9, 1]])

In [126]:
a + a[0]

array([[10, 12,  8],
       [13, 11,  7],
       [ 9, 15,  5]])

In [127]:
df = pd.DataFrame(a, columns = list('ABC'))
df

Unnamed: 0,A,B,C
0,5,6,4
1,8,5,3
2,4,9,1


In [128]:
df + df.iloc[0]

Unnamed: 0,A,B,C
0,10,12,8
1,13,11,7
2,9,15,5


In [129]:
df.add(df.iloc[0])

Unnamed: 0,A,B,C
0,10,12,8
1,13,11,7
2,9,15,5


#### sub() / subtract()

In [130]:
a

array([[5, 6, 4],
       [8, 5, 3],
       [4, 9, 1]])

In [131]:
a - a[0]

array([[ 0,  0,  0],
       [ 3, -1, -1],
       [-1,  3, -3]])

In [132]:
df

Unnamed: 0,A,B,C
0,5,6,4
1,8,5,3
2,4,9,1


In [133]:
df - df.iloc[0]

Unnamed: 0,A,B,C
0,0,0,0
1,3,-1,-1
2,-1,3,-3


In [134]:
df.sub(df.iloc[0])

Unnamed: 0,A,B,C
0,0,0,0
1,3,-1,-1
2,-1,3,-3


In [135]:
df.subtract(df['B'], axis = 0)

Unnamed: 0,A,B,C
0,-1,0,-2
1,3,0,-2
2,-5,0,-8


#### mul() / multply()




In [136]:
a

array([[5, 6, 4],
       [8, 5, 3],
       [4, 9, 1]])

In [137]:
a * a[1]

array([[40, 30, 12],
       [64, 25,  9],
       [32, 45,  3]])

In [138]:
df

Unnamed: 0,A,B,C
0,5,6,4
1,8,5,3
2,4,9,1


In [139]:
df * df.iloc[1]

Unnamed: 0,A,B,C
0,40,30,12
1,64,25,9
2,32,45,3


In [140]:
df.mul(df.iloc[1])

Unnamed: 0,A,B,C
0,40,30,12
1,64,25,9
2,32,45,3


In [141]:
df.multiply(df.iloc[1])

Unnamed: 0,A,B,C
0,40,30,12
1,64,25,9
2,32,45,3


#### truediv() /  div() / divide() / floordiv()

In [142]:
a / a[0]

array([[1.        , 1.        , 1.        ],
       [1.6       , 0.83333333, 0.75      ],
       [0.8       , 1.5       , 0.25      ]])

In [143]:
df / df.iloc[0]

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,1.6,0.833333,0.75
2,0.8,1.5,0.25


In [144]:
df.truediv(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,1.6,0.833333,0.75
2,0.8,1.5,0.25


In [145]:
df.div(df.iloc[1])

Unnamed: 0,A,B,C
0,0.625,1.2,1.333333
1,1.0,1.0,1.0
2,0.5,1.8,0.333333


In [146]:
df.divide(df.iloc[2])

Unnamed: 0,A,B,C
0,1.25,0.666667,4.0
1,2.0,0.555556,3.0
2,1.0,1.0,1.0


In [147]:
a // a[0]

array([[1, 1, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [148]:
df.floordiv(df.iloc[0])

Unnamed: 0,A,B,C
0,1,1,1
1,1,0,0
2,0,1,0


#### mod()

In [149]:
a

array([[5, 6, 4],
       [8, 5, 3],
       [4, 9, 1]])

In [150]:
a % a[0]

array([[0, 0, 0],
       [3, 5, 3],
       [4, 3, 1]])

In [151]:
df

Unnamed: 0,A,B,C
0,5,6,4
1,8,5,3
2,4,9,1


In [152]:
df.mod(df.iloc[0])

Unnamed: 0,A,B,C
0,0,0,0
1,3,5,3
2,4,3,1


#### pow()

In [153]:
a

array([[5, 6, 4],
       [8, 5, 3],
       [4, 9, 1]])

In [154]:
df

Unnamed: 0,A,B,C
0,5,6,4
1,8,5,3
2,4,9,1


In [155]:
a ** a[0]

array([[  3125,  46656,    256],
       [ 32768,  15625,     81],
       [  1024, 531441,      1]])

In [156]:
df.pow(df.iloc[0])

Unnamed: 0,A,B,C
0,3125,46656,256
1,32768,15625,81
2,1024,531441,1


In [157]:
row = df.iloc[0, ::2]
row

A    5
C    4
Name: 0, dtype: int64

In [158]:
df - row

Unnamed: 0,A,B,C
0,0.0,,0.0
1,3.0,,-1.0
2,-1.0,,-3.0


### 정렬(Sort)

In [164]:
s = pd.Series(range(5), index = ['A', 'C', 'B', 'E', 'D'])
s

A    0
C    1
B    2
E    3
D    4
dtype: int64

In [165]:
s.sort_index()

A    0
B    2
C    1
D    4
E    3
dtype: int64

In [166]:
s.sort_values()

A    0
C    1
B    2
E    3
D    4
dtype: int64

In [170]:
df = pd.DataFrame(np.random.randint(0,10, size = (5,5)),
                index = [2,7,1,2,3],
                columns = list('DFGAE'))
df

Unnamed: 0,D,F,G,A,E
2,5,7,3,0,1
7,6,1,5,2,7
1,6,5,1,1,6
2,8,4,2,6,4
3,7,3,1,9,7


In [171]:
df.sort_index()

Unnamed: 0,D,F,G,A,E
1,6,5,1,1,6
2,5,7,3,0,1
2,8,4,2,6,4
3,7,3,1,9,7
7,6,1,5,2,7


In [172]:
df.sort_values(by = 'A') # A열만 정렬되었다.

Unnamed: 0,D,F,G,A,E
2,5,7,3,0,1
1,6,5,1,1,6
7,6,1,5,2,7
2,8,4,2,6,4
3,7,3,1,9,7


In [177]:
df.sort_values(by = ['A','G']) # df.sort_index()를 보면  A가 0,1일 떄 G는 각각 3,1이었다. 

Unnamed: 0,D,F,G,A,E
2,5,7,3,0,1
1,6,5,1,1,6
7,6,1,5,2,7
2,8,4,2,6,4
3,7,3,1,9,7


In [178]:
df.sort_index(axis = 1)

Unnamed: 0,A,D,E,F,G
2,0,5,1,7,3
7,2,6,7,1,5
1,1,6,6,5,1
2,6,8,4,4,2
3,9,7,7,3,1


### 순위(Ranking)


|메소드|설명|
|---|---|
|average|기본값. 순위에 같은 같은 값을 가지는 항목들의 평균값을 사용|
|min|같은 값을 가지는 그룹을 낮은 순위로 지정|
|max|같은 값을 가지는 그룹을 높은 순위로 지정|
|first|데이터 내의 위치에 따라 순위 지정|
|dense|같은 그룹내에서 모두 같은 순위를 적용하지 않고 1씩 증가|



In [194]:
a = pd.Series([0,1,2,3,4,5,6,6,6,7,8,9])
a

0     0
1     1
2     2
3     3
4     4
5     5
6     6
7     6
8     6
9     7
10    8
11    9
dtype: int64

In [185]:
s = pd.Series([-4,6,2,6,32,63,-7,-2,35])
s

0    -4
1     6
2     2
3     6
4    32
5    63
6    -7
7    -2
8    35
dtype: int64

In [186]:
s.rank()

0    2.0
1    5.5
2    4.0
3    5.5
4    7.0
5    9.0
6    1.0
7    3.0
8    8.0
dtype: float64

In [187]:
s.rank(method ='first' )

0    2.0
1    5.0
2    4.0
3    6.0
4    7.0
5    9.0
6    1.0
7    3.0
8    8.0
dtype: float64

In [188]:
s.rank(method='max')

0    2.0
1    6.0
2    4.0
3    6.0
4    7.0
5    9.0
6    1.0
7    3.0
8    8.0
dtype: float64

### 고성능 연산

In [197]:
nrows, ncols = 10000, 100
df1, df2, df3, df4 = (pd.DataFrame(np.random.rand(nrows, ncols)) for i in range(4))

In [198]:
%timeit df1+df2+df3+df4

2.62 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [199]:
%timeit pd.eval('df1 + df2 +df3 +df4')

2.78 ms ± 76.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [200]:
%timeit df1 * df2 / ( - df3 * df4)

3.34 ms ± 266 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [201]:
%timeit pd.eval('df1 * df2 / ( -df3 * df4)')

2.86 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [202]:
%timeit ( df1 < df2) & (df2 <= df3) & (df3 != df4)

2.47 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [205]:
%timeit pd.eval('( df1 < df2) & (df2 <= df3) & (df3 != df4)')

4.2 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
df = pd.DataFrame(np.random.rand(1000000, 5), columns=['A','B','C','D','E'])
df.head()

Unnamed: 0,A,B,C,D,E
0,0.831027,0.921242,0.378125,0.548673,0.542369
1,0.564748,0.889234,0.338366,0.26019,0.138669
2,0.938758,0.957838,0.122859,0.969842,0.090631
3,0.592902,0.446869,0.117078,0.102316,0.016726
4,0.587334,0.356023,0.541027,0.093416,0.409068


In [13]:
%timeit df['A'] + df['B'] / df['C'] - df['D'] + df['E']

6.34 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%timeit pd.eval('df.A + df.B / df.C - df.D +df.E')

3.05 ms ± 84.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%timeit df.eval('A + B / C - D + E')

4.28 ms ± 86.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
df.eval('R = A + B / C - D + E', inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,R
0,0.831027,0.921242,0.378125,0.548673,0.542369,3.261065
1,0.564748,0.889234,0.338366,0.26019,0.138669,3.071249
2,0.938758,0.957838,0.122859,0.969842,0.090631,7.85579
3,0.592902,0.446869,0.117078,0.102316,0.016726,4.324153
4,0.587334,0.356023,0.541027,0.093416,0.409068,1.561038


In [18]:
df.eval('R = A + B / C + D * E', inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,R
0,0.831027,0.921242,0.378125,0.548673,0.542369,3.564952
1,0.564748,0.889234,0.338366,0.26019,0.138669,3.228851
2,0.938758,0.957838,0.122859,0.969842,0.090631,8.822899
3,0.592902,0.446869,0.117078,0.102316,0.016726,4.411454
4,0.587334,0.356023,0.541027,0.093416,0.409068,1.283598


In [19]:
col_mean = df.mean(1)
df['A'] + col_mean

0         1.962092
1         1.468090
2         2.922562
3         1.540793
4         1.132412
            ...   
999995    1.349709
999996    1.903993
999997    1.638239
999998    1.072601
999999    1.440074
Length: 1000000, dtype: float64

In [21]:
df.eval('A + @col_mean')

0         1.962092
1         1.468090
2         2.922562
3         1.540793
4         1.132412
            ...   
999995    1.349709
999996    1.903993
999997    1.638239
999998    1.072601
999999    1.440074
Length: 1000000, dtype: float64

In [27]:
df[(df.A < 0.5) & (df.B < 0.5) & (df.C > 0.5)]

Unnamed: 0,A,B,C,D,E,R
6,0.344047,0.210503,0.907463,0.860825,0.238244,0.781102
7,0.372610,0.322320,0.733953,0.101010,0.396921,0.851859
13,0.057535,0.377043,0.804198,0.963795,0.555719,1.061978
14,0.185693,0.279510,0.712090,0.080951,0.450755,0.614702
16,0.139474,0.374671,0.574872,0.450829,0.907157,1.200194
...,...,...,...,...,...,...
999936,0.156175,0.117086,0.891808,0.182607,0.776835,0.429322
999948,0.056836,0.057429,0.918205,0.597421,0.616849,0.487899
999952,0.280322,0.228484,0.958484,0.757159,0.382910,0.808626
999973,0.407212,0.306200,0.967633,0.011517,0.432759,0.728638


In [28]:
pd.eval('df[(df.A < 0.5) & (df.B < 0.5) & (df.C > 0.5)]')

Unnamed: 0,A,B,C,D,E,R
6,0.344047,0.210503,0.907463,0.860825,0.238244,0.781102
7,0.372610,0.322320,0.733953,0.101010,0.396921,0.851859
13,0.057535,0.377043,0.804198,0.963795,0.555719,1.061978
14,0.185693,0.279510,0.712090,0.080951,0.450755,0.614702
16,0.139474,0.374671,0.574872,0.450829,0.907157,1.200194
...,...,...,...,...,...,...
999936,0.156175,0.117086,0.891808,0.182607,0.776835,0.429322
999948,0.056836,0.057429,0.918205,0.597421,0.616849,0.487899
999952,0.280322,0.228484,0.958484,0.757159,0.382910,0.808626
999973,0.407212,0.306200,0.967633,0.011517,0.432759,0.728638


In [31]:
df.query('(A < 0.5) and (B < 0.5) and (C > 0.5)')

Unnamed: 0,A,B,C,D,E,R
6,0.344047,0.210503,0.907463,0.860825,0.238244,0.781102
7,0.372610,0.322320,0.733953,0.101010,0.396921,0.851859
13,0.057535,0.377043,0.804198,0.963795,0.555719,1.061978
14,0.185693,0.279510,0.712090,0.080951,0.450755,0.614702
16,0.139474,0.374671,0.574872,0.450829,0.907157,1.200194
...,...,...,...,...,...,...
999936,0.156175,0.117086,0.891808,0.182607,0.776835,0.429322
999948,0.056836,0.057429,0.918205,0.597421,0.616849,0.487899
999952,0.280322,0.228484,0.958484,0.757159,0.382910,0.808626
999973,0.407212,0.306200,0.967633,0.011517,0.432759,0.728638


In [33]:
col_mean =df['D'].mean()
df[(df.A < col_mean) & (df.B < col_mean)]

Unnamed: 0,A,B,C,D,E,R
6,0.344047,0.210503,0.907463,0.860825,0.238244,0.781102
7,0.372610,0.322320,0.733953,0.101010,0.396921,0.851859
13,0.057535,0.377043,0.804198,0.963795,0.555719,1.061978
14,0.185693,0.279510,0.712090,0.080951,0.450755,0.614702
16,0.139474,0.374671,0.574872,0.450829,0.907157,1.200194
...,...,...,...,...,...,...
999966,0.274636,0.126121,0.385730,0.554943,0.155136,0.687697
999973,0.407212,0.306200,0.967633,0.011517,0.432759,0.728638
999979,0.451752,0.168844,0.158343,0.832449,0.172648,1.661791
999982,0.407137,0.374937,0.038587,0.569545,0.402793,10.353291


In [35]:
df.query('A < @col_mean and B < @col_mean')

Unnamed: 0,A,B,C,D,E,R
6,0.344047,0.210503,0.907463,0.860825,0.238244,0.781102
7,0.372610,0.322320,0.733953,0.101010,0.396921,0.851859
13,0.057535,0.377043,0.804198,0.963795,0.555719,1.061978
14,0.185693,0.279510,0.712090,0.080951,0.450755,0.614702
16,0.139474,0.374671,0.574872,0.450829,0.907157,1.200194
...,...,...,...,...,...,...
999966,0.274636,0.126121,0.385730,0.554943,0.155136,0.687697
999973,0.407212,0.306200,0.967633,0.011517,0.432759,0.728638
999979,0.451752,0.168844,0.158343,0.832449,0.172648,1.661791
999982,0.407137,0.374937,0.038587,0.569545,0.402793,10.353291


## 데이터 결합

### Concat() / Append()

In [39]:
s1 = pd.Series(['a', 'b'], index = [1,2])
s2 = pd.Series(['c', 'd'], index = [3,4])
pd.concat([s1, s2])

1    a
2    b
3    c
4    d
dtype: object

In [41]:
def create_df(cols, idx):
    data = {c: [str(c.lower()) + str(i) for i in idx] for c in cols}
    return pd.DataFrame(data, idx)

In [42]:
df1 = create_df('AB', [1,2])
df1

Unnamed: 0,A,B
1,a1,b1
2,a2,b2


In [43]:
df2 = create_df('AB', [3,4])
df2

Unnamed: 0,A,B
3,a3,b3
4,a4,b4


In [44]:
pd.concat([df1, df2])

Unnamed: 0,A,B
1,a1,b1
2,a2,b2
3,a3,b3
4,a4,b4


In [47]:
df3 = create_df('AB', [0,1])
df3

Unnamed: 0,A,B
0,a0,b0
1,a1,b1


In [48]:
df4 = create_df('CD', [0,1])
df4

Unnamed: 0,C,D
0,c0,d0
1,c1,d1


In [49]:
pd.concat([df3,df4])

Unnamed: 0,A,B,C,D
0,a0,b0,,
1,a1,b1,,
0,,,c0,d0
1,,,c1,d1


### 병합과 조인

## 데이터 집계와 그룹 연산

#### 집계 연산(Aggregation)


### GroupBy 연산

### 피벗 테이블(Pivot Table)


### 범주형(Categorical) 데이터


## 문자열 연산

#### 문자열 연산자

#### 기타 연산자


#### 정규표현식


## 시계열 처리

#### 시계열 데이터 구조


### 시계열 기본

### 주기와 오프셋


### 시프트(Shift)

### 시간대 처리

* 국제표준시(Coordinated Universal Time, UTC)를 기준으로 떨어진 거리만큼 오프셋으로 시간대 처리
* 전 세계의 시간대 정보를 모아놓은 올슨 데이터베이스를 활용한 라이브러리인 `pytz` 사용

### 기간과 기간 연산

### 리샘플링(Resampling)

* 리샘플링(Resampling): 시계열의 빈도 변환
* 다운샘플링(Down sampling): 상위 빈도 데이터를 하위 빈도 데이터로 집계
* 업샘플링(Up sampling): 하위 빈도 데이터를 상위 빈도 데이터로 집계

### 무빙 윈도우(Moving Window)

## 데이터 읽기 및 저장


### 텍스트 파일 읽기/쓰기

### 이진 데이터 파일 읽기/쓰기

## 데이터 정제

### 누락값 처리

* 대부분의 실제 데이터들은 정제되지 않고 누락값들이 존재
* 서로 다른 데이터들은 다른 형태의 결측을 가짐
* 결측 데이터는 `null`, `NaN`, `NA`로 표기

#### None: 파이썬 누락 데이터

#### NaN: 누락된 수치 데이터

#### Null 값 처리


### 중복 제거

### 값 치환

## 참고문헌

* Pandas 사이트: https://pandas.pydata.org/
* Jake VanderPlas, "Python Data Science Handbook", O'Reilly
* Wes Mckinney, "Python for Data Analysis", O'Reilly