# [Pandas 한번에 제대로 배우기](https://www.youtube.com/watch?v=lG8pEwvYwCw&t=5887s)




---



In [1]:
import pandas as pd
import numpy as np

## Pandas 객체


### Series 객체

In [2]:
s = pd.Series([0, 0.25, 0.5, 0.75, 1],
             index = ['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [3]:
s['a']

0.0

In [4]:
s['c']

0.5

In [5]:
'b' in s

True

In [6]:
s = pd.Series([0, 0.25, 0.5, 0.75, 1],
             index = ['2','4','6','8','10'])
s

2     0.00
4     0.25
6     0.50
8     0.75
10    1.00
dtype: float64

In [7]:
s[2:]

6     0.50
8     0.75
10    1.00
dtype: float64

In [8]:
s.unique()

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [9]:
s.value_counts()

1.00    1
0.75    1
0.50    1
0.25    1
0.00    1
dtype: int64

In [10]:
s.isin([0.25, 0.75])

2     False
4      True
6     False
8      True
10    False
dtype: bool

In [11]:
pop_tuple = {'서울특별시':9720846,
            '부산광역시':3404423,
            '인천광역시':2947217,
            '대구광역시':2427954,
            '대전광역시':1471040,
            '광주광역시':1455048}
population = pd.Series(pop_tuple)
population

서울특별시    9720846
부산광역시    3404423
인천광역시    2947217
대구광역시    2427954
대전광역시    1471040
광주광역시    1455048
dtype: int64

In [12]:
population['서울특별시']

9720846

In [13]:
population['서울특별시':'인천광역시']

서울특별시    9720846
부산광역시    3404423
인천광역시    2947217
dtype: int64

### DataFrame 객체

In [14]:
pd.DataFrame([{'A':2, 'B':4, 'D':3}, {'A':4,'B':5, 'C':7}])

Unnamed: 0,A,B,D,C
0,2,4,3.0,
1,4,5,,7.0


In [15]:
pd.DataFrame(np.random.rand(5,5),
            columns = ['A','B','C','D','E'],
             index = [1,2,3,4,5])

Unnamed: 0,A,B,C,D,E
1,0.698821,0.571395,0.199569,0.65687,0.511367
2,0.349731,0.982478,0.999653,0.384499,0.915833
3,0.564566,0.424693,0.533296,0.723136,0.033753
4,0.373903,0.191928,0.464485,0.497404,0.161663
5,0.331083,0.749432,0.502074,0.052318,0.058647


In [16]:
male_tuple = {'서울특별시':720846,
            '부산광역시':404423,
            '인천광역시':947217,
            '대구광역시':427954,
            '대전광역시':471040,
            '광주광역시':455048}
male = pd.Series(male_tuple)
male

서울특별시    720846
부산광역시    404423
인천광역시    947217
대구광역시    427954
대전광역시    471040
광주광역시    455048
dtype: int64

In [17]:
female_tuple = {'서울특별시':1720846,
            '부산광역시':1404423,
            '인천광역시':1947217,
            '대구광역시':4227954,
            '대전광역시':1471040,
            '광주광역시':1455048}
female = pd.Series(female_tuple)
female

서울특별시    1720846
부산광역시    1404423
인천광역시    1947217
대구광역시    4227954
대전광역시    1471040
광주광역시    1455048
dtype: int64

In [18]:
korea_df = pd.DataFrame({'인구수':population,
              '남자인구수':male,
              '여자인구수':female})

In [19]:
korea_df.index

Index(['서울특별시', '부산광역시', '인천광역시', '대구광역시', '대전광역시', '광주광역시'], dtype='object')

In [20]:
korea_df.columns

Index(['인구수', '남자인구수', '여자인구수'], dtype='object')

In [21]:
korea_df['여자인구수']

서울특별시    1720846
부산광역시    1404423
인천광역시    1947217
대구광역시    4227954
대전광역시    1471040
광주광역시    1455048
Name: 여자인구수, dtype: int64

### Index 객체


In [22]:
idx = pd.Index([2,4,6,8,10])
idx

Int64Index([2, 4, 6, 8, 10], dtype='int64')

In [23]:
idx[1]

4

In [24]:
idx[1:2:2]

Int64Index([4], dtype='int64')

In [25]:
idx[-1::]

Int64Index([10], dtype='int64')

In [26]:
idx[::2]

Int64Index([2, 6, 10], dtype='int64')

In [27]:
print(idx)
print(idx.size)
print(idx.ndim)
print(idx.shape)
print(idx.dtype)

Int64Index([2, 4, 6, 8, 10], dtype='int64')
5
1
(5,)
int64


#### Index 연산

In [28]:
idx1 = pd.Index([1,2,3,4,5])
idx2 = pd.Index([4,5,6,7,8])

print(idx1.append(idx2))
print(idx1.difference(idx2))
print(idx1 - idx2)
print(idx1.intersection(idx2))
print(idx1 & idx2)
print(idx1.union(idx2))
print(idx1 | idx2 )
print(idx1.delete(0))
print(idx1.drop(1))
print(idx1 ^ idx2) # 공통 뺀 나머지 여집합.


Int64Index([1, 2, 3, 4, 5, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([1, 2, 3], dtype='int64')
Int64Index([-3, -3, -3, -3, -3], dtype='int64')
Int64Index([4, 5], dtype='int64')
Int64Index([4, 5], dtype='int64')
Int64Index([1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([2, 3, 4, 5], dtype='int64')
Int64Index([2, 3, 4, 5], dtype='int64')
Int64Index([1, 2, 3, 6, 7, 8], dtype='int64')


## 인덱싱(Indexing)

In [29]:
s = pd.Series([0, 0.25 , 0.5 , 0.75 , 1.0],
             index = ['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [30]:
s['b']

0.25

In [31]:
'b' in s

True

In [32]:
s.keys()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [33]:
list(s.items())

[('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75), ('e', 1.0)]

In [34]:
s['f'] = 1.25
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
f    1.25
dtype: float64

In [35]:
s['a':'d']

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64

In [36]:
s[0:4]

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64

In [37]:
s[(s > 0.4) & (s < 0.8)]

c    0.50
d    0.75
dtype: float64

In [38]:
s[['a','c','d']]

a    0.00
c    0.50
d    0.75
dtype: float64

### Series 인덱싱

In [39]:
s = pd.Series(['a','b','c','d','e'],
             index = [1,3,5,7,9])
s

1    a
3    b
5    c
7    d
9    e
dtype: object

In [40]:
s[1]

'a'

In [41]:
s[2:4]

5    c
7    d
dtype: object

In [42]:
s.iloc[1]

'b'

In [43]:
s.iloc[2:4]

5    c
7    d
dtype: object

In [44]:
s.reindex(range(10))

0    NaN
1      a
2    NaN
3      b
4    NaN
5      c
6    NaN
7      d
8    NaN
9      e
dtype: object

In [45]:
s.reindex(range(10), method='bfill')

0    a
1    a
2    b
3    b
4    c
5    c
6    d
7    d
8    e
9    e
dtype: object

### DataFrame 인덱싱


In [46]:
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수
서울특별시,9720846,720846,1720846
부산광역시,3404423,404423,1404423
인천광역시,2947217,947217,1947217
대구광역시,2427954,427954,4227954
대전광역시,1471040,471040,1471040
광주광역시,1455048,455048,1455048


In [47]:
korea_df['남자인구수']

서울특별시    720846
부산광역시    404423
인천광역시    947217
대구광역시    427954
대전광역시    471040
광주광역시    455048
Name: 남자인구수, dtype: int64

In [48]:
korea_df.남자인구수

서울특별시    720846
부산광역시    404423
인천광역시    947217
대구광역시    427954
대전광역시    471040
광주광역시    455048
Name: 남자인구수, dtype: int64

In [49]:
korea_df.여자인구수

서울특별시    1720846
부산광역시    1404423
인천광역시    1947217
대구광역시    4227954
대전광역시    1471040
광주광역시    1455048
Name: 여자인구수, dtype: int64

In [50]:
korea_df['남녀비율'] = (korea_df.남자인구수*100 / korea_df.여자인구수)

In [51]:
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율
서울특별시,9720846,720846,1720846,41.889048
부산광역시,3404423,404423,1404423,28.796381
인천광역시,2947217,947217,1947217,48.644655
대구광역시,2427954,427954,4227954,10.122012
대전광역시,1471040,471040,1471040,32.020883
광주광역시,1455048,455048,1455048,31.273745


In [52]:
korea_df.values

array([[9.72084600e+06, 7.20846000e+05, 1.72084600e+06, 4.18890476e+01],
       [3.40442300e+06, 4.04423000e+05, 1.40442300e+06, 2.87963811e+01],
       [2.94721700e+06, 9.47217000e+05, 1.94721700e+06, 4.86446554e+01],
       [2.42795400e+06, 4.27954000e+05, 4.22795400e+06, 1.01220117e+01],
       [1.47104000e+06, 4.71040000e+05, 1.47104000e+06, 3.20208832e+01],
       [1.45504800e+06, 4.55048000e+05, 1.45504800e+06, 3.12737449e+01]])

In [53]:
korea_df.T

Unnamed: 0,서울특별시,부산광역시,인천광역시,대구광역시,대전광역시,광주광역시
인구수,9720846.0,3404423.0,2947217.0,2427954.0,1471040.0,1455048.0
남자인구수,720846.0,404423.0,947217.0,427954.0,471040.0,455048.0
여자인구수,1720846.0,1404423.0,1947217.0,4227954.0,1471040.0,1455048.0
남녀비율,41.88905,28.79638,48.64466,10.12201,32.02088,31.27374


In [54]:
korea_df.values[0] # 서울특별시만

array([9.72084600e+06, 7.20846000e+05, 1.72084600e+06, 4.18890476e+01])

In [55]:
korea_df['인구수']

서울특별시    9720846
부산광역시    3404423
인천광역시    2947217
대구광역시    2427954
대전광역시    1471040
광주광역시    1455048
Name: 인구수, dtype: int64

In [56]:
korea_df.loc[:'인천광역시',:'남자인구수']

Unnamed: 0,인구수,남자인구수
서울특별시,9720846,720846
부산광역시,3404423,404423
인천광역시,2947217,947217


In [57]:
korea_df.loc[(korea_df.여자인구수 > 1000000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율
서울특별시,9720846,720846,1720846,41.889048
부산광역시,3404423,404423,1404423,28.796381
인천광역시,2947217,947217,1947217,48.644655
대구광역시,2427954,427954,4227954,10.122012
대전광역시,1471040,471040,1471040,32.020883
광주광역시,1455048,455048,1455048,31.273745


In [58]:
korea_df.loc[(korea_df.인구수 < 2000000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율
대전광역시,1471040,471040,1471040,32.020883
광주광역시,1455048,455048,1455048,31.273745


In [59]:
korea_df.loc[(korea_df.인구수 >= 2500000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율
서울특별시,9720846,720846,1720846,41.889048
부산광역시,3404423,404423,1404423,28.796381
인천광역시,2947217,947217,1947217,48.644655


In [60]:
korea_df.loc[korea_df.남녀비율>100]

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율


In [61]:
korea_df.loc[(korea_df.인구수 > 2500000) & (korea_df.남녀비율 < 100)]

Unnamed: 0,인구수,남자인구수,여자인구수,남녀비율
서울특별시,9720846,720846,1720846,41.889048
부산광역시,3404423,404423,1404423,28.796381
인천광역시,2947217,947217,1947217,48.644655


In [62]:
korea_df.iloc[:3,:2]

Unnamed: 0,인구수,남자인구수
서울특별시,9720846,720846
부산광역시,3404423,404423
인천광역시,2947217,947217


### 다중 인덱싱(Multi Indexing)

* 1차원의 Series와 2차원의 DataFrame 객체를 넘어 3차원, 4차원 이상의 고차원 데이터 처리
* 단일 인덱스 내에 여러 인덱스를 포함하는 다중 인덱싱

#### 다중 인덱스 Series

In [63]:
idx_tuples = [('서울특별시', 2010), ('서울특별시',2020),
             ('부산광역시', 2010), ('부산광역시',2020),
             ('인천광역시', 2010), ('인천광역시',2020),
             ('대구광역시', 2010), ('대구광역시',2020),
             ('대전광역시', 2010), ('대전광역시',2020),
             ('광주광역시', 2010), ('광주광역시',2020),]

In [64]:
idx_tuples

[('서울특별시', 2010),
 ('서울특별시', 2020),
 ('부산광역시', 2010),
 ('부산광역시', 2020),
 ('인천광역시', 2010),
 ('인천광역시', 2020),
 ('대구광역시', 2010),
 ('대구광역시', 2020),
 ('대전광역시', 2010),
 ('대전광역시', 2020),
 ('광주광역시', 2010),
 ('광주광역시', 2020)]

In [65]:
pop_tuples = [10312545, 9720846,
             2567910, 3404423,
             2758296, 2947217,
             2511676, 2427954,
             1503664, 1471040,
             1454636, 1455048]
population = pd.Series(pop_tuples, index = idx_tuples)

In [66]:
population

(서울특별시, 2010)    10312545
(서울특별시, 2020)     9720846
(부산광역시, 2010)     2567910
(부산광역시, 2020)     3404423
(인천광역시, 2010)     2758296
(인천광역시, 2020)     2947217
(대구광역시, 2010)     2511676
(대구광역시, 2020)     2427954
(대전광역시, 2010)     1503664
(대전광역시, 2020)     1471040
(광주광역시, 2010)     1454636
(광주광역시, 2020)     1455048
dtype: int64

In [67]:
midx = pd.MultiIndex.from_tuples(idx_tuples)
midx

MultiIndex([('서울특별시', 2010),
            ('서울특별시', 2020),
            ('부산광역시', 2010),
            ('부산광역시', 2020),
            ('인천광역시', 2010),
            ('인천광역시', 2020),
            ('대구광역시', 2010),
            ('대구광역시', 2020),
            ('대전광역시', 2010),
            ('대전광역시', 2020),
            ('광주광역시', 2010),
            ('광주광역시', 2020)],
           )

In [68]:
population = population.reindex(midx)
population

서울특별시  2010    10312545
       2020     9720846
부산광역시  2010     2567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [69]:
population[:, 2010]

서울특별시    10312545
부산광역시     2567910
인천광역시     2758296
대구광역시     2511676
대전광역시     1503664
광주광역시     1454636
dtype: int64

In [70]:
population['대전광역시',:]

2010    1503664
2020    1471040
dtype: int64

In [71]:
korea_mdf = population.unstack()
korea_mdf

Unnamed: 0,2010,2020
광주광역시,1454636,1455048
대구광역시,2511676,2427954
대전광역시,1503664,1471040
부산광역시,2567910,3404423
서울특별시,10312545,9720846
인천광역시,2758296,2947217


In [72]:
korea_mdf.stack()

광주광역시  2010     1454636
       2020     1455048
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
부산광역시  2010     2567910
       2020     3404423
서울특별시  2010    10312545
       2020     9720846
인천광역시  2010     2758296
       2020     2947217
dtype: int64

In [73]:
male_tuples = [5111259, 4732275,
             1773170, 1668618,
             1390356, 1476813,
             1255245, 1198815,
             753648, 734441,
             721780, 720060]
male_tuples

[5111259,
 4732275,
 1773170,
 1668618,
 1390356,
 1476813,
 1255245,
 1198815,
 753648,
 734441,
 721780,
 720060]

In [74]:
korea_mdf = pd.DataFrame({'총인구수':population,
                         '남자인구수':male_tuples})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수
서울특별시,2010,10312545,5111259
서울특별시,2020,9720846,4732275
부산광역시,2010,2567910,1773170
부산광역시,2020,3404423,1668618
인천광역시,2010,2758296,1390356
인천광역시,2020,2947217,1476813
대구광역시,2010,2511676,1255245
대구광역시,2020,2427954,1198815
대전광역시,2010,1503664,753648
대전광역시,2020,1471040,734441


In [75]:
female_tuples = [5201286, 4988571,
                1794740, 1735805,
                1367940, 1470404,
                1256431, 1229139,
                750016, 736599,
                732856, 734988]
female_tuples

[5201286,
 4988571,
 1794740,
 1735805,
 1367940,
 1470404,
 1256431,
 1229139,
 750016,
 736599,
 732856,
 734988]

In [76]:
korea_mdf = pd.DataFrame({'총인구수':population,
                         '남자인구수':male_tuples,
                         '여자인구수':female_tuples,})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수,여자인구수
서울특별시,2010,10312545,5111259,5201286
서울특별시,2020,9720846,4732275,4988571
부산광역시,2010,2567910,1773170,1794740
부산광역시,2020,3404423,1668618,1735805
인천광역시,2010,2758296,1390356,1367940
인천광역시,2020,2947217,1476813,1470404
대구광역시,2010,2511676,1255245,1256431
대구광역시,2020,2427954,1198815,1229139
대전광역시,2010,1503664,753648,750016
대전광역시,2020,1471040,734441,736599


In [77]:
ratio = korea_mdf['남자인구수'] * 100 / korea_mdf['여자인구수']
ratio

서울특별시  2010     98.269140
       2020     94.862336
부산광역시  2010     98.798155
       2020     96.129346
인천광역시  2010    101.638668
       2020    100.435867
대구광역시  2010     99.905606
       2020     97.532907
대전광역시  2010    100.484256
       2020     99.707032
광주광역시  2010     98.488653
       2020     97.968946
dtype: float64

In [78]:
korea_mdf = pd.DataFrame({'총인구수':population,
                         '남자인구수':male_tuples,
                         '여자인구수':female_tuples,
                         '남녀비율':ratio})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수,여자인구수,남녀비율
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336
부산광역시,2010,2567910,1773170,1794740,98.798155
부산광역시,2020,3404423,1668618,1735805,96.129346
인천광역시,2010,2758296,1390356,1367940,101.638668
인천광역시,2020,2947217,1476813,1470404,100.435867
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,753648,750016,100.484256
대전광역시,2020,1471040,734441,736599,99.707032


In [79]:
ratio.unstack()

Unnamed: 0,2010,2020
광주광역시,98.488653,97.968946
대구광역시,99.905606,97.532907
대전광역시,100.484256,99.707032
부산광역시,98.798155,96.129346
서울특별시,98.26914,94.862336
인천광역시,101.638668,100.435867


#### 다중 인덱스 생성

In [80]:
df = pd.DataFrame(np.random.rand(6,3),
                 index = [['a','a','b','b','c','c'], [1,2,1,2,1,2,]],
                 columns = ['c1','c2','c3'])
df

Unnamed: 0,Unnamed: 1,c1,c2,c3
a,1,0.527172,0.053476,0.663152
a,2,0.655954,0.074316,0.173721
b,1,0.336526,0.436813,0.448837
b,2,0.042772,0.881854,0.632421
c,1,0.809558,0.836976,0.905595
c,2,0.20549,0.155449,0.355554


In [81]:
pd.MultiIndex.from_arrays([['a','a','b','b','c','c'], [1,2,1,2,1,2,]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [82]:
pd.MultiIndex.from_product([['a','b','c'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [83]:
pd.MultiIndex.from_product

<bound method MultiIndex.from_product of <class 'pandas.core.indexes.multi.MultiIndex'>>

In [84]:
pd.MultiIndex(levels=[['a','b','c'], [1,2]],
             codes = [[0,0,1,1,2,2], [0,1,0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [85]:
population.index.names = ['행정구역', '년도']
population

행정구역   년도  
서울특별시  2010    10312545
       2020     9720846
부산광역시  2010     2567910
       2020     3404423
인천광역시  2010     2758296
       2020     2947217
대구광역시  2010     2511676
       2020     2427954
대전광역시  2010     1503664
       2020     1471040
광주광역시  2010     1454636
       2020     1455048
dtype: int64

In [86]:
idx = pd.MultiIndex.from_product([['a','b','c'],[1,2]],
                                names= ['name1','name2'])
cols = pd.MultiIndex.from_product([['c1','c2','c3'], [1,2]],
                                 names = ['col_name1','col_name2'])
data = np.round(np.random.randn(6, 6,),2)
mdf = pd.DataFrame(data, index = idx, columns = cols)
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,0.02,-0.0,0.24,1.51,1.62,1.19
a,2,0.18,0.63,-0.45,0.03,-2.01,1.09
b,1,-0.92,-2.04,1.38,-1.65,0.13,0.47
b,2,-0.18,0.42,-0.27,0.49,-0.62,0.7
c,1,-0.06,-0.93,-1.55,-2.14,-1.12,-0.84
c,2,0.63,-1.09,1.41,-0.67,0.46,0.22


#### 인덱싱 및 슬라이싱

In [87]:
population['인천광역시', 2010]

2758296

In [88]:
population[:, 2010]

행정구역
서울특별시    10312545
부산광역시     2567910
인천광역시     2758296
대구광역시     2511676
대전광역시     1503664
광주광역시     1454636
dtype: int64

In [89]:
population[population > 3000000]

행정구역   년도  
서울특별시  2010    10312545
       2020     9720846
부산광역시  2020     3404423
dtype: int64

In [90]:
population[['대구광역시','부산광역시']]

행정구역   년도  
대구광역시  2010    2511676
       2020    2427954
부산광역시  2010    2567910
       2020    3404423
dtype: int64

In [91]:
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,0.02,-0.0,0.24,1.51,1.62,1.19
a,2,0.18,0.63,-0.45,0.03,-2.01,1.09
b,1,-0.92,-2.04,1.38,-1.65,0.13,0.47
b,2,-0.18,0.42,-0.27,0.49,-0.62,0.7
c,1,-0.06,-0.93,-1.55,-2.14,-1.12,-0.84
c,2,0.63,-1.09,1.41,-0.67,0.46,0.22


In [92]:
mdf['c2',1]

name1  name2
a      1        0.24
       2       -0.45
b      1        1.38
       2       -0.27
c      1       -1.55
       2        1.41
Name: (c2, 1), dtype: float64

In [93]:
mdf.iloc[:3,:4]

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2
Unnamed: 0_level_1,col_name2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0.02,-0.0,0.24,1.51
a,2,0.18,0.63,-0.45,0.03
b,1,-0.92,-2.04,1.38,-1.65


In [94]:
mdf.loc[:, ('c2',1)]

name1  name2
a      1        0.24
       2       -0.45
b      1        1.38
       2       -0.27
c      1       -1.55
       2        1.41
Name: (c2, 1), dtype: float64

In [95]:
idx_slice = pd.IndexSlice

In [96]:
mdf.loc[idx_slice[:,2], idx_slice[:,2]]

Unnamed: 0_level_0,col_name1,c1,c2,c3
Unnamed: 0_level_1,col_name2,2,2,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,2,0.63,0.03,1.09
b,2,0.42,0.49,0.7
c,2,-1.09,-0.67,0.22


#### 다중 인덱스 재정렬

In [97]:
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           names=['name1', 'name2'])

In [98]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336
부산광역시,2010,2567910,1773170,1794740,98.798155
부산광역시,2020,3404423,1668618,1735805,96.129346
인천광역시,2010,2758296,1390356,1367940,101.638668
인천광역시,2020,2947217,1476813,1470404,100.435867
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,753648,750016,100.484256
대전광역시,2020,1471040,734441,736599,99.707032


In [99]:
# korea_mdf['서울특별시':'인천광역시'] # 정렬을 해줘야 함.
korea_mdf = korea_mdf.sort_index()
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,753648,750016,100.484256
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,2567910,1773170,1794740,98.798155
부산광역시,2020,3404423,1668618,1735805,96.129346
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336


In [100]:
korea_mdf['서울특별시':'인천광역시']

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336
인천광역시,2010,2758296,1390356,1367940,101.638668
인천광역시,2020,2947217,1476813,1470404,100.435867


In [101]:
korea_mdf.unstack(level=0)

Unnamed: 0_level_0,총인구수,총인구수,총인구수,총인구수,총인구수,총인구수,남자인구수,남자인구수,남자인구수,남자인구수,...,여자인구수,여자인구수,여자인구수,여자인구수,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율,남녀비율
행정구역,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,...,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시
년도,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,1454636,2511676,1503664,2567910,10312545,2758296,721780,1255245,753648,1773170,...,750016,1794740,5201286,1367940,98.488653,99.905606,100.484256,98.798155,98.26914,101.638668
2020,1455048,2427954,1471040,3404423,9720846,2947217,720060,1198815,734441,1668618,...,736599,1735805,4988571,1470404,97.968946,97.532907,99.707032,96.129346,94.862336,100.435867


In [102]:
korea_mdf.unstack(level=1)

Unnamed: 0_level_0,총인구수,총인구수,남자인구수,남자인구수,여자인구수,여자인구수,남녀비율,남녀비율
년도,2010,2020,2010,2020,2010,2020,2010,2020
행정구역,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
광주광역시,1454636,1455048,721780,720060,732856,734988,98.488653,97.968946
대구광역시,2511676,2427954,1255245,1198815,1256431,1229139,99.905606,97.532907
대전광역시,1503664,1471040,753648,734441,750016,736599,100.484256,99.707032
부산광역시,2567910,3404423,1773170,1668618,1794740,1735805,98.798155,96.129346
서울특별시,10312545,9720846,5111259,4732275,5201286,4988571,98.26914,94.862336
인천광역시,2758296,2947217,1390356,1476813,1367940,1470404,101.638668,100.435867


In [103]:
korea_mdf.stack()

행정구역   년도         
광주광역시  2010  총인구수     1.454636e+06
             남자인구수    7.217800e+05
             여자인구수    7.328560e+05
             남녀비율     9.848865e+01
       2020  총인구수     1.455048e+06
             남자인구수    7.200600e+05
             여자인구수    7.349880e+05
             남녀비율     9.796895e+01
대구광역시  2010  총인구수     2.511676e+06
             남자인구수    1.255245e+06
             여자인구수    1.256431e+06
             남녀비율     9.990561e+01
       2020  총인구수     2.427954e+06
             남자인구수    1.198815e+06
             여자인구수    1.229139e+06
             남녀비율     9.753291e+01
대전광역시  2010  총인구수     1.503664e+06
             남자인구수    7.536480e+05
             여자인구수    7.500160e+05
             남녀비율     1.004843e+02
       2020  총인구수     1.471040e+06
             남자인구수    7.344410e+05
             여자인구수    7.365990e+05
             남녀비율     9.970703e+01
부산광역시  2010  총인구수     2.567910e+06
             남자인구수    1.773170e+06
             여자인구수    1.794740e+06
             남녀비율     9.879815e+01
 

In [104]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,753648,750016,100.484256
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,2567910,1773170,1794740,98.798155
부산광역시,2020,3404423,1668618,1735805,96.129346
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336


In [105]:
idx_flat = korea_mdf.reset_index()

In [106]:
idx_flat # flat한 형태로 변경됨

Unnamed: 0,행정구역,년도,총인구수,남자인구수,여자인구수,남녀비율
0,광주광역시,2010,1454636,721780,732856,98.488653
1,광주광역시,2020,1455048,720060,734988,97.968946
2,대구광역시,2010,2511676,1255245,1256431,99.905606
3,대구광역시,2020,2427954,1198815,1229139,97.532907
4,대전광역시,2010,1503664,753648,750016,100.484256
5,대전광역시,2020,1471040,734441,736599,99.707032
6,부산광역시,2010,2567910,1773170,1794740,98.798155
7,부산광역시,2020,3404423,1668618,1735805,96.129346
8,서울특별시,2010,10312545,5111259,5201286,98.26914
9,서울특별시,2020,9720846,4732275,4988571,94.862336


In [107]:
idx_flat = korea_mdf.reset_index(level = (0 ,1))

In [108]:
idx_flat

Unnamed: 0,행정구역,년도,총인구수,남자인구수,여자인구수,남녀비율
0,광주광역시,2010,1454636,721780,732856,98.488653
1,광주광역시,2020,1455048,720060,734988,97.968946
2,대구광역시,2010,2511676,1255245,1256431,99.905606
3,대구광역시,2020,2427954,1198815,1229139,97.532907
4,대전광역시,2010,1503664,753648,750016,100.484256
5,대전광역시,2020,1471040,734441,736599,99.707032
6,부산광역시,2010,2567910,1773170,1794740,98.798155
7,부산광역시,2020,3404423,1668618,1735805,96.129346
8,서울특별시,2010,10312545,5111259,5201286,98.26914
9,서울특별시,2020,9720846,4732275,4988571,94.862336


In [109]:
idx_flat.set_index(['행정구역','년도'])

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남녀비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1454636,721780,732856,98.488653
광주광역시,2020,1455048,720060,734988,97.968946
대구광역시,2010,2511676,1255245,1256431,99.905606
대구광역시,2020,2427954,1198815,1229139,97.532907
대전광역시,2010,1503664,753648,750016,100.484256
대전광역시,2020,1471040,734441,736599,99.707032
부산광역시,2010,2567910,1773170,1794740,98.798155
부산광역시,2020,3404423,1668618,1735805,96.129346
서울특별시,2010,10312545,5111259,5201286,98.26914
서울특별시,2020,9720846,4732275,4988571,94.862336


## 데이터 연산

In [110]:
s = pd.Series(np.random.randint(0, 10,5))
s

0    4
1    3
2    2
3    5
4    6
dtype: int32

In [111]:
df = pd.DataFrame(np.random.randint(0,10, (3,3)),
                  columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,6,7,6
1,9,2,7
2,9,8,6


In [112]:
np.exp(s)

0     54.598150
1     20.085537
2      7.389056
3    148.413159
4    403.428793
dtype: float64

In [113]:
np.cos(df*np.pi/4)

Unnamed: 0,A,B,C
0,-1.83697e-16,0.7071068,-1.83697e-16
1,0.7071068,6.123234000000001e-17,0.7071068
2,0.7071068,1.0,-1.83697e-16


In [114]:
s1 = pd.Series([1,3,5,7,9], index = [0,1,2,3,4])
s2 = pd.Series([2,4,6,8,10], index = [1,2,3,4,5])
s1+s2

0     NaN
1     5.0
2     9.0
3    13.0
4    17.0
5     NaN
dtype: float64

In [115]:
s1.add(s2, fill_value = 0)

0     1.0
1     5.0
2     9.0
3    13.0
4    17.0
5    10.0
dtype: float64

In [116]:
df1 = pd.DataFrame(np.random.randint(0,20,(3,3)),
                  columns=list('ACD'))
df1

Unnamed: 0,A,C,D
0,2,2,0
1,4,18,3
2,2,0,11


In [117]:
df2 = pd.DataFrame(np.random.randint(0,20,(5,5)),
                  columns=list('BAECD'))
df2

Unnamed: 0,B,A,E,C,D
0,4,12,8,18,6
1,16,1,2,16,3
2,14,10,17,11,1
3,1,0,15,12,12
4,10,18,9,13,18


In [118]:
df1 + df2 

Unnamed: 0,A,B,C,D,E
0,14.0,,20.0,6.0,
1,5.0,,34.0,6.0,
2,12.0,,11.0,12.0,
3,,,,,
4,,,,,


In [119]:
fvalue = df1.stack().mean()
df1.add(df2, fill_value = fvalue)

Unnamed: 0,A,B,C,D,E
0,14.0,8.666667,20.0,6.0,12.666667
1,5.0,20.666667,34.0,6.0,6.666667
2,12.0,18.666667,11.0,12.0,21.666667
3,4.666667,5.666667,16.666667,16.666667,19.666667
4,22.666667,14.666667,17.666667,22.666667,13.666667


### 연산자 범용 함수


#### add()

In [120]:
a = np.random.randint(1,10, size=(3,3))
a

array([[9, 7, 1],
       [5, 8, 1],
       [3, 8, 5]])

In [121]:
a + a[0]

array([[18, 14,  2],
       [14, 15,  2],
       [12, 15,  6]])

In [122]:
df = pd.DataFrame(a , columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [123]:
df+df.iloc[0]

Unnamed: 0,A,B,C
0,18,14,2
1,14,15,2
2,12,15,6


In [124]:
df.add(df.iloc[0])

Unnamed: 0,A,B,C
0,18,14,2
1,14,15,2
2,12,15,6


#### sub() / subtract()

In [125]:
a

array([[9, 7, 1],
       [5, 8, 1],
       [3, 8, 5]])

In [126]:
a-a[0]

array([[ 0,  0,  0],
       [-4,  1,  0],
       [-6,  1,  4]])

In [127]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [128]:
df-df.iloc[0]

Unnamed: 0,A,B,C
0,0,0,0
1,-4,1,0
2,-6,1,4


In [129]:
df.sub(df.iloc[0])

Unnamed: 0,A,B,C
0,0,0,0
1,-4,1,0
2,-6,1,4


In [130]:
df.subtract(df['B'], axis=0)

Unnamed: 0,A,B,C
0,2,0,-6
1,-3,0,-7
2,-5,0,-3


#### mul() / multply()




In [131]:
a

array([[9, 7, 1],
       [5, 8, 1],
       [3, 8, 5]])

In [132]:
a*a[1]

array([[45, 56,  1],
       [25, 64,  1],
       [15, 64,  5]])

In [133]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [134]:
df*df.iloc[1]

Unnamed: 0,A,B,C
0,45,56,1
1,25,64,1
2,15,64,5


In [135]:
df.mul(df.iloc[1])

Unnamed: 0,A,B,C
0,45,56,1
1,25,64,1
2,15,64,5


In [136]:
df.multiply(df.iloc[1])

Unnamed: 0,A,B,C
0,45,56,1
1,25,64,1
2,15,64,5


#### truediv() /  div() / divide() / floordiv()

In [137]:
a

array([[9, 7, 1],
       [5, 8, 1],
       [3, 8, 5]])

In [138]:
a / a[0]

array([[1.        , 1.        , 1.        ],
       [0.55555556, 1.14285714, 1.        ],
       [0.33333333, 1.14285714, 5.        ]])

In [139]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [140]:
df / df.iloc[0]

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.555556,1.142857,1.0
2,0.333333,1.142857,5.0


In [141]:
df.truediv(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.555556,1.142857,1.0
2,0.333333,1.142857,5.0


In [142]:
df.div(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.555556,1.142857,1.0
2,0.333333,1.142857,5.0


In [143]:
a // a[0]

array([[1, 1, 1],
       [0, 1, 1],
       [0, 1, 5]], dtype=int32)

In [144]:
df.floordiv(df.iloc[1])

Unnamed: 0,A,B,C
0,1,0,1
1,1,1,1
2,0,1,5


#### mod()

In [145]:
a % a[0]

array([[0, 0, 0],
       [5, 1, 0],
       [3, 1, 0]], dtype=int32)

In [146]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [147]:
df.mod(df.iloc[1])

Unnamed: 0,A,B,C
0,4,7,0
1,0,0,0
2,3,0,0


#### pow()

In [148]:
a ** a[0]

array([[387420489,    823543,         1],
       [  1953125,   2097152,         1],
       [    19683,   2097152,         5]], dtype=int32)

In [149]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [150]:
df.pow(df.iloc[1])

Unnamed: 0,A,B,C
0,59049,5764801,1
1,3125,16777216,1
2,243,16777216,5


In [151]:
df

Unnamed: 0,A,B,C
0,9,7,1
1,5,8,1
2,3,8,5


In [152]:
row = df.iloc[0, ::2]
row

A    9
C    1
Name: 0, dtype: int32

In [153]:
df - row

Unnamed: 0,A,B,C
0,0.0,,0.0
1,-4.0,,0.0
2,-6.0,,4.0


### 정렬(Sort)

In [154]:
s = pd.Series(range(5), index=['A','B','C','D','E'])
s

A    0
B    1
C    2
D    3
E    4
dtype: int64

In [155]:
s.sort_values()

A    0
B    1
C    2
D    3
E    4
dtype: int64

In [156]:
df = pd.DataFrame(np.random.randint(0, 10 ,(4,4)),
                 index=[2,4,1,3],
                 columns = list('BDAC'))
df

Unnamed: 0,B,D,A,C
2,4,5,0,1
4,1,6,9,7
1,0,5,4,2
3,0,0,1,3


In [157]:
df.sort_index()

Unnamed: 0,B,D,A,C
1,0,5,4,2
2,4,5,0,1
3,0,0,1,3
4,1,6,9,7


In [158]:
df.sort_index(axis =1)

Unnamed: 0,A,B,C,D
2,0,4,1,5
4,9,1,7,6
1,4,0,2,5
3,1,0,3,0


In [159]:
df.sort_values(by='A')

Unnamed: 0,B,D,A,C
2,4,5,0,1
3,0,0,1,3
1,0,5,4,2
4,1,6,9,7


In [160]:
df.sort_values(by=['A','C'])

Unnamed: 0,B,D,A,C
2,4,5,0,1
3,0,0,1,3
1,0,5,4,2
4,1,6,9,7


### 순위(Ranking)


In [161]:
s = pd.Series([1,3,32,45,4.2,4,234,43,23,343,43,4,43,33])
s

0       1.0
1       3.0
2      32.0
3      45.0
4       4.2
5       4.0
6     234.0
7      43.0
8      23.0
9     343.0
10     43.0
11      4.0
12     43.0
13     33.0
dtype: float64

In [162]:
s.rank()

0      1.0
1      2.0
2      7.0
3     12.0
4      5.0
5      3.5
6     13.0
7     10.0
8      6.0
9     14.0
10    10.0
11     3.5
12    10.0
13     8.0
dtype: float64

In [163]:
s.rank(method='first')

0      1.0
1      2.0
2      7.0
3     12.0
4      5.0
5      3.0
6     13.0
7      9.0
8      6.0
9     14.0
10    10.0
11     4.0
12    11.0
13     8.0
dtype: float64

In [164]:
s.rank(method='max')

0      1.0
1      2.0
2      7.0
3     12.0
4      5.0
5      4.0
6     13.0
7     11.0
8      6.0
9     14.0
10    11.0
11     4.0
12    11.0
13     8.0
dtype: float64

### 고성능 연산

In [165]:
nrows, ncols = 100000, 100
df1, df2, df3 , df4 = (pd.DataFrame(np.random.rand(nrows, ncols)) for i in range(4))

In [166]:
%timeit df1 + df2 + df3 +df4

100 ms ± 3.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [167]:
%timeit pd.eval('df1+df2+df3+df4')

41.1 ms ± 382 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [168]:
%timeit df1 * df2 / (-df3 * df4)

143 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [169]:
%timeit pd.eval('df1 * df2 / (-df3 * df4)')

42.9 ms ± 312 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%timeit pd.eval('(df1 < df2) & (df2 <= df3) & df3 != df4')

In [173]:
df = pd.DataFrame(np.random.rand(1000000, 5),
                 columns = ['A','B','C','D','E'])

In [175]:
%timeit pd.eval('df.A + df.B / df.C -df.D*df.E')

7.13 ms ± 321 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [176]:
%time df.eval('A + B / C - D * E')

Wall time: 10 ms


0         2.397416
1         1.124712
2         3.224759
3         0.391338
4         1.294773
            ...   
999995    0.624562
999996    1.516667
999997    1.317784
999998    0.864890
999999    0.311607
Length: 1000000, dtype: float64

In [174]:
df.eval('R = A + B / C - D * E', inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,R
0,0.998851,0.812826,0.494251,0.873627,0.281579,2.397416
1,0.672451,0.566863,0.939973,0.844198,0.178634,1.124712
2,0.10108,0.321851,0.096908,0.24796,0.796549,3.224759
3,0.206046,0.246968,0.600585,0.272307,0.829658,0.391338
4,0.211559,0.504707,0.314837,0.570089,0.911889,1.294773


In [177]:
col_mean = df.mean(1)
df['A'] + col_mean

0         1.975276
1         1.393589
2         0.899265
3         0.630530
4         0.846201
            ...   
999995    0.427285
999996    1.514367
999997    1.137532
999998    1.277033
999999    0.772393
Length: 1000000, dtype: float64

In [184]:
df.eval('A + @col_mean') # 외부 값 가져옴

0         1.975276
1         1.393589
2         0.899265
3         0.630530
4         0.846201
            ...   
999995    0.427285
999996    1.514367
999997    1.137532
999998    1.277033
999999    0.772393
Length: 1000000, dtype: float64

In [186]:
df[(df.A < 0.5) & (df.B > 0.7)]

Unnamed: 0,A,B,C,D,E,R
9,0.278807,0.844968,0.979162,0.310296,0.496119,0.987814
15,0.103176,0.771677,0.365725,0.484296,0.795448,1.827939
20,0.435996,0.795330,0.358488,0.512029,0.702146,2.295044
21,0.249166,0.947968,0.858405,0.015966,0.611194,1.343744
24,0.349947,0.736315,0.473777,0.798945,0.890529,1.192602
...,...,...,...,...,...,...
999962,0.123717,0.849943,0.992765,0.919698,0.355754,0.652668
999982,0.125535,0.874250,0.557773,0.065549,0.541445,1.657438
999987,0.460128,0.929979,0.172747,0.618546,0.690225,5.416658
999992,0.191482,0.855012,0.585890,0.568579,0.495303,1.369203


In [187]:
pd.eval('df[(df.A < 0.5) & (df.B > 0.7)]')

Unnamed: 0,A,B,C,D,E,R
9,0.278807,0.844968,0.979162,0.310296,0.496119,0.987814
15,0.103176,0.771677,0.365725,0.484296,0.795448,1.827939
20,0.435996,0.795330,0.358488,0.512029,0.702146,2.295044
21,0.249166,0.947968,0.858405,0.015966,0.611194,1.343744
24,0.349947,0.736315,0.473777,0.798945,0.890529,1.192602
...,...,...,...,...,...,...
999962,0.123717,0.849943,0.992765,0.919698,0.355754,0.652668
999982,0.125535,0.874250,0.557773,0.065549,0.541445,1.657438
999987,0.460128,0.929979,0.172747,0.618546,0.690225,5.416658
999992,0.191482,0.855012,0.585890,0.568579,0.495303,1.369203


In [192]:
df.query('(A>0.5) and (B<0.7)')

Unnamed: 0,A,B,C,D,E,R
1,0.672451,0.566863,0.939973,0.844198,0.178634,1.124712
5,0.709318,0.190029,0.114270,0.878874,0.876457,1.602009
6,0.871634,0.225475,0.462967,0.853605,0.433068,0.988987
7,0.671951,0.343986,0.866337,0.282838,0.707944,0.868775
8,0.555573,0.576699,0.175750,0.900827,0.562372,3.330336
...,...,...,...,...,...,...
999988,0.998775,0.600313,0.854146,0.632279,0.363143,1.471990
999989,0.963433,0.499432,0.633874,0.797409,0.745613,1.156778
999990,0.708452,0.264305,0.881671,0.330511,0.105194,0.973461
999994,0.658701,0.203534,0.313804,0.322307,0.430089,1.168681


In [193]:
col_mean = df['D'].mean()
df[(df.A < col_mean) & (df.B < col_mean)]

Unnamed: 0,A,B,C,D,E,R
2,0.101080,0.321851,0.096908,0.247960,0.796549,3.224759
3,0.206046,0.246968,0.600585,0.272307,0.829658,0.391338
14,0.470737,0.352541,0.398715,0.779199,0.043640,1.320925
16,0.424098,0.438626,0.852284,0.989951,0.604696,0.340127
17,0.262322,0.153580,0.389773,0.223617,0.040411,0.647310
...,...,...,...,...,...,...
999981,0.351942,0.109490,0.867608,0.709798,0.738505,-0.046050
999983,0.310883,0.465609,0.215866,0.538814,0.958055,1.951603
999984,0.251485,0.105785,0.122837,0.221586,0.492300,1.003581
999991,0.170853,0.264481,0.041053,0.470843,0.012759,6.607211


In [194]:
df.query('A < @col_mean and B < @col_mean')

Unnamed: 0,A,B,C,D,E,R
2,0.101080,0.321851,0.096908,0.247960,0.796549,3.224759
3,0.206046,0.246968,0.600585,0.272307,0.829658,0.391338
14,0.470737,0.352541,0.398715,0.779199,0.043640,1.320925
16,0.424098,0.438626,0.852284,0.989951,0.604696,0.340127
17,0.262322,0.153580,0.389773,0.223617,0.040411,0.647310
...,...,...,...,...,...,...
999981,0.351942,0.109490,0.867608,0.709798,0.738505,-0.046050
999983,0.310883,0.465609,0.215866,0.538814,0.958055,1.951603
999984,0.251485,0.105785,0.122837,0.221586,0.492300,1.003581
999991,0.170853,0.264481,0.041053,0.470843,0.012759,6.607211


## 데이터 결합

### Concat() / Append()

In [197]:
s1 = pd.Series(['a','b'], index = [1,2])
s1 = pd.Series(['c','d'], index = [3,4])
pd.concat([s1,s2])

3     c
4     d
1     2
2     4
3     6
4     8
5    10
dtype: object

In [211]:
def create_df(cols, idx):
    data = {c: [str(c.lower()) + str(i) for i in idx] for c in cols}
    return pd.DataFrame(data, index = idx)

In [212]:
df1 = create_df('AB',[1,2])
df2 = create_df('AB',[3,4])
df1

Unnamed: 0,A,B
1,a1,b1
2,a2,b2


In [202]:
df2

Unnamed: 0,A,B
3,a3,b3
4,a4,b4


In [204]:
pd.concat([df1, df2])

Unnamed: 0,A,B
1,a1,b1
2,a2,b2
3,a3,b3
4,a4,b4


In [205]:
df3 = create_df('AB',[0,1])
df3

Unnamed: 0,A,B
0,a0,b0
1,a1,b1


In [206]:
df4 = create_df('CD', [0,1])
df4

Unnamed: 0,C,D
0,c0,d0
1,c1,d1


In [207]:
pd.concat([df3, df4])

Unnamed: 0,A,B,C,D
0,a0,b0,,
1,a1,b1,,
0,,,c0,d0
1,,,c1,d1


In [209]:
# pd.concat([df1 ,df3], verify_integrity = True)

Unnamed: 0,A,B
1,a1,b1
2,a2,b2
0,a0,b0
1,a1,b1


In [210]:
pd.concat([df1, df3], ignore_index = True)

Unnamed: 0,A,B
0,a1,b1
1,a2,b2
2,a0,b0
3,a1,b1


In [213]:
pd.concat([df1, df2], keys=['x','y'])

Unnamed: 0,Unnamed: 1,A,B
x,1,a1,b1
x,2,a2,b2
y,3,a3,b3
y,4,a4,b4


In [215]:
df5 = create_df('ABC', [1,2])
df6 = create_df('BCD', [3,4])
pd.concat([df5,df6])

Unnamed: 0,A,B,C,D
1,a1,b1,c1,
2,a2,b2,c2,
3,,b3,c3,d3
4,,b4,c4,d4


In [216]:
# 1:54:36
pd.concat([df5, df6], join='inner') # 모두 존재하는 데이터만 inner join 함

Unnamed: 0,B,C
1,b1,c1
2,b2,c2
3,b3,c3
4,b4,c4


In [217]:
df5.append(df6)

Unnamed: 0,A,B,C,D
1,a1,b1,c1,
2,a2,b2,c2,
3,,b3,c3,d3
4,,b4,c4,d4


In [218]:
pd.concat([df3, df4], axis=1)

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1


In [221]:
pd.concat([df1 ,df3, df4])

Unnamed: 0,A,B,C,D
1,a1,b1,,
2,a2,b2,,
0,a0,b0,,
1,a1,b1,,
0,,,c0,d0
1,,,c1,d1


### 병합과 조인

In [223]:
df1 = pd.DataFrame({'학생':['솔이','박솔이','허솔이','이솔이'],
                    '학과' :['경영','회계','경제','영문']})
                          
df1

Unnamed: 0,학생,학과
0,솔이,경영
1,박솔이,회계
2,허솔이,경제
3,이솔이,영문


In [224]:
df2 = pd.DataFrame({'학생':['솔이','박솔이','허솔이','이솔이'],
                    '입학년도' :[2013,2014,2015,2016]})
                          
df2

Unnamed: 0,학생,입학년도
0,솔이,2013
1,박솔이,2014
2,허솔이,2015
3,이솔이,2016


In [225]:
df3 = pd.merge(df1 , df2)
df3

Unnamed: 0,학생,학과,입학년도
0,솔이,경영,2013
1,박솔이,회계,2014
2,허솔이,경제,2015
3,이솔이,영문,2016


In [229]:
df4 = pd.DataFrame({'학과':['경영','회계','경제','영문'],
                   '학과장':['응','너','유','재']})

In [230]:
df4

Unnamed: 0,학과,학과장
0,경영,응
1,회계,너
2,경제,유
3,영문,재


In [231]:
pd.merge(df3, df4)

Unnamed: 0,학생,학과,입학년도,학과장
0,솔이,경영,2013,응
1,박솔이,회계,2014,너
2,허솔이,경제,2015,유
3,이솔이,영문,2016,재


In [233]:
df5 = pd.DataFrame({'학과':['경영','교육','교육','컴퓨터','컴퓨터','통계'],
                   '과목' : ['경영개론','기초수학','통계학','미시경제학','프로그래밍','알고리즘']})

In [234]:
pd.merge(df1, df2 , on = '학생')

Unnamed: 0,학생,학과,입학년도
0,솔이,경영,2013
1,박솔이,회계,2014
2,허솔이,경제,2015
3,이솔이,영문,2016


In [235]:
df6 = pd.DataFrame({'이름':['솔이','박솔이','허솔이','이솔이'],
                   '성적':['A','B','C','D']})

In [238]:
pd.merge(df1, df6, left_on = '학생', right_on = '이름')

Unnamed: 0,학생,학과,이름,성적
0,솔이,경영,솔이,A
1,박솔이,회계,박솔이,B
2,허솔이,경제,허솔이,C
3,이솔이,영문,이솔이,D


In [239]:
pd.merge(df1, df6, left_on = '학생', right_on = '이름').drop('이름', axis=1)

Unnamed: 0,학생,학과,성적
0,솔이,경영,A
1,박솔이,회계,B
2,허솔이,경제,C
3,이솔이,영문,D


In [240]:
mdf1 = df1.set_index('학생')
mdf2 = df2.set_index('학생')

In [241]:
mdf1

Unnamed: 0_level_0,학과
학생,Unnamed: 1_level_1
솔이,경영
박솔이,회계
허솔이,경제
이솔이,영문


In [242]:
mdf2

Unnamed: 0_level_0,입학년도
학생,Unnamed: 1_level_1
솔이,2013
박솔이,2014
허솔이,2015
이솔이,2016


In [243]:
pd.merge(mdf1, mdf2, left_index = True, right_index = True)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
솔이,경영,2013
박솔이,회계,2014
허솔이,경제,2015
이솔이,영문,2016


In [244]:
mdf1.join(mdf2)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
솔이,경영,2013
박솔이,회계,2014
허솔이,경제,2015
이솔이,영문,2016


In [245]:
pd.merge(mdf1, df6, left_index = True, right_on ='이름')

Unnamed: 0,학과,이름,성적
0,경영,솔이,A
1,회계,박솔이,B
2,경제,허솔이,C
3,영문,이솔이,D


In [248]:
df7 = pd.DataFrame({'이름':['솔이','박솔이','이솔이'],
                   '주문음식' : ['피자','햄버거','자장면']})
df7

Unnamed: 0,이름,주문음식
0,솔이,피자
1,박솔이,햄버거
2,이솔이,자장면


In [247]:
df8 = pd.DataFrame({'이름':['솔이','박솔이','요솔이'],
                   '주문음료' : ['콜라','사이다','약콩']})
df8

Unnamed: 0,이름,주문음료
0,솔이,콜라
1,박솔이,사이다
2,요솔이,약콩


In [250]:
pd.merge(df7, df8)

Unnamed: 0,이름,주문음식,주문음료
0,솔이,피자,콜라
1,박솔이,햄버거,사이다


In [251]:
pd.merge(df7, df8, how='outer')

Unnamed: 0,이름,주문음식,주문음료
0,솔이,피자,콜라
1,박솔이,햄버거,사이다
2,이솔이,자장면,
3,요솔이,,약콩


In [252]:
pd.merge(df7, df8, how='left')

Unnamed: 0,이름,주문음식,주문음료
0,솔이,피자,콜라
1,박솔이,햄버거,사이다
2,이솔이,자장면,


In [253]:
pd.merge(df7, df8, how='right')

Unnamed: 0,이름,주문음식,주문음료
0,솔이,피자,콜라
1,박솔이,햄버거,사이다
2,요솔이,,약콩


In [255]:
df9 = pd.DataFrame({'이름':['솔이','박솔이','이솔이','정솔이'],
                   '순위':[4,1,3,2]})
df9

Unnamed: 0,이름,순위
0,솔이,4
1,박솔이,1
2,이솔이,3
3,정솔이,2


In [257]:
df10 = pd.DataFrame({'이름':['솔이','박솔이','이솔이','정솔이'],
                   '순위':[1,2,3,4]})
df10

Unnamed: 0,이름,순위
0,솔이,1
1,박솔이,2
2,이솔이,3
3,정솔이,4


In [258]:
pd.merge(df9,df10, on='이름')

Unnamed: 0,이름,순위_x,순위_y
0,솔이,4,1
1,박솔이,1,2
2,이솔이,3,3
3,정솔이,2,4


In [259]:
pd.merge(df9,df10, on='이름', suffixes=['_인기','_성적'])

Unnamed: 0,이름,순위_인기,순위_성적
0,솔이,4,1
1,박솔이,1,2
2,이솔이,3,3
3,정솔이,2,4


## 데이터 집계와 그룹 연산

#### 집계 연산(Aggregation) 2:15


In [263]:
df = pd.DataFrame([[1,1.2,np.nan],
                 [2.4,5.5,4.2],
                 [np.nan,np.nan,np.nan],
                 [0.44 , -3.1, -4.1]],
                 index = [1,2,3,4],
                 columns = ['A','B','C'])
df

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2
3,,,
4,0.44,-3.1,-4.1


In [265]:
df.head(2)

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2


In [266]:
df.tail(2)

Unnamed: 0,A,B,C
3,,,
4,0.44,-3.1,-4.1


In [267]:
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,2.0
mean,1.28,1.2,0.05
std,1.009554,4.3,5.868986
min,0.44,-3.1,-4.1
25%,0.72,-0.95,-2.025
50%,1.0,1.2,0.05
75%,1.7,3.35,2.125
max,2.4,5.5,4.2


In [269]:
print(df)
print(np.argmin(df), np.argmax(df))

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
2 2


In [270]:
print(df)
print(df.idxmin())
print(df.idxmax())

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    4
B    4
C    4
dtype: int64
A    2
B    2
C    2
dtype: int64


In [271]:
print(df)
print(df.std())
print(df.var())

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    1.009554
B    4.300000
C    5.868986
dtype: float64
A     1.0192
B    18.4900
C    34.4450
dtype: float64


In [272]:
print(df)
print(df.skew())
print(df.kurt())

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    1.15207
B    0.00000
C        NaN
dtype: float64
A   NaN
B   NaN
C   NaN
dtype: float64


In [274]:
print(df)
print(df.sum())
print(df.cumsum())

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A    3.84
B    3.60
C    0.10
dtype: float64
      A    B    C
1  1.00  1.2  NaN
2  3.40  6.7  4.2
3   NaN  NaN  NaN
4  3.84  3.6  0.1


In [275]:
print(df)
print(df.prod())
print(df.cumprod())

      A    B    C
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
A     1.056
B   -20.460
C   -17.220
dtype: float64
       A      B      C
1  1.000   1.20    NaN
2  2.400   6.60   4.20
3    NaN    NaN    NaN
4  1.056 -20.46 -17.22


In [276]:
df.diff()

Unnamed: 0,A,B,C
1,,,
2,1.4,4.3,
3,,,
4,,,


In [277]:
df.quantile()

A    1.00
B    1.20
C    0.05
Name: 0.5, dtype: float64

In [278]:
df.pct_change()

Unnamed: 0,A,B,C
1,,,
2,1.4,3.583333,
3,0.0,0.0,0.0
4,-0.816667,-1.563636,-1.97619


In [279]:
df.corr()

Unnamed: 0,A,B,C
A,1.0,0.970725,1.0
B,0.970725,1.0,1.0
C,1.0,1.0,1.0


In [280]:
df.corrwith(df.B) # B를 기준으로 correlation

A    0.970725
B    1.000000
C    1.000000
dtype: float64

In [281]:
df.cov()

Unnamed: 0,A,B,C
A,1.0192,4.214,8.134
B,4.214,18.49,35.69
C,8.134,35.69,34.445


In [282]:
df['B'].unique()

array([ 1.2,  5.5,  nan, -3.1])

In [283]:
df['A'].value_counts()

0.44    1
2.40    1
1.00    1
Name: A, dtype: int64

### GroupBy 연산

In [285]:
df = pd.DataFrame({'c1':['a','a','b','b','c','d','d'],
                  'c2': ['A','B','B','A','D','C','C'],
                  'c3':np.random.randint(7),
                  'c4': np.random.randint(7)})
df

Unnamed: 0,c1,c2,c3,c4
0,a,A,1,2
1,a,B,1,2
2,b,B,1,2
3,b,A,1,2
4,c,D,1,2
5,d,C,1,2
6,d,C,1,2


In [286]:
df.dtypes

c1    object
c2    object
c3     int64
c4     int64
dtype: object

In [287]:
df['c3'].groupby(df['c1']).mean()

c1
a    1
b    1
c    1
d    1
Name: c3, dtype: int64

In [288]:
df['c4'].groupby(df['c2']).std()

c2
A    0.0
B    0.0
C    0.0
D    NaN
Name: c4, dtype: float64

In [289]:
df['c4'].groupby([df['c1'],df['c2']]).mean()

c1  c2
a   A     2
    B     2
b   A     2
    B     2
c   D     2
d   C     2
Name: c4, dtype: int64

In [291]:
df.groupby('c1').mean()

Unnamed: 0_level_0,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,2
b,1,2
c,1,2
d,1,2


In [290]:
df.groupby(['c1','c2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,A,1,2
a,B,1,2
b,A,1,2
b,B,1,2
c,D,1,2
d,C,1,2


In [292]:
df.groupby(['c1','c2']).size()

c1  c2
a   A     1
    B     1
b   A     1
    B     1
c   D     1
d   C     2
dtype: int64

In [293]:
for c1, group in df.groupby('c1'):
    print(c1)
    print(group)

a
  c1 c2  c3  c4
0  a  A   1   2
1  a  B   1   2
b
  c1 c2  c3  c4
2  b  B   1   2
3  b  A   1   2
c
  c1 c2  c3  c4
4  c  D   1   2
d
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2


In [295]:
df.groupby(['c1','c2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,A,1,2
a,B,1,2
b,A,1,2
b,B,1,2
c,D,1,2
d,C,1,2


In [294]:
for (c1, c2) in df.groupby(['c1','c2']):
    print((c1, c2))
    print(group)

(('a', 'A'),   c1 c2  c3  c4
0  a  A   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2
(('a', 'B'),   c1 c2  c3  c4
1  a  B   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2
(('b', 'A'),   c1 c2  c3  c4
3  b  A   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2
(('b', 'B'),   c1 c2  c3  c4
2  b  B   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2
(('c', 'D'),   c1 c2  c3  c4
4  c  D   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2
(('d', 'C'),   c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2)
  c1 c2  c3  c4
5  d  C   1   2
6  d  C   1   2


In [296]:
df.groupby(['c1','c2'])['c4'].mean()

c1  c2
a   A     2
    B     2
b   A     2
    B     2
c   D     2
d   C     2
Name: c4, dtype: int64

In [297]:
df.groupby('c1')['c3'].quantile()

c1
a    1.0
b    1.0
c    1.0
d    1.0
Name: c3, dtype: float64

In [298]:
df.groupby('c1')['c3'].count()

c1
a    2
b    2
c    1
d    2
Name: c3, dtype: int64

In [299]:
df.groupby('c1')['c3'].median()

c1
a    1
b    1
c    1
d    1
Name: c3, dtype: int64

In [300]:
df.groupby(['c1','c2'])['c4'].agg(['mean','max','min'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,min
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,A,2,2,2
a,B,2,2,2
b,A,2,2,2
b,B,2,2,2
c,D,2,2,2
d,C,2,2,2


In [301]:
df.groupby(['c1','c2'], as_index=False)['c4'].mean()

Unnamed: 0,c1,c2,c4
0,a,A,2
1,a,B,2
2,b,A,2
3,b,B,2
4,c,D,2
5,d,C,2


In [303]:
df.groupby(['c1','c2'], group_keys=False)['c4'].mean()

c1  c2
a   A     2
    B     2
b   A     2
    B     2
c   D     2
d   C     2
Name: c4, dtype: int64

In [305]:
def top(df, n=3, column = 'c1'):
    return df.sort_values(by=column)[-n:]

top(df, n=5)

Unnamed: 0,c1,c2,c3,c4
2,b,B,1,2
3,b,A,1,2
4,c,D,1,2
5,d,C,1,2
6,d,C,1,2


In [306]:
df.groupby('c1').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,0,a,A,1,2
a,1,a,B,1,2
b,2,b,B,1,2
b,3,b,A,1,2
c,4,c,D,1,2
d,5,d,C,1,2
d,6,d,C,1,2


### 피벗 테이블(Pivot Table)


In [307]:
df.pivot_table(['c3','c4'],
              index = ['c1'],
              columns=['c2'])

Unnamed: 0_level_0,c3,c3,c3,c3,c4,c4,c4,c4
c2,A,B,C,D,A,B,C,D
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,1.0,1.0,,,2.0,2.0,,
b,1.0,1.0,,,2.0,2.0,,
c,,,,1.0,,,,2.0
d,,,1.0,,,,2.0,


In [308]:
df.pivot_table(['c3','c4'],
              index = ['c1'],
              columns=['c2'],
              margins=True) # all 이 출력됨 부분합 총계

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1.0,1.0,,,1,2.0,2.0,,,2
b,1.0,1.0,,,1,2.0,2.0,,,2
c,,,,1.0,1,,,,2.0,2
d,,,1.0,,1,,,2.0,,2
All,1.0,1.0,1.0,1.0,1,2.0,2.0,2.0,2.0,2


In [309]:
df.pivot_table(['c3','c4'],
              index = ['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1.0,1.0,,,2,2.0,2.0,,,4
b,1.0,1.0,,,2,2.0,2.0,,,4
c,,,,1.0,1,,,,2.0,2
d,,,2.0,,2,,,4.0,,4
All,2.0,2.0,2.0,1.0,7,4.0,4.0,4.0,2.0,14


In [310]:
df.pivot_table(['c3','c4'],
              index = ['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum,
              fill_value = 0)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1,1,0,0,2,2,2,0,0,4
b,1,1,0,0,2,2,2,0,0,4
c,0,0,0,1,1,0,0,0,2,2
d,0,0,2,0,2,0,0,4,0,4
All,2,2,2,1,7,4,4,4,2,14


In [311]:
pd.crosstab(df.c1 ,df.c2)

c2,A,B,C,D
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,1,0,0
b,1,1,0,0
c,0,0,0,1
d,0,0,2,0


In [312]:
pd.crosstab(df.c1, df.c2, values=df.c3 , aggfunc = sum, margins = True)

c2,A,B,C,D,All
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1.0,1.0,,,2
b,1.0,1.0,,,2
c,,,,1.0,1
d,,,2.0,,2
All,2.0,2.0,2.0,1.0,7


### 범주형(Categorical) 데이터


In [315]:
s = pd.Series(['c1','c2','c1','c2','c1'] * 2)
s

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
dtype: object

In [314]:
s.unique()

array(['c1', 'c2'], dtype=object)

In [316]:
s.value_counts()

c1    6
c2    4
dtype: int64

In [318]:
code = pd.Series([0,1,0,1,0,1]*2)
code

0     0
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     1
10    0
11    1
dtype: int64

In [319]:
d= pd.Series(['c1','c2'])

In [320]:
# 범주형에 특정 코드를 붙여서 계산에 코드를 사용
d.take(code)

0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
0    c1
1    c2
dtype: object

In [324]:
df = pd.DataFrame({'id':np.arange(len(s)),
                   'c':s,
                   'v':np.random.randint(1000,5000, size=len(s))})
df


Unnamed: 0,id,c,v
0,0,c1,2525
1,1,c2,3188
2,2,c1,4118
3,3,c2,4197
4,4,c1,1870
5,5,c1,1132
6,6,c2,3099
7,7,c1,1890
8,8,c2,4546
9,9,c1,4428


In [326]:
c = df['c'].astype('category')
c

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
Name: c, dtype: category
Categories (2, object): ['c1', 'c2']

In [327]:
c.values

['c1', 'c2', 'c1', 'c2', 'c1', 'c1', 'c2', 'c1', 'c2', 'c1']
Categories (2, object): ['c1', 'c2']

In [328]:
c.values.categories

Index(['c1', 'c2'], dtype='object')

In [329]:
c.values.codes

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int8)

In [330]:
df['c'] = c
df.c

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
Name: c, dtype: category
Categories (2, object): ['c1', 'c2']

In [332]:
c = pd.Categorical(['c1','c2','c3','c1','c2'])
c

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [336]:
categories = ['c1','c2','c3']
codes = [0,1,2,0,1]
c = pd.Categorical.from_codes(codes, categories)
c

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [337]:
pd.Categorical.from_codes(codes, categories, ordered=True)

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1' < 'c2' < 'c3']

In [338]:
c.as_ordered()

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1' < 'c2' < 'c3']

In [339]:
c.codes

array([0, 1, 2, 0, 1], dtype=int8)

In [340]:
c.categories

Index(['c1', 'c2', 'c3'], dtype='object')

In [341]:
c = c.set_categories(['c1','c2','c3','c4','c5'])
c.categories

Index(['c1', 'c2', 'c3', 'c4', 'c5'], dtype='object')

In [342]:
c.value_counts()

c1    2
c2    2
c3    1
c4    0
c5    0
dtype: int64

In [344]:
c[c.isin(['c1','c3'])]

['c1', 'c3', 'c1']
Categories (5, object): ['c1', 'c2', 'c3', 'c4', 'c5']

In [345]:
c= c.remove_unused_categories()

['c1', 'c2', 'c3', 'c1', 'c2']
Categories (3, object): ['c1', 'c2', 'c3']

In [346]:
c.categories

Index(['c1', 'c2', 'c3'], dtype='object')

## 문자열 연산

#### 문자열 연산자

#### 기타 연산자


#### 정규표현식


## 시계열 처리

#### 시계열 데이터 구조


### 시계열 기본

### 주기와 오프셋


### 시프트(Shift)

### 시간대 처리

* 국제표준시(Coordinated Universal Time, UTC)를 기준으로 떨어진 거리만큼 오프셋으로 시간대 처리
* 전 세계의 시간대 정보를 모아놓은 올슨 데이터베이스를 활용한 라이브러리인 `pytz` 사용

### 기간과 기간 연산

### 리샘플링(Resampling)

* 리샘플링(Resampling): 시계열의 빈도 변환
* 다운샘플링(Down sampling): 상위 빈도 데이터를 하위 빈도 데이터로 집계
* 업샘플링(Up sampling): 하위 빈도 데이터를 상위 빈도 데이터로 집계

### 무빙 윈도우(Moving Window)

## 데이터 읽기 및 저장


### 텍스트 파일 읽기/쓰기

### 이진 데이터 파일 읽기/쓰기

## 데이터 정제

### 누락값 처리

* 대부분의 실제 데이터들은 정제되지 않고 누락값들이 존재
* 서로 다른 데이터들은 다른 형태의 결측을 가짐
* 결측 데이터는 `null`, `NaN`, `NA`로 표기

#### None: 파이썬 누락 데이터

#### NaN: 누락된 수치 데이터

#### Null 값 처리


### 중복 제거

### 값 치환

## 참고문헌

* Pandas 사이트: https://pandas.pydata.org/
* Jake VanderPlas, "Python Data Science Handbook", O'Reilly
* Wes Mckinney, "Python for Data Analysis", O'Reilly