In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ImportError: No module named 'matplotlib'

## Object Creation

### `Series`

In [2]:
# One-dimensional labelled array; the np.nan entry marks a missing value,
# which makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

> `np.nan`: Not a Number. 결측치(missing value)를 나타낼 때 주로 쓰임

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### `DataFrame`

In [7]:
# Arguments are start and end; an optional frequency (step) may follow.
# Author's note: this call deliberately differs from the source tutorial page,
# whose code the author flagged as erroneous.
dates = pd.date_range(start='20130101', end='20130106')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# 6x4 frame of standard-normal samples, rows labelled by `dates`, columns A-D.
# The samples come from a privately seeded RandomState so that the frame is
# identical on every Restart & Run All, without touching the global NumPy RNG.
# NOTE: the outputs recorded in this notebook were produced by an earlier,
# unseeded run, so the numbers shown will differ until the notebook is re-run.
df = pd.DataFrame(np.random.RandomState(0).randn(6, 4), index=dates, columns=list('ABCD'))

In [9]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.732504,-0.47773,1.125425,1.110776
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-04,-0.842252,-2.124258,0.600832,1.834915
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-06,-1.0172,1.282888,-1.91324,1.366801


In [10]:
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.732504,-0.47773,1.125425,1.110776
2013-01-02,1.114156,-1.569409,0.643545,-0.482571


In [11]:
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-06,-1.0172,1.282888,-1.91324,1.366801


In [12]:
# Build a frame from a dict of column specs with mixed types. The scalar
# entries ('A', 'B', 'F') are broadcast over the four rows defined by the
# indexed Series in 'C'.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [13]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [17]:
pd.DataFrame.__dict__

mappingproxy({'T': <property at 0x7fe1fc3c60e8>,
              '_AXIS_ALIASES': {'rows': 0},
              '_AXIS_IALIASES': {0: 'rows'},
              '_AXIS_LEN': 2,
              '_AXIS_NAMES': {0: 'index', 1: 'columns'},
              '_AXIS_NUMBERS': {'columns': 1, 'index': 0},
              '_AXIS_ORDERS': ['index', 'columns'],
              '_AXIS_REVERSED': True,
              '_AXIS_SLICEMAP': None,
              '__add__': <function pandas.core.ops._arith_method_FRAME.<locals>.f>,
              '__and__': <function pandas.core.ops._arith_method_FRAME.<locals>.f>,
              '__div__': <function pandas.core.ops._arith_method_FRAME.<locals>.f>,
              '__doc__': " Two-dimensional size-mutable, potentially heterogeneous tabular data\n    structure with labeled axes (rows and columns). Arithmetic operations\n    align on both row and column labels. Can be thought of as a dict-like\n    container for Series objects. The primary pandas data structure\n\n    Parameters\n  

### 보기 / 단순 변환 

#### `head()`, `tail()`

In [27]:
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.732504,-0.47773,1.125425,1.110776
2013-01-02,1.114156,-1.569409,0.643545,-0.482571


In [11]:
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-06,-1.0172,1.282888,-1.91324,1.366801


#### `index`, `columns`, `values`

In [19]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [20]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [21]:
df.values

array([[-0.73250363, -0.47773003,  1.1254247 ,  1.11077616],
       [ 1.11415596, -1.56940912,  0.64354483, -0.48257119],
       [ 0.13491946, -0.03563399,  0.51709146, -0.79111395],
       [-0.84225244, -2.12425804,  0.60083234,  1.83491503],
       [ 0.34384774,  0.49647541, -1.02125428, -0.35040336],
       [-1.01720036,  1.28288814, -1.91324029,  1.366801  ]])

#### `describe()`

- column 별 데이터 요약정보(기술통계) 제공

In [22]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.166506,-0.404611,-0.007934,0.448067
std,0.835713,1.273973,1.184245,1.11766
min,-1.0172,-2.124258,-1.91324,-0.791114
25%,-0.814815,-1.296489,-0.636668,-0.449529
50%,-0.298792,-0.256682,0.558962,0.380186
75%,0.291616,0.363448,0.632867,1.302795
max,1.114156,1.282888,1.125425,1.834915


#### `T` : 행/열 바꾸기

- transpose 명령

In [23]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.732504,1.114156,0.134919,-0.842252,0.343848,-1.0172
B,-0.47773,-1.569409,-0.035634,-2.124258,0.496475,1.282888
C,1.125425,0.643545,0.517091,0.600832,-1.021254,-1.91324
D,1.110776,-0.482571,-0.791114,1.834915,-0.350403,1.366801


####  `sort_index()`

- 행/열 인덱스명(이름)에 따라 정렬

In [24]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.0172,1.282888,-1.91324,1.366801
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-04,-0.842252,-2.124258,0.600832,1.834915
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-01,-0.732504,-0.47773,1.125425,1.110776


In [24]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.0172,1.282888,-1.91324,1.366801
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-04,-0.842252,-2.124258,0.600832,1.834915
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-01,-0.732504,-0.47773,1.125425,1.110776


In [25]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.110776,1.125425,-0.47773,-0.732504
2013-01-02,-0.482571,0.643545,-1.569409,1.114156
2013-01-03,-0.791114,0.517091,-0.035634,0.134919
2013-01-04,1.834915,0.600832,-2.124258,-0.842252
2013-01-05,-0.350403,-1.021254,0.496475,0.343848
2013-01-06,1.366801,-1.91324,1.282888,-1.0172


####  `sort_values()`

- column 값에 따라 정렬

In [26]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-0.842252,-2.124258,0.600832,1.834915
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-01,-0.732504,-0.47773,1.125425,1.110776
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-05,0.343848,0.496475,-1.021254,-0.350403
2013-01-06,-1.0172,1.282888,-1.91324,1.366801


## Selection

> 실제 최적화된 Access 방식: `.at`, `.iat`, `.loc`, `.iloc`, `.ix` (단, `.ix`는 pandas 0.20에서 deprecated 되었고 1.0에서 제거되었으므로 `.loc` / `.iloc` 사용을 권장)

### Getting

#### 열(Column) 하나 선택

In [28]:
df['A']

2013-01-01   -0.732504
2013-01-02    1.114156
2013-01-03    0.134919
2013-01-04   -0.842252
2013-01-05    0.343848
2013-01-06   -1.017200
Freq: D, Name: A, dtype: float64

#### 행(Rows) 단위 Slicing

In [29]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.732504,-0.47773,1.125425,1.110776
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-03,0.134919,-0.035634,0.517091,-0.791114


In [30]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-04,-0.842252,-2.124258,0.600832,1.834915


### Label 활용 Selection

#### getting a cross section using a label

In [None]:
df.loc[dates[0]]

#### Selecting on a multi-axis by label

> 행 전체 선택을 위해 `:` 사용

In [32]:
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.732504,-0.47773
2013-01-02,1.114156,-1.569409
2013-01-03,0.134919,-0.035634
2013-01-04,-0.842252,-2.124258
2013-01-05,0.343848,0.496475
2013-01-06,-1.0172,1.282888


#### label slicing, both endpoints are included

In [33]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,1.114156,-1.569409
2013-01-03,0.134919,-0.035634
2013-01-04,-0.842252,-2.124258


#### 행/열중 값 하나만 선택

- Reduction in the dimensions of the returned object

In [34]:
df.loc['20130102',['A','B']]

A    1.114156
B   -1.569409
Name: 2013-01-02 00:00:00, dtype: float64

#### 행/열 모두 값 하나만 선택

- 스칼라 값 반환

In [35]:
df.loc[dates[0],'A']

-0.73250362704049998

##### `.at` 

- 위와 동일하나, 더 빠른 값 Access 

In [36]:
df.at[dates[0],'A']

-0.73250362704049998

### 위치 활용 선택

- 더 자세한 방법: https://pandas.pydata.org/docs/user_guide/indexing.html#selection-by-position


#### `.iloc[n]`

- 넘어온 정수값 위치 Row의 모든 Column 값

In [37]:
df.iloc[3]

A   -0.842252
B   -2.124258
C    0.600832
D    1.834915
Name: 2013-01-04 00:00:00, dtype: float64

#### `.iloc[n1:n2]`

- 이렇게 integer slice 활용하면, numpy와 비슷하게 slicing

In [38]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.842252,-2.124258
2013-01-05,0.343848,0.496475


#### `.iloc[[r0, r1, ...], [c0, c1, ...]]`

- 마찬가지로 `numpy`와 같이 위치값 list로 slicing 가능

In [39]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.114156,0.643545
2013-01-03,0.134919,0.517091
2013-01-05,0.343848,-1.021254


In [40]:
df.iloc[[4,2,1],[2,0]] # 이렇게 순서를 바꾼다면?

Unnamed: 0,C,A
2013-01-05,-1.021254,0.343848
2013-01-03,0.517091,0.134919
2013-01-02,0.643545,1.114156


#### `.iloc[r1:r2, :]`, `.iloc[:, c1:c2]`

- row / column 단위 slicing

In [41]:
 df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-03,0.134919,-0.035634,0.517091,-0.791114


In [42]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.47773,1.125425
2013-01-02,-1.569409,0.643545
2013-01-03,-0.035634,0.517091
2013-01-04,-2.124258,0.600832
2013-01-05,0.496475,-1.021254
2013-01-06,1.282888,-1.91324


#### `.iloc[r,c]`

- 해당 위치의 scalar 값

In [43]:
df.iloc[1,1]

-1.5694091224817042

#### `.iat[r,c]`

- 위와 동일 (fast access)

In [44]:
 df.iat[1,1]

-1.5694091224817042

### Boolean Indexing

- Boolean 값을 통한 indexing, 즉 특정 조건을 가진 값만 얻는 filtering 가능

#### Row 단위 Boolean Indexing

- 아래와 같이 모든 row의 지정 column 값에 대한 Boolean 연산 결과 `Series`를 얻을 수 있음

In [45]:
df.A > 0

2013-01-01    False
2013-01-02     True
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: A, dtype: bool

- 위의 Boolean `Series` 값을 활용하면 해당 조건을 만족하는 모든 Row를 얻을 수 있음

In [47]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,1.114156,-1.569409,0.643545,-0.482571
2013-01-03,0.134919,-0.035634,0.517091,-0.791114
2013-01-05,0.343848,0.496475,-1.021254,-0.350403


#### 전체 값 대상 Boolean Indexing

In [48]:
df > 0

Unnamed: 0,A,B,C,D
2013-01-01,False,False,True,True
2013-01-02,True,False,True,False
2013-01-03,True,False,True,False
2013-01-04,False,False,True,True
2013-01-05,True,True,False,False
2013-01-06,False,True,False,True


In [49]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,1.125425,1.110776
2013-01-02,1.114156,,0.643545,
2013-01-03,0.134919,,0.517091,
2013-01-04,,,0.600832,1.834915
2013-01-05,0.343848,0.496475,,
2013-01-06,,1.282888,,1.366801


#### `.isin(value_list)`

- 각 값이 주어진 `value_list`에 포함되는지에 대한 Boolean 결과

In [50]:
# Derive a separate frame carrying an extra string column 'E'. assign()
# returns a new object, so `df` itself is left unmodified.
# NOTE(review): this rebinds `df2`, which earlier in the notebook held the
# dict-built frame.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.732504,-0.47773,1.125425,1.110776,one
2013-01-02,1.114156,-1.569409,0.643545,-0.482571,one
2013-01-03,0.134919,-0.035634,0.517091,-0.791114,two
2013-01-04,-0.842252,-2.124258,0.600832,1.834915,three
2013-01-05,0.343848,0.496475,-1.021254,-0.350403,four
2013-01-06,-1.0172,1.282888,-1.91324,1.366801,three


In [51]:
df2['E'].isin(['two','four'])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

- 위 결과를 활용한 필터링 가능

In [52]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.134919,-0.035634,0.517091,-0.791114,two
2013-01-05,0.343848,0.496475,-1.021254,-0.350403,four


### Setting

In [53]:
# Integers 1..6 labelled with daily timestamps from 2013-01-02 through
# 2013-01-07 (both endpoints included).
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start='20130102', end='20130107'),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [55]:
# Attach the Series s1 as a new column labelled 'F' on a copy of df.
# Values are aligned on the index, so dates absent from s1 end up as NaN.
df3 = df.assign(F=s1)
df3

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.732504,-0.47773,1.125425,1.110776,
2013-01-02,1.114156,-1.569409,0.643545,-0.482571,1.0
2013-01-03,0.134919,-0.035634,0.517091,-0.791114,2.0
2013-01-04,-0.842252,-2.124258,0.600832,1.834915,3.0
2013-01-05,0.343848,0.496475,-1.021254,-0.350403,4.0
2013-01-06,-1.0172,1.282888,-1.91324,1.366801,5.0


## Operations

## 시계열

## 범주 데이터 (Category, Factor)

## I/O

## Chart