# 1. Pandas - main classes and structure

In [1]:
%pylab
import pandas as pd
from pandas import Series, DataFrame

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


---
### Series 和 DataFrame  


<div style="width:200px;height:200px;float:left">
![Series 和 DataFrame](../jpgs/MyPicture1.jpg)
</div>  

---
## Series
Series 由一組 索引標籤+數據 組成，就如同Excel中的單一個 column

In [2]:
# Series 由一組 索引標籤+數據 組成
s = Series([4, 7, -5, 3])
s

0    4
1    7
2   -5
3    3
dtype: int64

Series物件有 index 和 values屬性

In [3]:
# series的 index
# dtype是 int64
s.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [4]:
# series的 value
s.values

array([ 4,  7, -5,  3], dtype=int64)

index如同Excel的 row number，但不一定是數字  
和Excel最大的不同: Series 的 Index 可以有重複的值

In [5]:
s.index = pd.Index([0, 'a', 'a', 2])
s

0    4
a    7
a   -5
2    3
dtype: int64

In [6]:
# 一個index可以對應多個rows
s['a']

a    7
a   -5
dtype: int64

#### Series等同是一個 有序 字典####

建構 Series物件的方式，索引不一定是數字

In [7]:
# 指定索引
s = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
s

d    4
b    7
a   -5
c    3
dtype: int64

Series的index是一個 Index物件

In [8]:
# dtype是 object
s.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
# 用索引取值
s['b']

7

In [10]:
# 可以取多個值
s[['b', 'c']]

b    7
c    3
dtype: int64

#### 各種運算之後，還是會保留index####

In [11]:
# 各種運算之後，還是會保留index
# 可以使用 陣列式索引
s[s > 3]

d    4
b    7
dtype: int64

In [12]:
# 廣播運算
s * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [13]:
# Apply a NumPy top-level function element-wise to the Series; the index is kept
np.exp(s)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [14]:
# 可以將Series看成是一個有序字典
'b' in s
# 等同是 'b' in s.index

True

In [15]:
'e' in s

False

In [16]:
# 可以命名 Series物件
s.name = 'test'
s.name

'test'

In [17]:
# 可以用Python字典來創建 Series
dt = {'Ohio' : 35000, 'Texas' : 71000, 'Oregon' : 16000, 'Utah' : 5000}
dt

{'Ohio': 35000, 'Oregon': 16000, 'Texas': 71000, 'Utah': 5000}

#### Series等同是一個有序的字典

In [18]:
s1 = Series(dt)
s1
# Series等同是一個有序的字典

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [19]:
s1.index

Index(['Ohio', 'Oregon', 'Texas', 'Utah'], dtype='object')

#### 用 Index物件來設定 Series物件的 index屬性

In [20]:
# 建構Series的時候指定 index
# 其中 index California 在 dt中找不到，因此對應的value就標示為 NaN
states = ['Utah', 'California', 'Ohio', 'Oregon', 'Texas']
s2 = Series(dt, index = states)
s2

Utah           5000
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

#### pandas的頂級函式都可以對Series物件做廣播運算

In [21]:
# isnull, notnull 可用來檢測 NaN
pd.isnull(s2)

Utah          False
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [22]:
pd.notnull(s2)

Utah           True
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

#### Series物件本身也提供對應的方法（例如 isnull()、notnull()）

In [23]:
# Series 的 isnull(), notnull()
s2.isnull()

Utah          False
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [24]:
s2.notnull()

Utah           True
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

#### Series 最重要的功能之一 是能在算術運算中 自動對齊 不同索引的數據####
依據index自動對齊

In [25]:
s1

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [26]:
s2

Utah           5000
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [27]:
s1 + s2

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah           10000
dtype: float64

In [28]:
s1 * s2

California           NaN
Ohio          1225000000
Oregon         256000000
Texas         5041000000
Utah            25000000
dtype: float64

#### 可以直接修改index中索引的標籤，資料不會受到影響

In [29]:
# index可以隨時修改，會依照順序對應來修改
s2.index = ['Utah', 'New York', 'Ohio', 'Oregon', 'Texas']
s2

Utah         5000
New York      NaN
Ohio        35000
Oregon      16000
Texas       71000
dtype: float64

---
## DataFrame

DataFrame 是一個表格型的數據結構，有一組有序的 columns，每個 column 可以是不同的資料類型。

DataFrame 既有 row 索引，也有 column 索引，
#### DataFrame可以被視為由一個或多個Series所組成的字典，等同是Excel的工作表####
可以由等長的列表或字典 建構 DataFrame，字典頂層的每個item代表一個Series，也就是Excel工作表中的一個 column

由字典建立DataFrame的時候，舊版 pandas 會自動以字母排序 columns（如下方輸出）；pandas 0.23 以後（Python 3.6+）則保留字典的插入順序。若要固定順序，請顯式地以 columns參數指定

In [30]:
# 由等長的列表或字典 建構 DataFrame
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002], 
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


#### 以字典建立DataFrame的時候，可以用columns參數指定 columns名稱與排序

In [31]:
# 可以指定columns的排序
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002], 
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data, columns = ['state', 'year', 'pop'])
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [32]:
# 找不到的column以NaN表示
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002], 
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data,
                  columns = ['state', 'year', 'pop', 'debt'],
                  index = ['one', 'two', 'three', 'four', 'five']
                 )
frame

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,
two,Ohio,2001,1.7,
three,Ohio,2002,3.6,
four,Nevada,2001,2.4,
five,Nevada,2002,2.9,


In [33]:
# DataFrame的 columns 索引，也是一個 Index物件
frame.columns

Index(['state', 'year', 'pop', 'debt'], dtype='object')

#### DataFrame的一個 column 就是一個 Series物件，可以用 column索引來取出，每個column也是DataFrame的一個屬性

In [34]:
# 將DataFrame的一個column取出成為一個Series
s = frame['state']
s

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [35]:
# s 有 name屬性
s.name

'state'

In [36]:
# 和 frame.state是一樣的，是一個Series
frame.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [37]:
frame.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

#### 由上可知，DataFrame可以被視為由一個或多個Series所組成的字典，字典的key是各個column索引的名稱####
column索引 和 row索引，可以用'[]'或者'.'交互參照

In [38]:
# Attribute access on the column Series selects a single element by its row label
frame.state.two

'Ohio'

In [39]:
frame['state'].two

'Ohio'

In [40]:
frame.state['two']

'Ohio'

In [41]:
frame['state']['two']

'Ohio'

#### 對整個Series(column)賦值####
是一種廣播

In [42]:
# 對整個Series賦值
frame.debt = 16.5
frame

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,16.5
two,Ohio,2001,1.7,16.5
three,Ohio,2002,3.6,16.5
four,Nevada,2001,2.4,16.5
five,Nevada,2002,2.9,16.5


In [43]:
# 長度相同的情況下，會做 mapping
frame.debt = np.arange(5.)
frame

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,0
two,Ohio,2001,1.7,1
three,Ohio,2002,3.6,2
four,Nevada,2001,2.4,3
five,Nevada,2002,2.9,4


#### 如果將Series填入DataFrame，會依據index自動對齊

In [44]:
# 使用Series並指定index，並將之填入一個DataFrame, 則DataFrame中空缺的位置都會被填上NaN
s = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame.debt = s
frame

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,
two,Ohio,2001,1.7,-1.2
three,Ohio,2002,3.6,
four,Nevada,2001,2.4,-1.5
five,Nevada,2002,2.9,-1.7


In [45]:
# 為不存在的column賦值會產生一個新的column
frame['eastern'] = (frame.state == 'Ohio')
frame

Unnamed: 0,state,year,pop,debt,eastern
one,Ohio,2000,1.5,,True
two,Ohio,2001,1.7,-1.2,True
three,Ohio,2002,3.6,,True
four,Nevada,2001,2.4,-1.5,False
five,Nevada,2002,2.9,-1.7,False


In [46]:
frame['eastern']

one       True
two       True
three     True
four     False
five     False
Name: eastern, dtype: bool

#### 如果有指定column索引名稱，則該column的名稱就是其索引的名稱
如同Excel的欄位名稱

In [47]:
frame['eastern'].name

'eastern'

#### 可以以雙層的字典，一次性的建立DataFrame

In [48]:
# 以嵌套的字典建立DataFrame，外層的字典作為columns，內層的字典作為rows
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [49]:
frame = DataFrame(pop)
frame

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [50]:
# 也可以進行轉置
frame.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


#### 如果使用雙層字典來建立DataFrame，且指定的 row Index中沒有對準字典中的內層key，則以指定的row Index為準，沒對到的會被標示為NaN

In [51]:
# Inner keys are normally merged and sorted to form the row index;
# when an index is passed explicitly it is used as given instead
DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [52]:
# 可以設置 rows, columns 的名稱
frame.index.name = 'year'
frame.columns.name = 'state'
frame

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [53]:
# DataFrame 的 values屬性 返回一個二維 np.ndarray
v = frame.values
v

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

## 索引(Index)物件

#### 建構Series或者DataFrame的時候，所用到的任何數組或其他序列的標籤都會被轉換為一個 Index物件####

In [54]:
obj = Series(range(3), name = 'd', index = ['a', 'b', 'c'])
obj

a    0
b    1
c    2
Name: d, dtype: int32

In [55]:
index = obj.index
index # 是一個 Index物件

Index(['a', 'b', 'c'], dtype='object')

In [56]:
index[1:]

Index(['b', 'c'], dtype='object')

In [57]:
# Index物件是 immutable，不可以修改
# index[1] = 'b' # 會出錯

In [58]:
# 建立一個Index物件
index = pd.Index(np.arange(3))
index

Int64Index([0, 1, 2], dtype='int64')

In [59]:
# 置換 Series物件的 index
obj.index = index
obj

0    0
1    1
2    2
Name: d, dtype: int32

In [60]:
# 用Index物件來指定Series的index
obj2 = Series([1.5, -2.5, 0], index = index)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [61]:
obj2.index is index

True

In [62]:
# Index 就像一個大小固定的 Set
frame

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [63]:
'Nevada' in frame.columns

True

In [64]:
2002 in frame.index

True

## 基本功能##

### 重新索引###

In [65]:
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

#### reindex()用來移動row或者column的排列

In [66]:
# reindex方法會根據新索引重新排序資料
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [67]:
# 可以指定 空缺資料的填充值 fill_value
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value = 0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [68]:
obj = Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj

0      blue
2    purple
4    yellow
dtype: object

#### reindex()會依照指定的方式重新排列rows或者columns，可以指定若遇空缺時，插入rows或者columns的方式

In [69]:
# The `method` argument selects how gaps are filled while reindexing;
# 'ffill' carries the previous value forward
obj.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [70]:
# 如果只傳入一個序列，則 .reindex()會優先對 row重新索引
frame = DataFrame(np.arange(9).reshape((3, 3)),
                  index = ['a', 'c', 'd'], 
                  columns = ['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [71]:
frame.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


#### 使用reindex()，最好指定是針對row或者column

In [72]:
# 可以以 columns參數重新索引columns
frame.reindex(columns = ['Texas', 'Ohio', 'California'])

Unnamed: 0,Texas,Ohio,California
a,1,0,2
c,4,3,5
d,7,6,8


In [73]:
# 可以以 對 rows, columns都重新索引
frame.reindex(index = ['a', 'b', 'c', 'd'], 
              columns = ['Texas', 'Ohio', 'California'])

Unnamed: 0,Texas,Ohio,California
a,1.0,0.0,2.0
b,,,
c,4.0,3.0,5.0
d,7.0,6.0,8.0


In [74]:
# Forward-fill while reindexing the rows, then reorder the columns.
# `method` is applied to every axis being reindexed and needs that axis to be
# monotonic; the column labels are not sorted, so the two steps are kept separate.
(frame.reindex(index = ['a', 'b', 'c', 'd'], method = 'ffill')
      .reindex(columns = ['Texas', 'Ohio', 'California']))

Unnamed: 0,Texas,Ohio,California
a,1,0,2
b,1,0,2
c,4,3,5
d,7,6,8


#### .ix[ ] 已自 pandas 0.20 起棄用並於 1.0 移除；要以標籤重新索引 rows, columns，請改用 reindex()（允許缺少的標籤）或 .loc[ ]

In [75]:
# Label-based reindexing on both axes. `.ix` was removed in pandas 1.0 and
# `.loc` raises KeyError for the missing row label 'b', so reindex() is used:
# it inserts an all-NaN row for that label.
frame.reindex(index = ['a', 'b', 'c', 'd'],
              columns = ['Texas', 'Ohio', 'California'])

Unnamed: 0,Texas,Ohio,California
a,1.0,0.0,2.0
b,,,
c,4.0,3.0,5.0
d,7.0,6.0,8.0


### 丟棄(drop)指定軸上的項###

In [76]:
# 以一個索引數組指定要刪除的元素
obj = Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: float64

In [77]:
new_obj = obj.drop(['c'])
new_obj

a    0
b    1
d    3
e    4
dtype: float64

In [78]:
new_obj = obj.drop(['c', 'd'])
new_obj

a    0
b    1
e    4
dtype: float64

In [79]:
# 對於 DataFrame，可以刪除任意軸上的索引值
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                 columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [80]:
# 對於 DataFrame，可以刪除任意軸上的索引值
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [81]:
# axis = 0 或省略，可以刪除rows
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [82]:
# axis = 1，可以刪除columns
data.drop(['two', 'four'], axis = 1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


### 索引、選取、過濾###

In [83]:
# Series的索引值不只是整數
obj = Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
obj

a    0
b    1
c    2
d    3
dtype: float64

In [84]:
# 單一值，不顯示索引
obj['b']

1.0

In [85]:
# A single value comes back as a scalar, without its index label.
# The index holds strings, so positional access is spelled out with .iloc
# (an integer key passed to [] falls back to position only as deprecated behaviour).
obj.iloc[1]

1.0

In [86]:
# 多個值，顯示索引
obj[2:4]

c    2
d    3
dtype: float64

In [87]:
# 多個值，顯示索引，依照指定的順序
obj[['b', 'a', 'd']]

b    1
a    0
d    3
dtype: float64

In [88]:
# Several values, returned with their index labels in the requested order.
# Positions are selected explicitly with .iloc because the index holds strings.
obj.iloc[[1, 2, 3]]

b    1
c    2
d    3
dtype: float64

In [89]:
# 多個值，顯示索引
obj[obj < 2]

a    0
b    1
dtype: float64

In [90]:
# Series, DataFrame的切片算，其末端是包含的
obj['b':'d']

b    1
c    2
d    3
dtype: float64

In [91]:
# 賦值的方式也很簡單
obj['b':'c'] = 5
obj

a    0
b    5
c    5
d    3
dtype: float64

In [92]:
# 對於 DataFrame 索引，其實就是獲取一個或多個列
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                 columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [93]:
data[['two', 'four']]

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [94]:
data[['four', 'two']]

Unnamed: 0,four,two
Ohio,3,1
Colorado,7,5
Utah,11,9
New York,15,13


In [95]:
# Select columns by position; data[[1, 3]] would look up the *labels* 1 and 3,
# which do not exist in these string-labelled columns
data.iloc[:, [1, 3]]

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [96]:
# 這是row方向的切片
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [97]:
# Boolean indexing: keep the rows whose 'three' value is greater than 5
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [98]:
# 透過 boolean型態的 DataFrame 進行索引
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [99]:
# 透過 boolean型態的 DataFrame 進行索引
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [100]:
data[data < 5]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,,,
Utah,,,,
New York,,,,


#### ix 已被棄用（pandas 1.0 起移除）：以標籤索引請用 .loc，以位置索引請用 .iloc####

In [101]:
# Label-based selection of one row and two columns (.loc replaces the removed .ix)
data.loc['Colorado', ['two', 'four']]

two     5
four    7
Name: Colorado, dtype: int32

In [102]:
# Select on both axes in the requested order: rows by label, columns by position.
# .loc cannot mix labels and positions, so the positions are first turned into labels.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [103]:
# Positional row access: the row at position 2 along axis 0, returned as a Series
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [104]:
# Label slices are inclusive: every row up to and including 'Utah', column 'two'
data.loc[: 'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [105]:
data.three > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [106]:
# Index both axes at once: rows by boolean mask, and the first three columns
# (positions converted to labels so that .loc can be used)
data.loc[data.three > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


### 算數運算和數據對齊###

pandas最重要的一個功能是: 可以對不同索引的對象進行算術運算。

在將對象相加時，若存在不同的索引，則結果的索引就是該索引對的聯集。

In [107]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])

In [108]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [109]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [110]:
# 兩個Series的索引會自動對齊，空缺的值填入NaN
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [111]:
# 對於 DataFrame，索引自動對齊會發生在row 和 column方向
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
                index = ['Ohio', 'Texas', 'Colorado'], 
                columns = list('bcd'))
df2 = DataFrame(np.arange(12.).reshape((4, 3)),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'], 
                columns = list('bde'))

In [112]:
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [113]:
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [114]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### 在算術方法中填充值###

In [115]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)),
                columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)),
                columns = list('abcde'))

In [116]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [117]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [118]:
# 以指定的預設值取代 NaN作為自動填充值
# 但是兩個DataFrame都沒有的元素位置，還是會被填入NaN
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0,2,4,6,4
1,9,11,13,15,9
2,18,20,22,24,14
3,15,16,17,18,19


In [119]:
# 重新索引的時候，也可以指定填充值
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


### DataFrame 和 Series中間的運算###

In [120]:
arr = np.arange(12.).reshape((3, 4))

In [121]:
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [122]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [123]:
# 算術運算 會進行 廣播
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [124]:
# DataFrame 和 Series 之間也是如此
df = DataFrame(np.arange(12.).reshape((4, 3)),
               columns = list("bde"), 
               index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
df

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [125]:
# Take the first row by position as a Series (.iloc replaces the removed .ix)
s = df.iloc[0]
s

b    0
d    1
e    2
Name: Utah, dtype: float64

In [126]:
# This is broadcast as well:
# the Series index is matched against the DataFrame's columns, and the operation
# is then repeated down the rows (axis 0)
df + s

Unnamed: 0,b,d,e
Utah,0,2,4
Ohio,3,5,7
Texas,6,8,10
Oregon,9,11,13


In [127]:
# 如果索引不同，則索引會聯集之後自動對齊
df

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [128]:
s2 = df['d']
s2

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: float64

In [129]:
# 如果希望索引自動匹配且在row方向上廣播，則必須用算術運算方法
# 傳入的軸就是希望匹配的軸
df.add(s2, axis = 0)

Unnamed: 0,b,d,e
Utah,1,2,3
Ohio,7,8,9
Texas,13,14,15
Oregon,19,20,21


### 函數應用和映射###

NumPy的 ufuncs (元素級數組方法) 也可用於操作pandas物件

In [130]:
frame = DataFrame(np.random.randn(4, 3),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.600378,0.612283,-1.317069
Ohio,-0.298505,0.352629,0.037499
Texas,0.963969,1.495626,-1.168511
Oregon,-0.188566,-0.004913,-0.114383


In [131]:
# NumPy的 ufuncs (元素級數組方法) 也可用於操作pandas物件
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.600378,0.612283,1.317069
Ohio,0.298505,0.352629,0.037499
Texas,0.963969,1.495626,1.168511
Oregon,0.188566,0.004913,0.114383


In [132]:
# DataFrame.apply hands each column to the function as a Series;
# here f returns the spread (max - min) of whatever it receives
f = lambda x: x.max() - x.min()
frame.apply(f) # by default the function is applied to each column

b    2.564347
d    1.500539
e    1.354568
dtype: float64

In [133]:
# 沿著軸1
frame.apply(f, axis = 1)

Utah      2.212660
Ohio      0.651134
Texas     2.664138
Oregon    0.183653
dtype: float64

In [134]:
frame

Unnamed: 0,b,d,e
Utah,-1.600378,0.612283,-1.317069
Ohio,-0.298505,0.352629,0.037499
Texas,0.963969,1.495626,-1.168511
Oregon,-0.188566,-0.004913,-0.114383


In [135]:
# 返回 由多個值組成的Series
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return Series(extremes, index = labels)
frame.apply(f) # 會對每個column操作 f，每個column傳回一個Series，再重新組合成DataFrame

Unnamed: 0,b,d,e
min,-1.600378,-0.004913,-1.317069
max,0.963969,1.495626,0.037499


In [136]:
# DataFrame 也可以透過 applymap(), 使用Python元素級的函式
f = lambda x: "{0:.3f}".format(x)
frame.applymap(f)

Unnamed: 0,b,d,e
Utah,-1.6,0.612,-1.317
Ohio,-0.299,0.353,0.037
Texas,0.964,1.496,-1.169
Oregon,-0.189,-0.005,-0.114


In [137]:
# Series 也可以透過 map(), 使用Python元素級的函式
f = lambda x: "{0:.3f}".format(x)
f2 = frame['b']
f2.map(f)

Utah      -1.600
Ohio      -0.299
Texas      0.964
Oregon    -0.189
Name: b, dtype: object

### 排序和排名###

#### 排序####

In [138]:
# 可以使用 sort_index方法 來對軸索引排序
obj = Series(range(4), index = list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int32

In [139]:
# 可以使用 sort_index方法 來對軸索引排序
# 是針對索引來排序，而不是針對資料值
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [140]:
# DataFrame 也可以使用 sort_index 並指定軸來排序索引
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index = ['three', 'one'],
                  columns = list('dabc')
                 )
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [141]:
frame.sort_index(axis = 0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [142]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [143]:
# 可以串接
frame.sort_index(axis = 0).sort_index(axis = 1)

Unnamed: 0,a,b,c,d
one,5,6,7,4
three,1,2,3,0


In [144]:
# Sort the column labels in descending order
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [145]:
# 若要以值來排序，可以使用 sort_values()方法
obj['b'] = 4
print(obj)
obj.sort_values()

d    0
a    1
b    4
c    3
dtype: int32


d    0
a    1
c    3
b    4
dtype: int32

In [146]:
# 若以sort_values()方法排序，空缺的值會被排到最後面
obj['a'] = None
print(obj)
obj.sort_values()

d     0
a   NaN
b     4
c     3
dtype: float64


d     0
c     3
b     4
a   NaN
dtype: float64

In [147]:
# 要根據一個或多個column中的值來排序，可以使用 sort_values()
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [148]:
# 使用 sort_values() 根據值來排序
frame.sort_values('b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [149]:
# 使用 sort_values() 根據多個columns的值來排序
frame.sort_values(['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


## 彙總和計算描述統計##

In [150]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5], 
               [np.nan, np.nan], [0.75, -1.3]], 
               index = list('abcd'), 
               columns = ['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [151]:
# sum()傳回一個Series
df.sum()

one    9.25
two   -5.80
dtype: float64

In [152]:
# 指定軸向做 sum()
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [153]:
# NaN會被自動排除(當作0)，可以使用skipna參數改變
df.sum(axis = 1, skipna = False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [154]:
# idxmin, idxmax 傳回間接統計，最大值或最小值的索引
df.idxmin()

one    d
two    b
dtype: object

In [155]:
df.idxmax()

one    b
two    d
dtype: object

In [156]:
# 累積加總 cumsum()
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [157]:
# describe 可以一次性產生多種統計數字
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [158]:
# describe 對非數字資料，產生另外一種統計數字
obj = Series(list('aabc') * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [159]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## 相關係數與協方差##

In [160]:
# Summary statistics computed from pairs of arguments (e.g. correlation and covariance)
# NOTE: the warning printed by this cell says pandas.io.data has moved to the separate
# pandas-datareader package and will be removed from pandas; once that package is
# installed, replace the import below with the commented-out one.
import pandas.io.data as web
# from pandas_datareader import data, wb

# Downloaded Yahoo data, keyed by ticker symbol
all_data = {}

for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2013', '1/1/2015')
    
# One column per ticker, taken from each download's 'Adj Close' / 'Volume' column
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


In [161]:
# 百分比變化
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-12-24,-0.004709,-0.00343,-0.002589,-0.006398
2014-12-26,0.017677,0.009948,0.003213,-0.005401
2014-12-29,-0.000702,-0.006928,-0.011273,-0.008981
2014-12-30,-0.012203,0.00017,-0.002866,-0.009062
2014-12-31,-0.019019,-0.007579,0.002437,-0.012122


In [162]:
# corr() 用來計算相關係數
returns.MSFT.corr(returns.IBM)

0.2619999189916245

In [163]:
# cov()用來計算協方差
returns.MSFT.cov(returns.IBM)

4.0951211527301678e-05

In [164]:
# DataFrame的 corr(), cov() 將會返回相同形狀的矩陣
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.168597,0.140436,0.135613
GOOG,0.168597,1.0,0.275024,0.327994
IBM,0.140436,0.275024,1.0,0.262
MSFT,0.135613,0.327994,0.262,1.0


In [165]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000253,3.6e-05,2.5e-05,3e-05
GOOG,3.6e-05,0.000185,4.2e-05,6.2e-05
IBM,2.5e-05,4.2e-05,0.000127,4.1e-05
MSFT,3e-05,6.2e-05,4.1e-05,0.000193


In [166]:
# 利用DataFrame的 corrwith()方法，可以計算列或行 跟另外一個Series或DataFrame之間的相關係數
returns.corrwith(returns.IBM)

AAPL    0.140436
GOOG    0.275024
IBM     1.000000
MSFT    0.262000
dtype: float64

In [167]:
returns.corrwith(volume)

AAPL   -0.134204
GOOG    0.231548
IBM    -0.343626
MSFT   -0.025271
dtype: float64

### 唯一值、值計數與成員資格###

In [168]:
obj = Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [169]:
# unique() returns the distinct values
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [170]:
# 排序之後的唯一值
np.sort(obj.unique())

array(['a', 'b', 'c', 'd'], dtype=object)

In [171]:
# value_counts() returns how often each value occurs, sorted by count in descending order
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [172]:
# value_counts is also available as a top-level function; the result is sorted by count in descending order
pd.value_counts(obj)

c    3
a    3
b    2
d    1
dtype: int64

In [173]:
# 可以使用 sort 參數 禁止排序
pd.value_counts(obj, sort = False)

a    3
b    2
d    1
c    3
dtype: int64

In [174]:
# 用 isin() 判斷成員資格
mask = obj.isin(['b', 'c'])

In [175]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

### 處理缺失數據(missing data)###

In [176]:
# 以NaN標示缺失數據
s = Series(['aardradf', 'asdfasfas', np.nan, 'asdfasfasf'])
s

0      aardradf
1     asdfasfas
2           NaN
3    asdfasfasf
dtype: object

In [177]:
# 用 isnull()來檢驗NaN
s.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [178]:
# None等同於 NaN
s[0] = None
s.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 濾除缺失數據###

In [179]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [180]:
# 用 dropna()捨棄 NA，index 並不會重新設定
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [181]:
# 也可透過 boolean型索引過濾
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [182]:
# 對 DataFrame來說，dropna()預設捨棄任何有NA的row
df = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA],[NA, 6.5, 3.]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [183]:
# dropna()預設捨棄任何有NA的row
df.dropna()

Unnamed: 0,0,1,2
0,1,6.5,3


In [184]:
# 若傳入 how='all'，則只捨棄 所有數值皆為NA的那個row
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [185]:
# 要用這種方式捨棄column，則需傳入 axis=1即可
df[3] = NA
df

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [186]:
# 傳入 axis=1，捨棄整列為NA的column
df.dropna(axis = 1, how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [187]:
# 使用 thresh 參數，只留下一部分觀測數據
df = DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.603192,-0.538438,-0.716263
1,1.52322,1.383066,1.105979
2,0.900062,1.309426,0.394871
3,-0.337391,-2.064032,0.114132
4,0.141388,-0.821408,0.765602
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


In [188]:
# Blank out part of columns 1 and 2. The index is integer-labelled, so the old
# `.ix` slices were label-based and inclusive; `.loc` keeps exactly that meaning.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.603192,,
1,1.52322,,
2,0.900062,,
3,-0.337391,,0.114132
4,0.141388,,0.765602
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


In [189]:
# 用 thresh 參數
df.dropna(thresh = 3)

Unnamed: 0,0,1,2
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


### 填充缺失數據###

In [190]:
# 使用 fillna()來填充缺失數據
df.fillna(0)

Unnamed: 0,0,1,2
0,0.603192,0.0,0.0
1,1.52322,0.0,0.0
2,0.900062,0.0,0.0
3,-0.337391,0.0,0.114132
4,0.141388,0.0,0.765602
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


In [191]:
# 依據字典，對不同的column填充不同的值
df.fillna({1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.603192,0.5,-1.0
1,1.52322,0.5,-1.0
2,0.900062,0.5,-1.0
3,-0.337391,0.5,0.114132
4,0.141388,0.5,0.765602
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


In [192]:
# fillna()預設傳回副本，但也可以用 inplace 參數來就地修改
df.fillna(0, inplace = True)
df

Unnamed: 0,0,1,2
0,0.603192,0.0,0.0
1,1.52322,0.0,0.0
2,0.900062,0.0,0.0
3,-0.337391,0.0,0.114132
4,0.141388,0.0,0.765602
5,0.046289,0.074875,-1.224645
6,0.927628,-1.059172,-1.125331


In [193]:
# Build a frame with gaps at the bottom of columns 1 and 2 to demonstrate
# the fill methods (ffill, bfill)
df = DataFrame(np.random.randn(6, 3))
# .loc slices by label and includes both end points (`.ix` was removed in pandas 1.0)
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.629602,0.938939,0.204447
1,-0.615635,-0.277439,-2.101218
2,-0.384307,,1.639589
3,1.01173,,0.261025
4,-0.636945,,
5,-0.126693,,


In [194]:
# Forward fill: each gap takes the last valid value above it.
# ffill() replaces the deprecated fillna(method = 'ffill') spelling.
df.ffill()

Unnamed: 0,0,1,2
0,0.629602,0.938939,0.204447
1,-0.615635,-0.277439,-2.101218
2,-0.384307,-0.277439,1.639589
3,1.01173,-0.277439,0.261025
4,-0.636945,-0.277439,0.261025
5,-0.126693,-0.277439,0.261025


In [195]:
# Limit how many consecutive gaps are forward-filled
# (ffill() replaces the deprecated fillna(method = 'ffill') spelling)
df.ffill(limit = 2)

Unnamed: 0,0,1,2
0,0.629602,0.938939,0.204447
1,-0.615635,-0.277439,-2.101218
2,-0.384307,-0.277439,1.639589
3,1.01173,-0.277439,0.261025
4,-0.636945,,0.261025
5,-0.126693,,0.261025


In [196]:
# 用 mean 作為插入值
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.629602,0.938939,0.204447
1,-0.615635,-0.277439,-2.101218
2,-0.384307,0.33075,1.639589
3,1.01173,0.33075,0.261025
4,-0.636945,0.33075,0.000961
5,-0.126693,0.33075,0.000961


## 層次化索引(hierarchical indexing)##

In [197]:
# 使用 MultiIndex 索引的Series的格式化輸出形式
# 可以用一維的方式來表達二維的資料，以低維度的形式來處理高維度的資料
s = Series(np.random.randn(10),
           index = [list('aaabbbccdd'),  [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
s

a  1   -1.146266
   2    1.476380
   3   -1.697642
b  1    0.481533
   2   -0.720553
   3    0.008540
c  1    0.421649
   2    1.669382
d  2    0.391926
   3    0.742371
dtype: float64

In [198]:
s.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [199]:
# 選取數據 子集合
s['b']

1    0.481533
2   -0.720553
3    0.008540
dtype: float64

In [200]:
s['b':'c']

b  1    0.481533
   2   -0.720553
   3    0.008540
c  1    0.421649
   2    1.669382
dtype: float64

In [201]:
# Select several outer-level labels at once (.loc replaces the removed .ix)
s.loc[['b', 'c']]

b  1    0.481533
   2   -0.720553
   3    0.008540
c  1    0.421649
   2    1.669382
dtype: float64

In [202]:
# 選取 內層 的數據
s[:, 2]

a    1.476380
b   -0.720553
c    1.669382
d    0.391926
dtype: float64

In [203]:
# 數據可以透過 unstack 方法被重新安排到一個 DataFrame中
s.unstack()

Unnamed: 0,1,2,3
a,-1.146266,1.47638,-1.697642
b,0.481533,-0.720553,0.00854
c,0.421649,1.669382,
d,,0.391926,0.742371


In [204]:
# unstack 的逆運算是 stack
s.unstack().stack()

a  1   -1.146266
   2    1.476380
   3   -1.697642
b  1    0.481533
   2   -0.720553
   3    0.008540
c  1    0.421649
   2    1.669382
d  2    0.391926
   3    0.742371
dtype: float64

In [205]:
# 對於一個 DataFrame，每條軸都可以有分層索引
df = DataFrame(np.arange(12).reshape((4, 3)), 
               index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
               columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [206]:
# 每層的索引都可以有名字
df.index.names = ['key1', 'key2']
df.columns.names = ['state', 'color']
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [207]:
# 可以藉由索引來選取列分組
df['Colorado']

Unnamed: 0_level_0,color,Green
key1,key2,Unnamed: 2_level_1
a,1,2
a,2,5
b,1,8
b,2,11


In [208]:
# 可以先建構好 MultiIndex 物件，再用來創建 DataFrame物件
mi = pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']], names = ['state', 'color'])
mi

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

### 重排分級順序###

In [209]:
# 用 swaplevel 互換級別
df.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [210]:
# Sort the rows by the values of a single index level.
# sort_index(level = ...) replaces the removed sortlevel() method.
df.swaplevel('key1', 'key2').sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [211]:
# Same, but ordered by the second index level (sort_index replaces the removed sortlevel())
df.swaplevel('key1', 'key2').sort_index(level = 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


### 根據級別彙總統計###

In [212]:
# Aggregate per value of one row-index level.
# groupby(level = ...) replaces the `level` keyword that was removed from sum().
df.groupby(level = 'key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [213]:
# Aggregate per value of a column-index level: transpose so that the column
# levels become row levels, group on 'color', then transpose back.
# (Replaces sum(axis = 1, level = ...), whose `level` keyword was removed.)
df.T.groupby(level = 'color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 使用DataFrame的列###

In [214]:
# 將 DataFrame的一個或多個列當作行索引來用，或者希望將行索引變成DataFrame的列
df = DataFrame({'a': range(7), 'b': range(7, 0, -1), 
                'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'], 
                'd': [0, 1, 2, 0, 1, 2, 3]})
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [215]:
# set_index() 會將其一個或多個columns轉換為 row索引，並創建一個 DataFrame
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [216]:
# 預設情況下，這些columns會被移除，但也可以設定 drop參數將之保留下來
df2 = df.set_index(['c', 'd'], drop = False)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [217]:
# reset_index() 會將 row方向上的多層次索引 移動到 column上
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [218]:
# reset_index() 會將 row方向上的多層次索引 移動到 column上
df2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
