# 範例目標:

Pandas資料的索引、操作、選擇、過濾、合併與排序。

# 範例重點:

1. 資料過濾與操作資料不同，**過濾出來的資料將是新資料集，不會動到原本的資料。**
2. 合併資料時合併欄位(key)可多個欄位，遇到相同欄位名稱時merge會自動產生字尾(`_x`、`_y`)，join則不會自動產生，必須自行指定`lsuffix`/`rsuffix`，否則會報錯。


# [教學目標]

* 正確使用欄位名稱與索引選取資料
* 正確使用 location 座標選取資料
* 正確使用 遮罩操作 選取資料
  - 利用欄位名稱選取單行資料
  - 利用欄位名稱選取多行資料
  - 利用列索引位置選取單列/多列資料
  - 用 loc, iloc 取得行與列（ix 已在 pandas 1.0 移除）
  - 用 iat, at 取得資料
  - 根據條件篩選資料（遮罩）


In [26]:
import pandas as pd

一種index

In [27]:
# Read only the columns this walkthrough needs, then use 'key' as the row
# index (single-level index).
columns_to_load = ['CRIM', 'ZN', 'key', 'INDUS']
boston_data = pd.read_csv('boston.csv', usecols=columns_to_load)

# set_index returns a new DataFrame; boston_data keeps 'key' as a column.
boston_data_index = boston_data.set_index('key')
boston_data_index

Unnamed: 0_level_0,CRIM,ZN,INDUS
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.00632,18.0,2.31
1,0.02731,0.0,7.07
2,0.02729,0.0,7.07
3,0.03237,0.0,2.18
4,0.06905,0.0,2.18
...,...,...,...
501,0.06263,0.0,11.93
502,0.04527,0.0,11.93
503,0.06076,0.0,11.93
504,0.10959,0.0,11.93


In [28]:
# Inspect the single-level index that set_index('key') produced.
boston_data_index.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            496, 497, 498, 499, 500, 501, 502, 503, 504, 505],
           dtype='int64', name='key', length=506)

兩種index

In [29]:
# Passing a list of columns to set_index builds a two-level MultiIndex
# (key, INDUS); the remaining columns stay as data.
boston_data_index2 = boston_data.set_index(['key','INDUS'])
boston_data_index2

Unnamed: 0_level_0,Unnamed: 1_level_0,CRIM,ZN
key,INDUS,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.31,0.00632,18.0
1,7.07,0.02731,0.0
2,7.07,0.02729,0.0
3,2.18,0.03237,0.0
4,2.18,0.06905,0.0
...,...,...,...
501,11.93,0.06263,0.0
502,11.93,0.04527,0.0
503,11.93,0.06076,0.0
504,11.93,0.10959,0.0


In [30]:
# Inspect the MultiIndex: each entry is a (key, INDUS) tuple.
boston_data_index2.index

MultiIndex([(  0,  2.31),
            (  1,  7.07),
            (  2,  7.07),
            (  3,  2.18),
            (  4,  2.18),
            (  5,  2.18),
            (  6,  7.87),
            (  7,  7.87),
            (  8,  7.87),
            (  9,  7.87),
            ...
            (496,  9.69),
            (497,  9.69),
            (498,  9.69),
            (499,  9.69),
            (500,  9.69),
            (501, 11.93),
            (502, 11.93),
            (503, 11.93),
            (504, 11.93),
            (505, 11.93)],
           names=['key', 'INDUS'], length=506)

Rename column

In [31]:
# rename takes an {old_name: new_name} mapping and returns a new DataFrame;
# boston_data itself is not modified.
new_boston_data = boston_data.rename(columns={'CRIM':'feature1'})
new_boston_data

Unnamed: 0,key,feature1,ZN,INDUS
0,0,0.00632,18.0,2.31
1,1,0.02731,0.0,7.07
2,2,0.02729,0.0,7.07
3,3,0.03237,0.0,2.18
4,4,0.06905,0.0,2.18
...,...,...,...,...
501,501,0.06263,0.0,11.93
502,502,0.04527,0.0,11.93
503,503,0.06076,0.0,11.93
504,504,0.10959,0.0,11.93


新增column

法一 :

In [32]:
# Method 1: assigning to a new column name appends the column at the end.
# Work on a copy so boston_data stays untouched.
copy1 = boston_data.copy()
copy1['round_INDUS'] = copy1['INDUS'].round()
copy1

Unnamed: 0,key,CRIM,ZN,INDUS,round_INDUS
0,0,0.00632,18.0,2.31,2.0
1,1,0.02731,0.0,7.07,7.0
2,2,0.02729,0.0,7.07,7.0
3,3,0.03237,0.0,2.18,2.0
4,4,0.06905,0.0,2.18,2.0
...,...,...,...,...,...
501,501,0.06263,0.0,11.93,12.0
502,502,0.04527,0.0,11.93,12.0
503,503,0.06076,0.0,11.93,12.0
504,504,0.10959,0.0,11.93,12.0


法二 :<br>
`.insert`

In [33]:
# Method 2: insert places the new column at a chosen position (here
# position 1, right after 'key') and modifies copy2 in place.
copy2 = boston_data.copy()
rounded_indus = copy2['INDUS'].round()
copy2.insert(loc=1, column='round_INDUS', value=rounded_indus)
copy2

Unnamed: 0,key,round_INDUS,CRIM,ZN,INDUS
0,0,2.0,0.00632,18.0,2.31
1,1,7.0,0.02731,0.0,7.07
2,2,7.0,0.02729,0.0,7.07
3,3,2.0,0.03237,0.0,2.18
4,4,2.0,0.06905,0.0,2.18
...,...,...,...,...,...
501,501,12.0,0.06263,0.0,11.93
502,502,12.0,0.04527,0.0,11.93
503,503,12.0,0.06076,0.0,11.93
504,504,12.0,0.10959,0.0,11.93


刪除column<br>
`del`<br>
`.pop`<br>
`.drop(,axis)`

法一 :

In [34]:
# Method 1: del removes the column from copy2 in place and returns nothing.
del copy2['round_INDUS']
copy2

Unnamed: 0,key,CRIM,ZN,INDUS
0,0,0.00632,18.0,2.31
1,1,0.02731,0.0,7.07
2,2,0.02729,0.0,7.07
3,3,0.03237,0.0,2.18
4,4,0.06905,0.0,2.18
...,...,...,...,...
501,501,0.06263,0.0,11.93
502,502,0.04527,0.0,11.93
503,503,0.06076,0.0,11.93
504,504,0.10959,0.0,11.93


法二 :

In [35]:
# Method 2: pop removes the column from copy1 in place and hands the
# removed column back as a Series.
removed_column = copy1.pop('round_INDUS')
print(removed_column)
copy1

0       2.0
1       7.0
2       7.0
3       2.0
4       2.0
       ... 
501    12.0
502    12.0
503    12.0
504    12.0
505    12.0
Name: round_INDUS, Length: 506, dtype: float64


Unnamed: 0,key,CRIM,ZN,INDUS
0,0,0.00632,18.0,2.31
1,1,0.02731,0.0,7.07
2,2,0.02729,0.0,7.07
3,3,0.03237,0.0,2.18
4,4,0.06905,0.0,2.18
...,...,...,...,...
501,501,0.06263,0.0,11.93
502,502,0.04527,0.0,11.93
503,503,0.06076,0.0,11.93
504,504,0.10959,0.0,11.93


In [36]:
# Method 3: drop(..., axis=1) returns a new DataFrame without the column.
# The result is only displayed here, not assigned, so copy3 still has 'CRIM'.
copy3 = boston_data.copy()
copy3.drop('CRIM',axis=1)

Unnamed: 0,key,ZN,INDUS
0,0,18.0,2.31
1,1,0.0,7.07
2,2,0.0,7.07
3,3,0.0,2.18
4,4,0.0,2.18
...,...,...,...
501,501,0.0,11.93
502,502,0.0,11.93
503,503,0.0,11.93
504,504,0.0,11.93


In [37]:
# Add one row (key=506, other values 0) to the end of boston_data.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to add rows.
new_row = pd.DataFrame([[506, 0, 0, 0]], columns=boston_data.columns)

# ignore_index=True renumbers the row labels so the new row does not reuse
# label 0 (a duplicate label would make a later drop(0) remove two rows).
boston_data = pd.concat([boston_data, new_row], ignore_index=True)
boston_data

Unnamed: 0,key,CRIM,ZN,INDUS
0,0,0.00632,18.0,2.31
1,1,0.02731,0.0,7.07
2,2,0.02729,0.0,7.07
3,3,0.03237,0.0,2.18
4,4,0.06905,0.0,2.18
...,...,...,...,...
502,502,0.04527,0.0,11.93
503,503,0.06076,0.0,11.93
504,504,0.10959,0.0,11.93
505,505,0.04741,0.0,11.93


In [38]:
# drop defaults to rows: this removes the row whose index label is 1.
# The result is reassigned because drop returns a new DataFrame.
boston_data = boston_data.drop(1)
boston_data

Unnamed: 0,key,CRIM,ZN,INDUS
0,0,0.00632,18.0,2.31
2,2,0.02729,0.0,7.07
3,3,0.03237,0.0,2.18
4,4,0.06905,0.0,2.18
5,5,0.02985,0.0,2.18
...,...,...,...,...
502,502,0.04527,0.0,11.93
503,503,0.06076,0.0,11.93
504,504,0.10959,0.0,11.93
505,505,0.04741,0.0,11.93


`.loc`<br>
用index的標籤來取出資料

In [39]:
# Load the daily price table, then save two overlapping slices that the
# concat / merge / join examples read back in.
stock_data = pd.read_csv('STOCK_DAY_0050_202010.csv')

# .loc slices by label and includes both endpoints: rows 0-5 and rows 3-7.
first_slice = stock_data.loc[:5, ['date', 'open', 'close']]
second_slice = stock_data.loc[3:7, ['date', 'open', 'high']]

# index=False keeps the row labels out of the CSV files.
first_slice.to_csv('STOCK1.csv', index=False)
second_slice.to_csv('STOCK2.csv', index=False)

In [40]:
# Display the full price table (date, open, high, low, close).
stock_data

Unnamed: 0,date,open,high,low,close
0,109/10/05,103.45,104.05,103.0,103.05
1,109/10/06,104.0,104.35,103.85,104.25
2,109/10/07,104.0,105.0,103.5,104.8
3,109/10/08,105.45,106.35,105.3,106.2
4,109/10/12,106.7,107.7,106.7,107.05
5,109/10/13,107.35,107.6,106.2,107.1
6,109/10/14,107.05,107.2,106.45,106.7
7,109/10/15,106.5,106.5,105.1,105.7
8,109/10/16,105.7,106.3,105.1,105.25
9,109/10/19,105.65,106.6,105.6,106.6


In [41]:
# Boolean mask inside .loc: keep only the rows whose open price is below 104.
stock_data.loc[stock_data.open<104]

Unnamed: 0,date,open,high,low,close
0,109/10/05,103.45,104.05,103.0,103.05
18,109/10/30,103.55,103.6,102.7,103.0


In [42]:
# Combine two boolean masks with & (element-wise AND), then pick two columns.
opened_below_104 = stock_data.open < 104
closed_above_103 = stock_data.close > 103
stock_data.loc[opened_below_104 & closed_above_103, ['open', 'close']]

Unnamed: 0,open,close
0,103.45,103.05


In [45]:
# .loc slices by label and includes the end label: rows 3, 4, 5 and 6.
stock_data.loc[3:6]

Unnamed: 0,date,open,high,low,close
3,109/10/08,105.45,106.35,105.3,106.2
4,109/10/12,106.7,107.7,106.7,107.05
5,109/10/13,107.35,107.6,106.2,107.1
6,109/10/14,107.05,107.2,106.45,106.7


In [46]:
# .iloc slices by integer position and excludes the end: rows 3, 4 and 5.
stock_data.iloc[3:6]

Unnamed: 0,date,open,high,low,close
3,109/10/08,105.45,106.35,105.3,106.2
4,109/10/12,106.7,107.7,106.7,107.05
5,109/10/13,107.35,107.6,106.2,107.1


In [47]:
# Positional slicing on both axes: rows 3-5 and the first two columns.
stock_data.iloc[3:6,:2]

Unnamed: 0,date,open
3,109/10/08,105.45
4,109/10/12,106.7
5,109/10/13,107.35


In [48]:
# Read back the first saved slice (columns date, open, close).
stock_data1=pd.read_csv('STOCK1.csv')
stock_data1

Unnamed: 0,date,open,close
0,109/10/05,103.45,103.05
1,109/10/06,104.0,104.25
2,109/10/07,104.0,104.8
3,109/10/08,105.45,106.2
4,109/10/12,106.7,107.05
5,109/10/13,107.35,107.1


In [49]:
# Read back the second saved slice (columns date, open, high).
stock_data2=pd.read_csv('STOCK2.csv')
stock_data2

Unnamed: 0,date,open,high
0,109/10/08,105.45,106.35
1,109/10/12,106.7,107.7
2,109/10/13,107.35,107.6
3,109/10/14,107.05,107.2
4,109/10/15,106.5,106.5


pd.concat([x,y],axis)<br>
直接concat，不針對column去掉重複

In [50]:
# axis=0 stacks the frames vertically; columns missing from one frame are
# filled with NaN and the original row labels are kept (so labels repeat).
pd.concat([stock_data1,stock_data2],axis=0)

Unnamed: 0,date,open,close,high
0,109/10/05,103.45,103.05,
1,109/10/06,104.0,104.25,
2,109/10/07,104.0,104.8,
3,109/10/08,105.45,106.2,
4,109/10/12,106.7,107.05,
5,109/10/13,107.35,107.1,
0,109/10/08,105.45,,106.35
1,109/10/12,106.7,,107.7
2,109/10/13,107.35,,107.6
3,109/10/14,107.05,,107.2


In [51]:
# axis=1 places the frames side by side, aligned on the row labels;
# duplicate column names are kept and unmatched rows are filled with NaN.
pd.concat([stock_data1,stock_data2],axis=1)

Unnamed: 0,date,open,close,date.1,open.1,high
0,109/10/05,103.45,103.05,109/10/08,105.45,106.35
1,109/10/06,104.0,104.25,109/10/12,106.7,107.7
2,109/10/07,104.0,104.8,109/10/13,107.35,107.6
3,109/10/08,105.45,106.2,109/10/14,107.05,107.2
4,109/10/12,106.7,107.05,109/10/15,106.5,106.5
5,109/10/13,107.35,107.1,,,


交集 :<br>
pd.concat([x,y],axis,join='inner')

In [52]:
# join='inner' keeps only the columns present in both frames (date, open).
pd.concat([stock_data1,stock_data2],axis=0,join='inner')

Unnamed: 0,date,open
0,109/10/05,103.45
1,109/10/06,104.0
2,109/10/07,104.0
3,109/10/08,105.45
4,109/10/12,106.7
5,109/10/13,107.35
0,109/10/08,105.45
1,109/10/12,106.7
2,109/10/13,107.35
3,109/10/14,107.05


聯集 :<br>
pd.merge(x,y,on='',how='outer')<br>
針對column進行合併<br>

left: 拼接的左側DataFrame對象

right: 拼接的右側DataFrame對象

on: 要加入的列或索引級別名稱。必須在左側和右側DataFrame對像中找到。
    如果未傳遞且left_index和right_index為False，則DataFrame中的列的交集將被推斷為連接鍵。
    
left_on:左側DataFrame中的列或索引級別用作鍵。可以是列名，索引級名稱，也可以是長度等於DataFrame長度的數組。

right_on: 右側DataFrame中的列或索引級別用作鍵。可以是列名，索引級名稱，也可以是長度等於DataFrame長度的數組。

left_index: 如果為True，則使用左側DataFrame中的索引（行標籤）作為其連接鍵。
            對於具有MultiIndex（分層）的DataFrame，級別數必須與右側DataFrame中的連接鍵數相匹配。
            
right_index: 與left_index功能相似。

how: One of ‘left’, ‘right’, ‘outer’, ‘inner’. 默認inner。 

inner是取交集，outer取並集。

比如left：['A','B','C']；right：['A','C','D']；inner取交集的話，left中出現的A會和right中出現的每一個A進行匹配拼接，

如果是B，在right中沒有匹配到，則會丟失。 

'outer'取並集，出現的A會進行一一匹配，沒有同時出現的會將缺失的部分添加缺失值。

sort: 按字典順序通過連接鍵對結果DataFrame進行排序。pd.merge 默認為False（不排序）；設置為True才會排序，保持False在很多情況下性能明顯較好。

https://blog.csdn.net/brucewong0516/article/details/82707492

In [53]:
# Outer merge on 'date': keep every date from either frame. The shared
# 'open' column gets the automatic suffixes _x (left) and _y (right).
pd.merge(stock_data1,stock_data2,on='date',how='outer')

Unnamed: 0,date,open_x,close,open_y,high
0,109/10/05,103.45,103.05,,
1,109/10/06,104.0,104.25,,
2,109/10/07,104.0,104.8,,
3,109/10/08,105.45,106.2,105.45,106.35
4,109/10/12,106.7,107.05,106.7,107.7
5,109/10/13,107.35,107.1,107.35,107.6
6,109/10/14,,,107.05,107.2
7,109/10/15,,,106.5,106.5


In [54]:
# Left merge: keep every date from stock_data1; unmatched right-hand
# columns are NaN.
pd.merge(stock_data1,stock_data2,on='date',how='left')

Unnamed: 0,date,open_x,close,open_y,high
0,109/10/05,103.45,103.05,,
1,109/10/06,104.0,104.25,,
2,109/10/07,104.0,104.8,,
3,109/10/08,105.45,106.2,105.45,106.35
4,109/10/12,106.7,107.05,106.7,107.7
5,109/10/13,107.35,107.1,107.35,107.6


In [55]:
# Right merge: keep every date from stock_data2; unmatched left-hand
# columns are NaN.
pd.merge(stock_data1,stock_data2,on='date',how='right')

Unnamed: 0,date,open_x,close,open_y,high
0,109/10/08,105.45,106.2,105.45,106.35
1,109/10/12,106.7,107.05,106.7,107.7
2,109/10/13,107.35,107.1,107.35,107.6
3,109/10/14,,,107.05,107.2
4,109/10/15,,,106.5,106.5


In [56]:
# join aligns on the index, so move 'date' into the index of both frames.
stock_data1_index = stock_data1.set_index('date')
stock_data2_index = stock_data2.set_index('date')

In [57]:
# Outer join on the 'date' index. Both frames have an 'open' column, so
# lsuffix / rsuffix are given explicitly to tell the two apart.
stock_data1_index.join(stock_data2_index,how='outer',lsuffix='_left',rsuffix='_right')

Unnamed: 0_level_0,open_left,close,open_right,high
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
109/10/05,103.45,103.05,,
109/10/06,104.0,104.25,,
109/10/07,104.0,104.8,,
109/10/08,105.45,106.2,105.45,106.35
109/10/12,106.7,107.05,106.7,107.7
109/10/13,107.35,107.1,107.35,107.6
109/10/14,,,107.05,107.2
109/10/15,,,106.5,106.5


# Pandas 資料選取

利用欄位名稱選取單行資料

In [58]:
# Select a single column by its name.

import pandas as pd

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)

# Indexing with one column label returns that column as a Series.
for column_name in ('A', 'B'):
    print(df[column_name])


a    1
b    4
Name: A, dtype: int64
a    2
b    5
Name: B, dtype: int64


利用欄位名稱選取多行資料

In [59]:
# Select several columns by name.

import pandas as pd

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)

# Indexing with a list of labels returns a DataFrame with those columns.
for column_names in (['A', 'B'], ['A', 'C']):
    print(df[column_names])

   A  B
a  1  2
b  4  5
   A  C
a  1  3
b  4  6


利用列索引位置選取單列/多列資料

In [60]:
# Select one or more rows by positional slice.

import pandas as pd

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)

# An integer slice directly on the DataFrame selects rows by position;
# the stop position is excluded.
for stop in (1, 2):
    print(df[0:stop])

   A  B  C
a  1  2  3
   A  B  C
a  1  2  3
b  4  5  6


用 loc, iloc 取得行與列（ix 已在 pandas 1.0 移除）

`.loc`

In [61]:
# Select rows and columns by label with .loc.

import pandas as pd

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)

row_labels = ['a', 'b']
column_labels = ['A', 'B']

# scalar, scalar -> single value
print(df.loc['a', 'A'])
# scalar, list -> Series named after the row
print(df.loc['a', column_labels])
# list, scalar -> Series named after the column
print(df.loc[row_labels, 'A'])
# list, list -> DataFrame
print(df.loc[row_labels, column_labels])

1
A    1
B    2
Name: a, dtype: int64
a    1
b    4
Name: A, dtype: int64
   A  B
a  1  2
b  4  5


`.iloc`<br>
利用整數位置(position，從0開始)取值，而不是index標籤

In [63]:
# Select rows and columns by integer position with .iloc.
import pandas as pd

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)

first_two = [0, 1]

# scalar, scalar -> single value
print(df.iloc[0, 0])
# scalar, list -> Series for the first row
print(df.iloc[0, first_two])
# list, scalar -> Series for the first column
print(df.iloc[first_two, 0])
# list, list -> DataFrame
print(df.iloc[first_two, first_two])

1
A    1
B    2
Name: a, dtype: int64
a    1
b    4
Name: A, dtype: int64
   A  B
a  1  2
b  4  5


In [72]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'], columns=['A', 'B', 'C'])

# `.ix` (mixed label/position indexing) was deprecated in pandas 0.20 and
# removed in pandas 1.0, where calling it fails with
#   AttributeError: 'DataFrame' object has no attribute 'ix'
# To mix labels and positions, turn each position into a label through
# df.index[...] / df.columns[...] and select with `.loc`.
print(df.loc[df.index[0], 'A'])              # was df.ix[0, 'A']
print(df.loc['a', df.columns[[0, 1]]])       # was df.ix['a', [0, 1]]
print(df.loc[['a', 'b'], df.columns[0]])     # was df.ix[['a', 'b'], 0]
print(df.loc[df.index[[0, 1]], ['A', 'B']])  # was df.ix[[0, 1], ['A', 'B']]

# Expected output:
# 1
# A    1
# B    2
# Name: a, dtype: int64
# a    1
# b    4
# Name: A, dtype: int64
#    A  B
# a  1  2
# b  4  5

用 iat, at 取得資料

In [67]:
# Read single values with at / iat.

# loc / iloc return a scalar when given one row and one column.
value_by_label = df.loc['a', 'A']
value_by_position = df.iloc[0, 1]
print(value_by_label)
print(value_by_position)

# at / iat are the scalar-only counterparts: at takes labels, iat takes
# integer positions.
print(df.at['a', 'A'])
print(df.iat[0, 1])

1
2
1
2


根據條件篩選資料（遮罩）

In [68]:
# Filter data by condition (boolean mask).

# Comparing the whole DataFrame gives a same-shaped frame of booleans.
print(df > 2)
#        A      B     C
# a  False  False  True
# b   True   True  True

# Indexing with that boolean frame keeps the shape and replaces every
# cell where the mask is False with NaN (columns A and B then print as floats).
print(df[df > 2])
#      A    B  C
# a  NaN  NaN  3
# b  4.0  5.0  6

       A      B     C
a  False  False  True
b   True   True  True
     A    B  C
a  NaN  NaN  3
b  4.0  5.0  6


In [69]:
# Comparing a single column gives a boolean Series, one entry per row.
print(df['A'] > 2)
# a    False
# b     True
# Name: A, dtype: bool

# Indexing with a boolean Series keeps only the rows where it is True.
print(df[df['A'] > 2])
#    A  B  C
# b  4  5  6


a    False
b     True
Name: A, dtype: bool
   A  B  C
b  4  5  6
