Pandas数据规整 - 转换
===

---

In [1]:
import numpy as np
import pandas as pd

Pandas数据排序
---

.sort_index() 在指定轴上根据索引进行排序，索引排序后内容会跟随排序

In [2]:
b = pd.DataFrame(np.arange(20).reshape(4,5),index=['c','a','d','b'])
b

Unnamed: 0,0,1,2,3,4
c,0,1,2,3,4
a,5,6,7,8,9
d,10,11,12,13,14
b,15,16,17,18,19


#### sort_index() 按索引排序

In [3]:
b.sort_index()  # 默认按行索引排序，默认升序

Unnamed: 0,0,1,2,3,4
a,5,6,7,8,9
b,15,16,17,18,19
c,0,1,2,3,4
d,10,11,12,13,14


In [5]:
b.sort_index(axis=1, ascending=False)  # 按列索引排序，降序

Unnamed: 0,4,3,2,1,0
c,4,3,2,1,0
a,9,8,7,6,5
d,14,13,12,11,10
b,19,18,17,16,15


#### sort_values() 按值排序

In [6]:
dates = pd.date_range('20130101', periods = 10)
dates
df = pd.DataFrame(np.random.randn(10,4), index = dates, columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.220145,0.92786,-0.772562,0.528004
2013-01-02,0.64809,0.446625,0.112287,0.173115
2013-01-03,-0.778347,-0.98092,-0.288666,-0.927742
2013-01-04,-1.821275,-0.509233,-1.169413,0.192717
2013-01-05,1.328411,-0.032662,-0.026609,0.891312
2013-01-06,0.533473,-1.050066,1.867268,-0.055653
2013-01-07,-1.367985,0.233292,-2.817717,0.309683
2013-01-08,1.347572,1.284031,0.287255,0.600558
2013-01-09,1.497534,-1.491182,2.107624,-0.613372
2013-01-10,0.342294,-0.497035,0.748494,0.868377


In [8]:
# 默认按行排序（这一列的所有行）
df.sort_values(by='A')  # 指定排序基准列
df.sort_values(by='A', ascending=False)  # 倒序

Unnamed: 0,A,B,C,D
2013-01-09,1.497534,-1.491182,2.107624,-0.613372
2013-01-08,1.347572,1.284031,0.287255,0.600558
2013-01-05,1.328411,-0.032662,-0.026609,0.891312
2013-01-02,0.64809,0.446625,0.112287,0.173115
2013-01-06,0.533473,-1.050066,1.867268,-0.055653
2013-01-10,0.342294,-0.497035,0.748494,0.868377
2013-01-01,-0.220145,0.92786,-0.772562,0.528004
2013-01-03,-0.778347,-0.98092,-0.288666,-0.927742
2013-01-07,-1.367985,0.233292,-2.817717,0.309683
2013-01-04,-1.821275,-0.509233,-1.169413,0.192717


In [11]:
# 按列排序（一行的所有列）
df.sort_values(axis=1, by='2013-01-01', ascending=False)

Unnamed: 0,B,D,A,C
2013-01-01,0.92786,0.528004,-0.220145,-0.772562
2013-01-02,0.446625,0.173115,0.64809,0.112287
2013-01-03,-0.98092,-0.927742,-0.778347,-0.288666
2013-01-04,-0.509233,0.192717,-1.821275,-1.169413
2013-01-05,-0.032662,0.891312,1.328411,-0.026609
2013-01-06,-1.050066,-0.055653,0.533473,1.867268
2013-01-07,0.233292,0.309683,-1.367985,-2.817717
2013-01-08,1.284031,0.600558,1.347572,0.287255
2013-01-09,-1.491182,-0.613372,1.497534,2.107624
2013-01-10,-0.497035,0.868377,0.342294,0.748494


In [12]:
a = pd.DataFrame(np.arange(12).reshape(3,4), index=['a','b','c'])
a
b = pd.DataFrame(np.arange(20).reshape(4,5), index=['c','a','d','b'])
b

c = a + b
c

Unnamed: 0,0,1,2,3,4
a,5.0,7.0,9.0,11.0,
b,19.0,21.0,23.0,25.0,
c,8.0,10.0,12.0,14.0,
d,,,,,


关于排序中的缺失值问题

排序不论升序降序，缺失值永远排在最后

In [15]:
c.sort_values(by=0)  # 升序，缺失值在最后
c.sort_values(by=0, ascending=False)  # 降序，缺失值还在最后

Unnamed: 0,0,1,2,3,4
b,19.0,21.0,23.0,25.0,
c,8.0,10.0,12.0,14.0,
a,5.0,7.0,9.0,11.0,
d,,,,,


---


随机排列和随机采样
---


### 随机排列

利用numpy.random.permutation函数可以实现对Series或DataFrame的列的随机排序工作（permuting，随机重排序）

通过需要排列的轴的长度调用permutation，可产生一个表示新顺序的整数数组：

In [4]:
# 随机排列序列
a = [1,2,3,4,5,6,7]
a
np.random.permutation(a)

array([3, 6, 5, 4, 7, 2, 1])

In [5]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [7]:
df.index
df.columns.values

array([0, 1, 2, 3], dtype=int64)

In [15]:
df.loc[np.random.permutation(df.index)]

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11


In [9]:
# 打乱行索引
np.random.permutation(df.index)

array([4, 0, 1, 2, 3], dtype=int64)

In [16]:
df.loc[:,np.random.permutation(df.columns)]

Unnamed: 0,1,3,2,0
0,1,3,2,0
1,5,7,6,4
2,9,11,10,8
3,13,15,14,12
4,17,19,18,16


In [10]:
df.loc[[4, 0, 1, 2, 3] ]
# df.loc[np.random.permutation(df.index)]

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


随机重排行索引和列索引

In [17]:
index = np.random.permutation(df.index)
index

columns = np.random.permutation(df.columns)
columns

df.loc[index, columns]

Unnamed: 0,3,2,1,0
4,19,18,17,16
1,7,6,5,4
0,3,2,1,0
3,15,14,13,12
2,11,10,9,8


### 随机采样

choice(),从一个序列中随机抽取某些值

In [150]:
a = [1,2,3,4,5,6,7]

np.random.choice(a)
np.random.choice(a, size=3)  # 放回抽样
np.random.choice(a, size=3, replace=False)  # 不放回抽样，不重复抽取

array([1, 5, 4])

#### 随机采样

In [18]:
a =[1,2,3,4,5,6,7]
np.random.choice(a)

6

In [23]:
np.random.choice(a,size =3,replace =False)

array([5, 6, 1])

In [151]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [24]:
df.loc[np.random.choice(df.index,size =3,replace =False)]

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19


In [162]:
# 按行索引采样

df.index

index = np.random.choice(df.index, size=3, replace=False)
df.loc[index]

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3


In [172]:
# 按行、按列随机采样
df.index
df.columns

index = np.random.choice(df.index, size=3, replace=False)
columns = np.random.choice(df.columns, size=2, replace=False)

df.loc[index, columns]

Unnamed: 0,3,2
3,15,14
0,3,2
2,11,10


重新索引（修改索引）
---

reindex() ，重新索引，创建一个适应新索引的新对象

一种变相的**查询方式**，类似在查询中加入新行新列

* 直接赋值修改索引
* set_index(), reset_index()：普通行列和索引互相转换
* rename是将原索引某些值替换为新值
* reindex则是将整个索引重建（并不替换索引值，而是增减索引或改变顺序，原索引对应的值关系不变）
    * reindex可以理解为一种查询方式
        * loc查询如果索引值不存在，会报警告
        * reindex查询如何索引值不存在，会新增一行或一列新值，值为缺失值

In [29]:
obj = pd.Series([4.5,7,2,-5.3], index=['d','b','a','c'])
obj

d    4.5
b    7.0
a    2.0
c   -5.3
dtype: float64

In [174]:
obj.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [175]:
obj.index.values

array(['d', 'b', 'a', 'c'], dtype=object)

#### 直接修改索引

有问题，不合适

* 索引修改后，值没有跟着变化
* 修改值必须和原索引长度保持一致，不能增加或删除索引

In [176]:
obj.index = ['a', 'b', 'c', 'd']  # 索引修改，单个赋值不行，整体赋值可以
obj

a    4.5
b    7.0
c    2.0
d   -5.3
dtype: float64

In [178]:
# obj.index = ['a','b','c']  # 长度不一致，报错

使用rename修改索引

只能替换已有索引

In [179]:
obj

a    4.5
b    7.0
c    2.0
d   -5.3
dtype: float64

In [32]:
b= obj.rename({'a': 'aaa', 'b': 'bbb', 'x': 'xx'})

In [34]:
b

d      4.5
bbb    7.0
aaa    2.0
c     -5.3
dtype: float64

In [33]:
obj

d    4.5
b    7.0
a    2.0
c   -5.3
dtype: float64

#### 正规做法：使用reindex()重新索引

In [181]:
obj

a    4.5
b    7.0
c    2.0
d   -5.3
dtype: float64

In [35]:
obj.reindex(['b', 'd', 'a'])  # 删除值

b    7.0
d    4.5
a    2.0
dtype: float64

In [36]:
obj

d    4.5
b    7.0
a    2.0
c   -5.3
dtype: float64

In [38]:
obj

d    4.5
b    7.0
a    2.0
c   -5.3
dtype: float64

In [37]:
obj.reindex(['a','b','c','d','e'])  # 增加值

a    2.0
b    7.0
c   -5.3
d    4.5
e    NaN
dtype: float64

In [39]:
obj.loc[['a', 'b' ,'c', 'd', 'e']]  # 结果同上，唯一区别是警告，建议用reindex()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


a    2.0
b    7.0
c   -5.3
d    4.5
e    NaN
dtype: float64

In [185]:
# 新增的缺失值，填充默认值
obj.reindex(['b','d','a','c','e'], fill_value=0)

b    7.0
d   -5.3
a    4.5
c    2.0
e    0.0
dtype: float64

实际应用

In [189]:
obj2 = pd.Series(['blue','purple','yellow'], index = [0,2,4])
obj2

0      blue
2    purple
4    yellow
dtype: object

In [190]:
range(6)

range(0, 6)

In [191]:
obj2.reindex(range(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [193]:
# 填充缺失值，指定填充值
obj2.reindex(range(6), fill_value=0)

0      blue
1         0
2    purple
3         0
4    yellow
5         0
dtype: object

In [196]:
# 前向，后向填充
obj2.reindex(range(6), method='ffill')
obj2.reindex(range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

#### DataFrame的索引重建

In [42]:
frame = pd.DataFrame(np.random.randint(30,100,(3,3)), index = ['语文','数学','英语'], columns = ['张三','李四','王五'])
frame

Unnamed: 0,张三,李四,王五
语文,75,54,73
数学,89,55,82
英语,79,96,95


In [199]:
# 重建行索引
frame.reindex(['数学', '英语', '语文'])
frame.reindex(['数学', '编程', '语文'])

Unnamed: 0,张三,李四,王五
数学,6.0,9.0,73.0
编程,,,
语文,2.0,45.0,29.0


In [202]:
# 重建列索引
frame.reindex(['张三', '赵六', '王五'], axis=1)
frame.reindex(columns=['张三', '赵六', '王五'])

Unnamed: 0,张三,赵六,王五
语文,2,,29
数学,6,,73
英语,62,,37


In [203]:
# 重建行列索引
frame.reindex(index=['语文', '编程', '英语'], columns=['张三', '赵六', '王五'])

Unnamed: 0,张三,赵六,王五
语文,2.0,,29.0
编程,,,
英语,62.0,,37.0


reindex()类似loc查询，但查询的可以是没有的索引

In [212]:
frame.loc[['语文', '编程', '英语'], ['张三', '赵六', '王五']]  # 警告，建议使用reindex方法

Unnamed: 0,张三,赵六,王五
语文,20.0,,79.0
编程,,,
英语,27.0,,80.0


例2

In [205]:
frame

Unnamed: 0,张三,李四,王五
语文,2,45,29
数学,6,9,73
英语,62,65,37


In [213]:
frame.index = [1,2,4]  # 索引可以直接赋值修改，但是不能直接修改单个索引
frame.columns = [1,2,3]
frame

Unnamed: 0,1,2,3
1,20,12,79
2,64,71,99
4,27,25,80


In [207]:
frame[2]

1    45
2     9
4    65
Name: 2, dtype: int32

In [208]:
# 重建行列索引
frame.reindex(index=[1,2,10,4], columns=[3,1,5,2])

Unnamed: 0,3,1,5,2
1,29.0,2.0,,45.0
2,73.0,6.0,,9.0
10,,,,
4,37.0,62.0,,65.0


In [214]:
# 弃用写法，使用上面写法
frame.loc[[1,2,10,4], [3,1,5,2]]  # 重复执行不报警告

Unnamed: 0,3,1,5,2
1,79.0,20.0,,12.0
2,99.0,64.0,,71.0
10,,,,
4,80.0,27.0,,25.0


In [216]:
frame.reindex([1,2,10,4])
frame.reindex([1,2,10,4], method='ffill')  # 前向填充，按实际行索引排序填充

Unnamed: 0,1,2,3
1,20,12,79
2,64,71,99
10,27,25,80
4,27,25,80


带有重复值的轴索引
---

许多Pandas函数要求标签唯一，但这不是强制的

In [217]:
obj = pd.Series(range(5), index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [218]:
obj['a']

a    0
a    1
dtype: int64

In [222]:
obj['a'].values
obj['a'].values[0]

0

In [223]:
# 索引的is_unique属性判断索引值是否唯一
obj.index.is_unique

False