# numpy

### 基本使用

In [1]:
import numpy as np

In [2]:
# 创建ndarray
data1 = [1,2,3,4.5]
arr1 = np.array(data1)
arr1

array([ 1. ,  2. ,  3. ,  4.5])

In [3]:
# 维度数
arr1.ndim

1

In [4]:
# 形状
arr1.shape

(4,)

In [5]:
# 数据类型
arr1.dtype

dtype('float64')

In [6]:
# zeros和ones
np.zeros((3,5))

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [7]:
np.ones(3)

array([ 1.,  1.,  1.])

In [10]:
# 可以根据另一个数组的形状创建新的ones/zeros数组
np.ones_like(arr1)

array([ 1.,  1.,  1.,  1.])

In [8]:
# empty: allocates the array without initializing it, so the entries are
# whatever values happened to be in memory (garbage), not zeros.
np.empty((2,3))

array([[  6.94926844e-310,   2.04975505e-316,   5.30613856e-317],
       [  6.94925441e-310,   6.94925441e-310,   6.94926241e-310]])

In [9]:
# arange
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# 创建nxn单位矩阵
np.eye(5)

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])

### 数据类型

In [13]:
# astype进行类型转换
# 注意，会创建新数组
arr2 = np.array([1,2,3])
arr2.dtype

dtype('int64')

In [14]:
float_arr = arr2.astype(float)
float_arr

array([ 1.,  2.,  3.])

In [16]:
arr3 = np.array(['1','22','333'])
arr3.dtype

dtype('<U3')

In [17]:
arr3_int = arr3.astype(int)
arr3_int

array([  1,  22, 333])

### 数组与标量

In [18]:
arr4 = np.array([1,2,3])
arr4

array([1, 2, 3])

In [19]:
arr4 * 5

array([ 5, 10, 15])

### 索引与切片

In [20]:
# 注意，切片是视图，会直接修改到原数据上
arr5 = np.arange(10)
arr5

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
arr5[2:5] = 4
arr5

array([0, 1, 4, 4, 4, 5, 6, 7, 8, 9])

In [22]:
# 显式要求复制
arr6 = arr5.copy()
arr6

array([0, 1, 4, 4, 4, 5, 6, 7, 8, 9])

In [23]:
# Two-dimensional example: 2 rows by 3 columns holding the integers 1..6.
arr7 = np.arange(1, 7).reshape(2, 3)
arr7

array([[1, 2, 3],
       [4, 5, 6]])

In [24]:
arr7[0,1]

2

In [25]:
arr7[:, :1]

array([[1],
       [4]])

In [26]:
arr7[:1, :]

array([[1, 2, 3]])

### 布尔型索引

In [27]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob'])
names

array(['Bob', 'Joe', 'Will', 'Bob'],
      dtype='<U4')

In [28]:
names == 'Bob'

array([ True, False, False,  True], dtype=bool)

In [34]:
data = np.random.randn(4,3)
data

array([[-0.1920943 , -0.07973945,  0.30195219],
       [-0.55032431, -0.07438037, -0.31195519],
       [ 0.05839994,  0.59304159, -0.7079488 ],
       [-0.28031847, -1.43568186,  2.44058628]])

In [35]:
# The boolean mask is applied along the first axis: every row whose mask
# entry is True is selected, which here means rows 0 and 3.
data[names == 'Bob']

array([[-0.1920943 , -0.07973945,  0.30195219],
       [-0.28031847, -1.43568186,  2.44058628]])

In [37]:
data[(names == 'Bob') | (names == 'Will')]

array([[-0.1920943 , -0.07973945,  0.30195219],
       [ 0.05839994,  0.59304159, -0.7079488 ],
       [-0.28031847, -1.43568186,  2.44058628]])

上面的例子可能没有充分体现布尔索引的作用。实际上，它更常用于按条件批量筛选或赋值，例如：

In [38]:
# 将data中所有负数设置为0
data[data < 0] = 0
data

array([[ 0.        ,  0.        ,  0.30195219],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.05839994,  0.59304159,  0.        ],
       [ 0.        ,  0.        ,  2.44058628]])

### 花式索引
利用整数数组进行索引。
- 注意，花式索引会复制数据到新数组中。

In [39]:
arr8 = np.arange(32).reshape((8,4))

In [40]:
arr8

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [42]:
arr8[[4,3,0,6]]
# 选出了第4、3、0、6行

array([[16, 17, 18, 19],
       [12, 13, 14, 15],
       [ 0,  1,  2,  3],
       [24, 25, 26, 27]])

In [44]:
arr8[[1,5,7,2],[0,3,1,2]]

array([ 4, 23, 29, 10])

In [45]:
arr8[[1,5,7,2]][:, [0,3,1,2]]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

### 数组转置

In [47]:
arr9 = np.arange(16).reshape((4,4))
arr9

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [48]:
arr9.T #T或transpose均可

array([[ 0,  4,  8, 12],
       [ 1,  5,  9, 13],
       [ 2,  6, 10, 14],
       [ 3,  7, 11, 15]])

In [49]:
arr9

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [50]:
arr9.transpose()

array([[ 0,  4,  8, 12],
       [ 1,  5,  9, 13],
       [ 2,  6, 10, 14],
       [ 3,  7, 11, 15]])

In [51]:
arr10 = np.arange(16).reshape((2,2,4))
arr10

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [52]:
# For arrays with more than two dimensions, pass a tuple of axis numbers to
# choose the permutation (called with no arguments, transpose simply reverses
# the axis order). Here axes 0 and 1 are swapped and axis 2 stays in place.
arr10.transpose((1,0,2)) # arr10 itself is not modified

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [53]:
arr10.swapaxes(1,2) # 不会改变原数据

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

In [54]:
arr10

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

### 通用函数

In [55]:
# sqrt/ exp
arr11 = np.arange(10)
arr11

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [56]:
np.sqrt(arr11)

array([ 0.        ,  1.        ,  1.41421356,  1.73205081,  2.        ,
        2.23606798,  2.44948974,  2.64575131,  2.82842712,  3.        ])

In [57]:
np.exp(arr11)

array([  1.00000000e+00,   2.71828183e+00,   7.38905610e+00,
         2.00855369e+01,   5.45981500e+01,   1.48413159e+02,
         4.03428793e+02,   1.09663316e+03,   2.98095799e+03,
         8.10308393e+03])

In [60]:
x = np.random.randn(8)
x

array([-0.36658431, -0.0652528 , -1.24696049, -0.32118567,  0.58602633,
       -1.03346972,  1.32712143, -0.30942174])

In [61]:
y = np.random.randn(8)
y

array([-0.31226546, -0.63054313, -0.9858939 ,  0.89245589, -2.03442676,
       -2.56709715, -0.18830626,  0.46129051])

In [63]:
np.maximum(x, y)

array([-0.31226546, -0.0652528 , -0.9858939 ,  0.89245589,  0.58602633,
       -1.03346972,  1.32712143,  0.46129051])

In [64]:
np.abs(x)

array([ 0.36658431,  0.0652528 ,  1.24696049,  0.32118567,  0.58602633,
        1.03346972,  1.32712143,  0.30942174])

In [65]:
np.sign(x)

array([-1., -1., -1., -1.,  1., -1.,  1., -1.])

In [66]:
np.ceil(x)

array([-0., -0., -1., -0.,  1., -1.,  2., -0.])

In [67]:
np.floor(x)

array([-1., -1., -2., -1.,  0., -2.,  1., -1.])

In [68]:
# rint rounds each element to the nearest integer while keeping the float
# dtype. Exact halves go to the nearest even value (e.g. 0.5 -> 0., 1.5 -> 2.),
# so this is not strictly schoolbook "round half up".
np.rint(x)

array([-0., -0., -1., -0.,  1., -1.,  1., -0.])

In [69]:
np.add(x, y)

array([-0.67884977, -0.69579594, -2.2328544 ,  0.57127022, -1.44840043,
       -3.60056687,  1.13881517,  0.15186878])

### np.where
利用数组运算来替代循环，可以有效提高效率。

In [3]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

In [5]:
# Pure-Python version: walk the three arrays in lockstep and, at each
# position, keep the xarr value when the flag is True and the yarr value
# otherwise.
result = [
    x_val if use_x else y_val
    for x_val, y_val, use_x in zip(xarr, yarr, cond)
]
result

[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]

In [6]:
# np.where 写法
result2 = np.where(cond, xarr, yarr)
result2

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

In [7]:
# 又例，将矩阵中所有正值设置为2
arr11 = np.random.randn(4,4)
arr11

array([[ 1.90026198, -1.01409733, -1.08896413, -1.32945217],
       [ 0.72239659,  1.49454124,  0.74509934, -1.10491221],
       [-0.29526824, -0.67128472,  0.30198071, -1.84463636],
       [ 0.07033194, -0.23361588,  1.49238093,  0.20184756]])

In [9]:
np.where(arr11 > 0, 2, arr11)

array([[ 2.        , -1.01409733, -1.08896413, -1.32945217],
       [ 2.        ,  2.        ,  2.        , -1.10491221],
       [-0.29526824, -0.67128472,  2.        , -1.84463636],
       [ 2.        , -0.23361588,  2.        ,  2.        ]])

In [10]:
# 注意，以上arr11本身没有改变
arr11

array([[ 1.90026198, -1.01409733, -1.08896413, -1.32945217],
       [ 0.72239659,  1.49454124,  0.74509934, -1.10491221],
       [-0.29526824, -0.67128472,  0.30198071, -1.84463636],
       [ 0.07033194, -0.23361588,  1.49238093,  0.20184756]])

### 数学和统计方法

In [14]:
arr12 = np.random.randn(5,4)
arr12

array([[-0.15799537,  0.62498425, -0.75612024, -0.30340824],
       [ 1.3149259 ,  0.00320547, -1.2088047 , -0.50315742],
       [-0.34999942,  1.50942342,  0.22698809, -0.9521637 ],
       [ 0.65514894, -0.07761108,  0.04268914,  1.26077156],
       [-0.06501599,  0.2623725 , -0.01173315, -1.49532901]])

In [15]:
# 可以直接从数组对象调用
arr12.mean()

0.00095854718104708696

In [16]:
arr12.sum() 

0.019170943620941738

In [17]:
# 也可以作为函数，将数组传参进去
np.sum(arr12)

0.019170943620941738

In [22]:
# 可以指定轴。 比如下面就是在轴1上操作，即对每一行的各列进行均值。
arr12.mean(axis=1)

array([-0.1481349 , -0.09845768,  0.1085621 ,  0.47024964, -0.32742642])

In [19]:
# 3x3 matrix of the integers 0..8, used as the input for the cumulative
# sum / cumulative product examples.
arr13 = np.arange(9).reshape(3, 3)

In [20]:
arr13.cumsum()

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36], dtype=int32)

In [23]:
arr13.cumsum(0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

In [24]:
# cumprod
arr13.cumprod(1)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]], dtype=int32)

In [25]:
arr13.cumprod(0)

array([[ 0,  1,  2],
       [ 0,  4, 10],
       [ 0, 28, 80]], dtype=int32)

### 用于布尔型数组的方法

In [3]:
arr14 = np.random.randn(100)
arr14

array([-0.07004865,  1.51995261, -0.01783735,  1.2557842 ,  0.34873856,
        0.33985502, -0.26400787,  0.12217391,  2.00080344,  0.95048326,
       -0.32198472, -0.177675  ,  0.88276257,  1.16605555, -0.43414242,
        0.40876917,  0.205961  , -0.54387717, -0.80221   , -0.81375639,
       -0.17063964, -0.5083119 ,  1.04817235, -1.59101211, -1.68407241,
       -0.31479141, -1.02807653, -0.69654968, -0.71236932, -1.06194064,
       -1.43052471, -0.4231525 ,  0.64112863, -0.43835153,  2.82005455,
       -0.56721246,  0.82761083, -1.58568204, -0.40120716, -1.12216723,
       -0.52829621, -0.86330583,  1.49834384,  0.80689993,  1.07374236,
        2.77927268, -0.12537306,  0.87883402, -0.76575775, -1.92614606,
        2.19223143, -0.48724915, -1.77486801,  0.30192852,  2.24161314,
        0.21527938,  0.20778155,  0.52888445, -0.95642157,  0.51114772,
        0.42399953, -0.94458545,  1.3322681 , -0.43631112, -0.27233558,
       -0.57545056, -1.02465184,  1.28059161,  1.11999185,  0.13

In [5]:
# 布尔True认为是1，False是0， 所以sum可以统计True的个数
(arr14 > 0).sum()

41

In [6]:
# any和all
bools = np.array([False, True, False])
bools.any()

True

In [7]:
bools.all()

False

### 排序

In [9]:
arr15 = np.random.randn(8)
arr15

array([-0.02752718, -0.46597028,  0.86262303, -1.56132486, -0.01512583,
       -0.73590839,  0.18396684, -1.2637109 ])

In [10]:
# 对象自身上调用sort会就地排序
arr15.sort()
arr15

array([-1.56132486, -1.2637109 , -0.73590839, -0.46597028, -0.02752718,
       -0.01512583,  0.18396684,  0.86262303])

In [11]:
# 多维可以按轴排序
arr16 = np.random.randn(5, 3)
arr16

array([[-0.8840487 ,  0.97765132, -1.97928042],
       [-1.25526275, -0.86469377,  1.29439127],
       [-0.77276296,  0.03225839, -1.39951738],
       [ 0.48761615, -0.45539716, -0.18653752],
       [-1.27866177, -1.46178005, -1.36740365]])

In [13]:
arr16.sort(1)
arr16

array([[-1.97928042, -0.8840487 ,  0.97765132],
       [-1.25526275, -0.86469377,  1.29439127],
       [-1.39951738, -0.77276296,  0.03225839],
       [-0.45539716, -0.18653752,  0.48761615],
       [-1.46178005, -1.36740365, -1.27866177]])

In [14]:
# 如果通过np方法调用sort则是复制
arr17 = np.random.randn(10)
arr17

array([-1.57650752,  0.57263232,  0.75914283, -0.49850677,  0.90661078,
        1.16393308,  0.49089284, -0.36203335, -0.77059989,  0.7398848 ])

In [15]:
np.sort(arr17)

array([-1.57650752, -0.77059989, -0.49850677, -0.36203335,  0.49089284,
        0.57263232,  0.7398848 ,  0.75914283,  0.90661078,  1.16393308])

In [16]:
arr17

array([-1.57650752,  0.57263232,  0.75914283, -0.49850677,  0.90661078,
        1.16393308,  0.49089284, -0.36203335, -0.77059989,  0.7398848 ])

### 集合运算

In [18]:
ints = np.array([4, 2, 3, 2, 4, 7, 2, 1])
# unique 得到唯一(且排好序)
np.unique(ints)

array([1, 2, 3, 4, 7])

In [19]:
# intersect1d  注意，结果已排序
ints2 = np.array([1, 9, 9, 8, 2])
np.intersect1d(ints, ints2)

array([1, 2])

In [21]:
# union1d  注意，结果已排序
np.union1d(ints, ints2)

array([1, 2, 3, 4, 7, 8, 9])

In [22]:
# isin: element-wise membership test. For every value in ints, report whether
# it also occurs in ints2; the result is a boolean array with the same shape
# as ints. np.isin is used instead of np.in1d, which is deprecated in NumPy 2.0
# and returns the same result for 1-D input.
np.isin(ints, ints2)

array([False,  True, False,  True, False, False,  True,  True], dtype=bool)

In [23]:
# setdiff1d 元素在x且不在y中
np.setdiff1d(ints, ints2)

array([3, 4, 7])

In [24]:
# setxor1d: symmetric difference, i.e. the values that occur in exactly one
# of the two arrays (here 3, 4, 7 only in ints and 8, 9 only in ints2).
np.setxor1d(ints, ints2)

array([3, 4, 7, 8, 9])

### 读写文件

#### 二进制形式

In [25]:
arr18 = np.arange(10)

In [27]:
# save
np.save('arr18', arr18)

In [28]:
# load
np.load('arr18.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [30]:
# savez: store several arrays in a single .npz archive, each under the
# keyword name given here ('a' and 'b'). Note that savez does not compress
# the data; np.savez_compressed is the compressing variant.
np.savez('arr18x2.npz', a=arr18, b=arr18)

In [31]:
# Loading an .npz file returns a dict-like archive object; each array is
# read from the file when it is looked up by the name it was saved under.
# The archive holds the file open, so call arch.close() once finished.
arch = np.load('arr18x2.npz')
arch['a']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [32]:
arch['b']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

#### 文本形式

In [35]:
arr19 = np.arange(10).reshape((2, 5))
arr19

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [38]:
# 保存文本
np.savetxt('arr19.txt', arr19, delimiter=',')

In [39]:
# Load the text file back. loadtxt parses values as float by default, so
# the integers that were saved come back as floating-point numbers.
arr20 = np.loadtxt('arr19.txt', delimiter=',')
arr20

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.]])

### 线性代数

In [40]:
# 默认通过*对两个矩阵相乘，得到的是元素级的，即对应元素相乘的结果
# dot可以进行数学意义上的矩阵相乘
x = np.array([[1,2,3],[4,5,6]])
y = np.array([[6,23],[-1,7],[8,9]])

In [41]:
x

array([[1, 2, 3],
       [4, 5, 6]])

In [42]:
y

array([[ 6, 23],
       [-1,  7],
       [ 8,  9]])

In [43]:
# dot: matrix multiplication (not element-wise). x is 2x3 and y is 3x2,
# so the product is 2x2.
x.dot(y)

array([[ 28,  64],
       [ 67, 181]])

In [44]:
# 另一种做法
np.dot(x,y)

array([[ 28,  64],
       [ 67, 181]])

In [46]:
# Matrix inverse: form the square 5x5 product X^T X from a random matrix
# and invert it with np.linalg.inv.
X = np.random.randn(5, 5)
mat = np.dot(X.T, X)
np.linalg.inv(mat)

array([[ 0.14752437, -0.06764169, -0.19801416, -0.10892769,  0.01280471],
       [-0.06764169,  0.63000918,  0.62192966,  0.3315618 , -0.17780176],
       [-0.19801416,  0.62192966,  2.09450673,  0.61645948,  0.06729601],
       [-0.10892769,  0.3315618 ,  0.61645948,  0.43551364, -0.07520212],
       [ 0.01280471, -0.17780176,  0.06729601, -0.07520212,  0.18226222]])

更多详细内容参考文档。

### 随机数

numpy.random 提供了比 Python 内置 random 模块更丰富的随机数生成函数，并且在生成大量样本时效率要高得多。

In [51]:
# randn
arr21 = np.random.randn(1, 5)
arr21

array([[-0.11406592, -1.59281805,  0.4071151 , -0.72869222, -0.42348632]])

In [50]:
np.random.normal(size=(2, 4))

array([[-0.09863918,  1.86903364, -0.10280812, -0.27169161],
       [-0.2500016 ,  0.22256565,  0.4264522 ,  0.15114664]])

In [60]:
# shuffle: randomly reorder the elements of arr22 in place.
arr22 = np.arange(1, 7)
np.random.shuffle(arr22)
arr22

array([2, 4, 3, 1, 5, 6])

产生随机数的常用方法（均位于 `np.random` 下）：

- seed
- permutation
- shuffle
- rand
- randint
- randn
- binomial
- beta
- chisquare
- gamma
- uniform