# 2.1 理解Python中的数据类型

## 2.1.1 Python整型不仅仅是一个整型

在Python中定义一个整型，例如x=10000时，x并不是一个“原生”整型，而是一个指针，指向一个C语言的复合结构体。如下：
```c
/* Simplified layout of the C structure behind a Python integer object. */
struct _longobject {
    long ob_refcnt;         /* reference count */
    PyTypeObject *ob_type;  /* pointer to the object's type */
    size_t ob_size;         /* size of the data members that follow */
    long ob_digit[1];       /* storage for the integer value itself */
};
```

## 2.1.2 Python列表不仅仅是一个列表

创建一个整型值列表

In [1]:
L = list(range(10))
L

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [2]:
type(L[0])

int

或者创建一个字符串列表

In [3]:
L2 = [str(c) for c in L]
L2

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [4]:
type(L2[0])

str

创建一个异构的列表

In [5]:
L3 = [True, "2", 3.0, 4]
[type(item) for item in L3]

[bool, str, float, int]

## 2.1.3 Python中的固定类型数组

In [6]:
import array
L = list(range(10))
A = array.array('i', L)
A

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## 2.1.4 从Python列表创建数组

创建ndarray整型数组

In [7]:
import numpy as np
np.array([1, 4, 2, 5, 3])

array([1, 4, 2, 5, 3])

> NumPy要求数组必须包含同一类型的数据，如果类型不匹配，NumPy将会向上转换（如果可行）

In [8]:
np.array([3.14, 4, 2, 3])

array([3.14, 4.  , 2.  , 3.  ])

可以使用dtype关键字明确设置数组的数据类型

In [9]:
np.array([1, 2, 3, 4], dtype='float32')

array([1., 2., 3., 4.], dtype=float32)

NumPy数组可以被指定为多维的

In [10]:
np.array([range(i, i + 3) for i in [2, 4, 6]])

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

## 2.1.5 从头创建数组

In [11]:
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
np.ones((3, 5), dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [13]:
np.full((3, 5), 3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [14]:
np.arange(0, 20, 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [15]:
np.linspace(0, 1, 5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [16]:
np.random.random((3, 3))

array([[0.61107973, 0.04853689, 0.75628882],
       [0.35353679, 0.13111344, 0.6679042 ],
       [0.32466172, 0.56985552, 0.10252965]])

In [17]:
np.random.normal(0, 1, (3, 3))

array([[-1.58561967,  1.73515783, -0.69768858],
       [-1.98880383, -0.59359532, -0.37142168],
       [-0.18380234, -1.73288075,  1.07133946]])

In [18]:
np.random.randint(0, 10, (3, 3))

array([[5, 9, 1],
       [4, 2, 7],
       [6, 6, 0]])

In [19]:
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [20]:
np.empty(3)

array([1., 1., 1.])

## 2.1.6 NumPy标准数据类型

> bool_, int_, intc, intp, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float_, float16, float32, float64, complex_, complex64, complex128

# 2.2 NumPy数组基础

## 2.2.1 NumPy数组的属性

In [21]:
import numpy as np
np.random.seed(0)

x1 = np.random.randint(10, size=6)
x2 = np.random.randint(10, size=(3, 4))
x3 = np.random.randint(10, size=(3, 4, 5))

In [22]:
x1

array([5, 0, 3, 3, 7, 9])

In [23]:
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [24]:
x3

array([[[8, 1, 5, 9, 8],
        [9, 4, 3, 0, 3],
        [5, 0, 2, 3, 8],
        [1, 3, 3, 3, 7]],

       [[0, 1, 9, 9, 0],
        [4, 7, 3, 2, 7],
        [2, 0, 0, 4, 5],
        [5, 6, 8, 4, 1]],

       [[4, 9, 8, 1, 1],
        [7, 9, 9, 3, 6],
        [7, 2, 0, 3, 5],
        [9, 4, 4, 6, 4]]])

## 每个数组有ndim（数组的维度）、shape（数组每个维度的大小）和size（数组的总大小）属性

In [25]:
x3.ndim

3

In [26]:
x3.shape

(3, 4, 5)

In [27]:
x3.size

60

## dtype（数组的数据类型）

In [28]:
x3.dtype

dtype('int64')

## itemsize（每个数组元素的字节大小，此处为8字节），以及表示数组总字节大小的属性nbytes

In [29]:
x3.itemsize

8

In [30]:
x3.nbytes

480

> 一般来说，我们可以认为nbytes等于itemsize和size的乘积大小。

## 2.2.2 数组索引：获取单个元素

In [31]:
x1

array([5, 0, 3, 3, 7, 9])

In [32]:
x1[0]

5

In [33]:
x1[4]

7

In [34]:
x1[-1]

9

In [35]:
x1[-2]

7

In [36]:
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [37]:
x2[0, 0]

3

In [38]:
x2[2, -1]

7

In [39]:
x2[0, 0] = 12
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [40]:
x1[0] = 3.14159 # 这将被截断，因为numpy的数组是固定类型的
x1

array([3, 0, 3, 3, 7, 9])

## 2.2.3 数组切片：获取子数组

1. 一维子数组

In [41]:
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [42]:
x[:5] # 前五个元素

array([0, 1, 2, 3, 4])

In [43]:
x[5:] # 从索引5开始的所有元素

array([5, 6, 7, 8, 9])

In [44]:
x[4:7] # 中间的子数组

array([4, 5, 6])

In [45]:
x[::2] # 每隔一个元素

array([0, 2, 4, 6, 8])

In [46]:
x[1::2] # 每隔一个元素，索引从1开始

array([1, 3, 5, 7, 9])

In [47]:
x[::-1] # 所有元素，逆序的

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [48]:
x[5::-2] # 从索引5开始每隔一个元素逆序

array([5, 3, 1])

2. 多维子数组

In [49]:
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [50]:
x2[:2, :3] # 两行，三列

array([[12,  5,  2],
       [ 7,  6,  8]])

In [51]:
x2[:3, ::2] # 所有行，每隔一列

array([[12,  2],
       [ 7,  8],
       [ 1,  7]])

In [52]:
x2[::-1, ::-1] # 逆序

array([[ 7,  7,  6,  1],
       [ 8,  8,  6,  7],
       [ 4,  2,  5, 12]])

3. 获取数组的行和列

In [53]:
x2[:, 0] # x2的第一列

array([12,  7,  1])

In [54]:
x2[0, :] # x2的第一行

array([12,  5,  2,  4])

In [55]:
x2[0] # 省略空的切片，效果与上一行相同

array([12,  5,  2,  4])

4. 非副本视图的子数组

In [56]:
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [57]:
x2_sub = x2[:2, :2]
x2_sub

array([[12,  5],
       [ 7,  6]])

In [58]:
x2_sub[0, 0] = 99
x2_sub

array([[99,  5],
       [ 7,  6]])

In [59]:
x2

array([[99,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

可以看到，修改子数组后，原始数组也被修改了。这是因为NumPy数组切片返回的是原数组数据的视图，而不是副本。这种默认行为非常有用：处理非常大的数据集时，可以获取或处理这些数据集的片段，而不用复制底层的数据缓存。

5. 创建数组的副本

In [60]:
x2_sub_copy = x2[:2, :2].copy()
x2_sub_copy

array([[99,  5],
       [ 7,  6]])

In [61]:
x2_sub_copy[0, 0] = 42
x2_sub_copy

array([[42,  5],
       [ 7,  6]])

In [62]:
x2

array([[99,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

## 2.2.4 数组的变形

In [63]:
grid = np.arange(1, 10).reshape((3, 3)) # reshape在可能的情况下返回非副本视图，此时修改元素会影响原始数据
grid

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [64]:
x = np.array([1, 2, 3])
x.reshape((1, 3))

array([[1, 2, 3]])

In [65]:
x[np.newaxis, :]

array([[1, 2, 3]])

In [66]:
x.reshape((3, 1))

array([[1],
       [2],
       [3]])

In [67]:
x[:, np.newaxis]

array([[1],
       [2],
       [3]])

## 2.2.5 数组拼接和分裂

1. 数组的拼接

In [68]:
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
np.concatenate([x, y])

array([1, 2, 3, 3, 2, 1])

In [69]:
z = [99, 99, 99]
np.concatenate([x, y, z])

array([ 1,  2,  3,  3,  2,  1, 99, 99, 99])

In [70]:
grid = np.array([[1, 2, 3], [4, 5, 6]])
np.concatenate([grid, grid])

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [71]:
np.concatenate([grid, grid], axis=1)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [72]:
x = np.array([1, 2, 3])
grid = np.array([[9, 8, 7], [6, 5, 4]])
np.vstack([x, grid])

array([[1, 2, 3],
       [9, 8, 7],
       [6, 5, 4]])

In [73]:
y = np.array([[99], [99]])
np.hstack([grid, y])

array([[ 9,  8,  7, 99],
       [ 6,  5,  4, 99]])

2. 数组的分裂

In [74]:
x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3 = np.split(x, [3, 5])
x1, x2, x3

(array([1, 2, 3]), array([99, 99]), array([3, 2, 1]))

In [75]:
grid = np.arange(16).reshape((4, 4))
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [76]:
upper, lower = np.vsplit(grid, [2])
upper, lower

(array([[0, 1, 2, 3],
        [4, 5, 6, 7]]), array([[ 8,  9, 10, 11],
        [12, 13, 14, 15]]))

In [77]:
left, right = np.hsplit(grid, [2])
left, right

(array([[ 0,  1],
        [ 4,  5],
        [ 8,  9],
        [12, 13]]), array([[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]]))

# 2.3 NumPy数组的计算：通用函数

## 2.3.1 缓慢的循环

对数组的每个元素求倒数

In [78]:
import numpy as np
np.random.seed(0)

def compute_reciprocals(values):
    """Return a new array holding ``1.0 / v`` for every ``v`` in *values*.

    The result is allocated with ``np.empty(len(values))`` and filled one
    element at a time by an explicit Python-level loop.
    """
    reciprocals = np.empty(len(values))
    for position, value in enumerate(values):
        reciprocals[position] = 1.0 / value
    return reciprocals

values = np.random.randint(1, 10, size=5)
compute_reciprocals(values)

array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])

In [81]:
big_array = np.random.randint(1, 100, size=1000000)
%timeit compute_reciprocals(big_array)

3.18 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


这里处理的瓶颈并不是运算本身，而是CPython在每次循环时必须做数据类型的检查和函数的调度。每次进行倒数运算时，Python首先检查对象的类型，并且动态查找可以使用该数据类型的正确函数。

## 2.3.2 通用函数介绍

NumPy为很多类型的操作提供了非常方便的、静态类型的、可编译程序的接口，也被称作**向量**操作。

In [82]:
1.0/values

array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])

In [83]:
%timeit (1.0 / big_array)

1.19 ms ± 15.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


NumPy中的向量操作是通过**通用函数**实现的。前面我们看过了标量和数组的运算，但是也可以对两个数组进行运算。

In [84]:
np.arange(5) / np.arange(1, 6)

array([0.        , 0.5       , 0.66666667, 0.75      , 0.8       ])

多维数组的运算

In [85]:
x = np.arange(9).reshape((3, 3))
2 ** x

array([[  1,   2,   4],
       [  8,  16,  32],
       [ 64, 128, 256]])

## 2.3.3 探索NumPy的通用函数

1. 数组的运算

In [86]:
x = np.arange(4)

In [87]:
x

array([0, 1, 2, 3])

In [88]:
x + 5

array([5, 6, 7, 8])

In [89]:
x - 5

array([-5, -4, -3, -2])

In [90]:
x * 2

array([0, 2, 4, 6])

In [91]:
x / 2

array([0. , 0.5, 1. , 1.5])

In [92]:
x // 2

array([0, 0, 1, 1])

In [93]:
-x

array([ 0, -1, -2, -3])

In [94]:
x ** 2

array([0, 1, 4, 9])

In [95]:
x % 2

array([0, 1, 0, 1])

In [96]:
-(0.5 * x + 1) ** 2

array([-1.  , -2.25, -4.  , -6.25])

In [97]:
np.add(x, 2)

array([2, 3, 4, 5])

2. 绝对值

In [98]:
x = np.array([-2, -1, 0, 1, 2])
abs(x)

array([2, 1, 0, 1, 2])

In [99]:
np.absolute(x)

array([2, 1, 0, 1, 2])

In [100]:
np.abs(x)

array([2, 1, 0, 1, 2])

In [101]:
# 处理复数（返回复数的模）
x = np.array([3 - 4j, 4 - 3j, 2 + 0j, 0 + 1j])
np.abs(x)

array([5., 5., 2., 1.])

3. 三角函数

In [102]:
theta = np.linspace(0, np.pi, 3)
theta

array([0.        , 1.57079633, 3.14159265])

In [103]:
np.sin(theta)

array([0.0000000e+00, 1.0000000e+00, 1.2246468e-16])

In [105]:
np.cos(theta)

array([ 1.000000e+00,  6.123234e-17, -1.000000e+00])

In [106]:
np.tan(theta)

array([ 0.00000000e+00,  1.63312394e+16, -1.22464680e-16])

In [107]:
x = [-1, 0, 1]
x

[-1, 0, 1]

In [108]:
np.arcsin(x)

array([-1.57079633,  0.        ,  1.57079633])

In [109]:
np.arccos(x)

array([3.14159265, 1.57079633, 0.        ])

In [110]:
np.arctan(x)

array([-0.78539816,  0.        ,  0.78539816])

4. 指数和对数

In [111]:
x = [1, 2, 3]
x

[1, 2, 3]

In [112]:
np.exp(x)

array([ 2.71828183,  7.3890561 , 20.08553692])

In [113]:
np.exp2(x)

array([2., 4., 8.])

In [114]:
np.power(3, x)

array([ 3,  9, 27])

In [115]:
x = [1, 2, 4, 10]
x

[1, 2, 4, 10]

In [116]:
np.log(x)

array([0.        , 0.69314718, 1.38629436, 2.30258509])

In [117]:
np.log2(x)

array([0.        , 1.        , 2.        , 3.32192809])

In [118]:
np.log10(x)

array([0.        , 0.30103   , 0.60205999, 1.        ])

In [119]:
# 还有一些特殊版本的函数，对于非常小的输入值可以保持较好的精度
x = [0, 0.001, 0.01, 0.1]
np.expm1(x) # exp(x) - 1

array([0.        , 0.0010005 , 0.01005017, 0.10517092])

In [120]:
np.log1p(x) # log(1 + x)

array([0.        , 0.0009995 , 0.00995033, 0.09531018])

5. 专用的通用函数

In [121]:
from scipy import special
x = [1, 5, 10]
special.gamma(x)

array([1.0000e+00, 2.4000e+01, 3.6288e+05])

In [122]:
special.gammaln(x)

array([ 0.        ,  3.17805383, 12.80182748])

In [123]:
special.beta(x, 2)

array([0.5       , 0.03333333, 0.00909091])

In [124]:
x = np.array([0, 0.3, 0.7, 1.0])
special.erf(x)

array([0.        , 0.32862676, 0.67780119, 0.84270079])

In [125]:
special.erfc(x)

array([1.        , 0.67137324, 0.32219881, 0.15729921])

In [126]:
special.erfinv(x)

array([0.        , 0.27246271, 0.73286908,        inf])

## 2.3.4 高级的通用函数特性

1. 指定输出

In [127]:
x = np.arange(5)
y = np.arange(5)
np.multiply(x, 10, out=y)
y

array([ 0, 10, 20, 30, 40])

In [128]:
y = np.zeros(10)
np.power(2, x, out=y[::2])
y

array([ 1.,  0.,  2.,  0.,  4.,  0.,  8.,  0., 16.,  0.])

2. 聚合

In [129]:
x = np.arange(1, 6)
np.add.reduce(x)

15

In [130]:
np.multiply.reduce(x)

120

In [131]:
np.add.accumulate(x)

array([ 1,  3,  6, 10, 15])

In [132]:
np.multiply.accumulate(x)

array([  1,   2,   6,  24, 120])

3. 外积

In [133]:
x = np.arange(1, 6)
np.multiply.outer(x, x)

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [None]:
# 聚合：最小值、最大值和其他值