# NumPy基本操作 --- sklearn

## 1. 创建数组

In [1]:
# 导入 numpy 模块
import numpy as np

# 将列表转换为 Numpy 数组
a = np.array([1, 2, 3])
a

array([1, 2, 3])

In [2]:
# 产看数组对象类型
a.dtype

dtype('int64')

In [3]:
# 将浮点数列转为 Numpy 数组
b = np.array([1.2, 2.3, 3.4])
b.dtype

dtype('float64')

In [4]:
# 也可转换多维数组
c = np.array([[1, 2, 3], [4, 5, 6]])
c

array([[1, 2, 3],
       [4, 5, 6]])

In [5]:
# 生成 2x3 的全 0 数组
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [6]:
# 生成 3x4 的全 1 数组，并指定数据类型为 np.int16
np.ones((3, 4), dtype=np.int16)

array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int16)

In [7]:
# 生成 大于等于10 小于30 步长为5 的数组
np.arange(10, 30, 5)

array([10, 15, 20, 25])

In [8]:
# 生成 3x2 符合 (0, 1)均匀分布的随机数组
np.random.rand(3, 2)

array([[0.45667279, 0.00681609],
       [0.93026545, 0.16368408],
       [0.98382862, 0.10588511]])

In [9]:
# 生成 0~2 范围内 整形 且长度为 5 的数组
np.random.randint(3, size=5)

array([0, 1, 2, 1, 2])

In [10]:
# 生成一个符合标准正太分布的数组
np.random.randn(3)

array([ 0.38150011,  0.14636759, -1.10300297])

## 2. 数组的索引与切片

In [11]:
a = np.arange(10)**2
a

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

In [14]:
# 索引
print(a[2])

# 切片 ---> 通过 -1 从后往前达到翻转的效果
print(a[1:4], "\n", a[::-1])

4
[1 4 9] 
 [81 64 49 36 25 16  9  4  1  0]


In [15]:
b = np.random.random((3, 3))
b

array([[0.71744952, 0.4112284 , 0.22662955],
       [0.42898677, 0.55040553, 0.04591322],
       [0.50311722, 0.50685634, 0.29932366]])

In [17]:
# 获取 第2行 第3列 的数据
print(b[1, 2])

# 获取第二列数据
print(b[:, 1])

# 获取第3列的前两行数据
print(b[:2, 2])

0.0459132237812695
[0.4112284  0.55040553 0.50685634]
[0.22662955 0.04591322]


## 3. 数组的基础运算

In [19]:
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])

In [20]:
# 矩阵元素乘积（按元素相乘）
A * B

array([[2, 0],
       [0, 4]])

In [21]:
# 矩阵的点积（行乘列）
A.dot(B)

array([[5, 4],
       [3, 4]])

In [22]:
# 矩阵求 逆
np.linalg.inv(A)

array([[ 1., -1.],
       [ 0.,  1.]])

In [23]:
# 矩阵求行列式
np.linalg.det(A)

1.0

## 4. 数组维度变换

In [24]:
a = np.floor(10 * np.random.random((3, 4)))
a

array([[5., 2., 3., 4.],
       [2., 0., 8., 4.],
       [9., 2., 1., 2.]])

In [26]:
# 查看数组维度
print("shape: ", a.shape)

# 数组拉平
print("ravel: ", a.ravel())

# 将数组改变形状为 2行6列
print("reshape: ", a.reshape(2, 6))

# 数组的转置
print("T: ", a.T)

shape:  (3, 4)
ravel:  [5. 2. 3. 4. 2. 0. 8. 4. 9. 2. 1. 2.]
reshape:  [[5. 2. 3. 4. 2. 0.]
 [8. 4. 9. 2. 1. 2.]]
T:  [[5. 2. 9.]
 [2. 0. 2.]
 [3. 8. 1.]
 [4. 4. 2.]]


In [27]:
# 改变维度时可以通过 -1 自动计算该维度的数值
a.reshape(3, -1)

array([[5., 2., 3., 4.],
       [2., 0., 8., 4.],
       [9., 2., 1., 2.]])

## 5. 数组的合并与切分

In [30]:
# 按行合并
np.hstack((A, B))

array([[1, 1, 2, 0],
       [0, 1, 3, 4]])

In [31]:
# 按列合并
np.vstack((A, B))

array([[1, 1],
       [0, 1],
       [2, 0],
       [3, 4]])

In [33]:
C = np.arange(16).reshape(4, 4)

# 竖着切成两个数组
np.hsplit(C, 2)

[array([[ 0,  1],
        [ 4,  5],
        [ 8,  9],
        [12, 13]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]])]

In [34]:
# 横着切成两个数组
np.vsplit(C, 2)

[array([[0, 1, 2, 3],
        [4, 5, 6, 7]]),
 array([[ 8,  9, 10, 11],
        [12, 13, 14, 15]])]

## 6. sklearn实现对数几率回归

In [37]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [38]:
# 导入数据
X, y = load_iris(return_X_y=True)

In [41]:
X[0], y[0]

(array([5.1, 3.5, 1.4, 0.2]), 0)

In [43]:
# 拟合模型
clf = LogisticRegression(random_state=0).fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# 以上只是一个警告
# 预测前两行
clf.predict(X[:2, :])

array([0, 0])

In [45]:
# 概率预测
clf.predict_proba(X[:2, :])

array([[9.81799409e-01, 1.82005762e-02, 1.43509289e-08],
       [9.71722782e-01, 2.82771875e-02, 3.00214335e-08]])

In [46]:
# 模型分类准确率
clf.score(X, y)

0.9733333333333334

## 7. PPT里的习题

y_test=[1,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1,1,1,0,0,0,1,0]
y_prd=[1,0,1,1,0,1,1,0,1,0,0,0,1,0,1,0,1,1,0,1,1,1,1,1,1,0,1,0,1,1]

使用 sklearn 计算 Accuracy、Precision、Recall、Specificity、F1 score和AUC

In [48]:
y_test = np.array([1,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1,1,1,0,0,0,1,0])
y_prd = np.array([1,0,1,1,0,1,1,0,1,0,0,0,1,0,1,0,1,1,0,1,1,1,1,1,1,0,1,0,1,1])

In [49]:
# 生成混淆矩阵 
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_prd)

array([[ 9,  6],
       [ 2, 13]])

In [51]:
# 计算准确率
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_prd)  # (9+13)/(9+6+2+13)

0.7333333333333333

In [56]:
# 计算精确率
from sklearn.metrics import precision_score
precision_score(y_test, y_prd, pos_label=1, average='binary')  # 9/(9+6)  只看二分类问题中标签为 1 的 准确率

0.6842105263157895

In [57]:
# 计算召回率
from sklearn.metrics import recall_score
recall_score(y_test, y_prd)  # 9/(9+2)

0.8666666666666667

In [58]:
# 计算 F1
from sklearn.metrics import f1_score
f1_score(y_test, y_prd)  # (2*9)/(2*9+6+2)

0.7647058823529413

In [60]:
# AUC
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prd)

0.7333333333333333