In [1]:
import numpy as np

## 小结
1. NAN: Not A Number
    1. 不是一个数字，但是属于浮点型
    2. 与任意值进行运算的结果都是NAN
    3. NAN != NAN
    4. 通过np.isnan来判断某个值是不是nan
2. INF：Infinity
    * 无穷大，在除数为0的时候会出现
3. 处理值的方式
    1. 删除值/删除行
    2. 替换值

In [2]:
# 构造3行5列的数组，元素为0到9之间的随机整数
data = np.random.randint(0, 10, size=(3, 5))
print(data)

[[3 3 3 0 7]
 [8 0 2 4 9]
 [8 6 8 5 9]]


In [3]:
# 把a中第1行第1列的元素换成NAN
data[1, 1] = np.nan

ValueError: cannot convert float NaN to integer

In [4]:
# 将data的数据类型转换为浮点型
data = data.astype(np.float)
data[1, 1] = np.nan
print(data)

[[ 3.  3.  3.  0.  7.]
 [ 8. nan  2.  4.  9.]
 [ 8.  6.  8.  5.  9.]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data = data.astype(np.float)


In [5]:
print(data/0)

[[inf inf inf nan inf]
 [inf nan inf inf inf]
 [inf inf inf inf inf]]


  print(data/0)
  print(data/0)


In [6]:
# nan值不能比
np.nan == np.nan

False

In [7]:
data

array([[ 3.,  3.,  3.,  0.,  7.],
       [ 8., nan,  2.,  4.,  9.],
       [ 8.,  6.,  8.,  5.,  9.]])

In [8]:
# 识别data中的nan元素
np.isnan(data)

array([[False, False, False, False, False],
       [False,  True, False, False, False],
       [False, False, False, False, False]])

In [10]:
# 删除data中nan元素
data[~np.isnan(data)]

array([3., 3., 3., 0., 7., 8., 2., 4., 9., 8., 6., 8., 5., 9.])

In [14]:
# 获取data中nan元素的位置信息
np.where(np.isnan(data))

(array([1]), array([1]))

In [15]:
# 获取data中nan元素的行下标信息
np.where(np.isnan(data))[0]

array([1])

In [16]:
# 获取data中nan元素的行下标信息
lines = np.where(np.isnan(data))[0]
# 删除data中有nan值的行
np.delete(data, lines, axis=0)

array([[3., 3., 3., 0., 7.],
       [8., 6., 8., 5., 9.]])

In [17]:
# 读取nan_scores.csv到scores里
scores = np.loadtxt("data/nan_scores.csv", delimiter=",", skiprows=1)

ValueError: could not convert string to float: ''

In [30]:
# 读取nan_scores.csv到scores里
scores = np.loadtxt("data/nan_scores.csv", delimiter=",", skiprows=1, dtype=np.str)
# 查看
scores

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  scores = np.loadtxt("data/nan_scores.csv", delimiter=",", skiprows=1, dtype=np.str)


array([['59', '89'],
       ['90', '32'],
       ['78', '45.5'],
       ['34', ''],
       ['', '56'],
       ['23', '56']], dtype='<U4')

In [31]:
# 把空字符串转换成nan
scores[scores == ""] = np.nan
scores

array([['59', '89'],
       ['90', '32'],
       ['78', '45.5'],
       ['34', 'nan'],
       ['nan', '56'],
       ['23', '56']], dtype='<U4')

In [32]:
# 数组dtype都换成浮点型
scores1 = scores.astype (np.float)
scores1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  scores1 = scores.astype (np.float)


array([[59. , 89. ],
       [90. , 32. ],
       [78. , 45.5],
       [34. ,  nan],
       [ nan, 56. ],
       [23. , 56. ]])

In [33]:
# 把scores1中的nan替换成0
scores1[np.isnan(scores1)] = 0
scores1

array([[59. , 89. ],
       [90. , 32. ],
       [78. , 45.5],
       [34. ,  0. ],
       [ 0. , 56. ],
       [23. , 56. ]])

In [34]:
scores1.sum(axis=1)

array([148. , 122. , 123.5,  34. ,  56. ,  79. ])

In [37]:
scores2 = scores.astype (np.float)
scores2

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  scores2 = scores.astype (np.float)


array([[59. , 89. ],
       [90. , 32. ],
       [78. , 45.5],
       [34. ,  nan],
       [ nan, 56. ],
       [23. , 56. ]])

In [43]:
# 每门课把nan替换成均值
for x in range(scores2.shape[1]):
    col = scores2[:,x] # 取第x列的所有元素
    mean = col[~np.isnan(col)].mean() # 找到当前列的所有非nan值，并求它们的均值
    col[np.isnan(col)] = mean # 用求出来的均值替换nan值
scores2

array([[59. , 89. ],
       [90. , 32. ],
       [78. , 45.5],
       [34. , 55.7],
       [56.8, 56. ],
       [23. , 56. ]])