## [101 NumPy Exercises for Data Analysis (Python)](https://www.machinelearningplus.com/python/101-numpy-exercises-python/)

In [1]:
import numpy as np

### 1. Import numpy as np and see the version
Q. Import numpy as np and print the version number.

In [20]:
np.__version__

'1.17.4'

### 2. How to create a 1D array?
Q. Create a 1D array of numbers from 0 to 9

In [21]:
# np.arange(0, 10)

np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 3. How to create a boolean array?

Q. Create a 3×3 numpy array of all True’s

In [22]:
np.full((3, 3), True, dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

### 4. How to extract items that satisfy a given condition from 1D array?

Q. Extract all odd numbers from arr

In [29]:
arr = np.arange(10)
arr % 2 == 1

array([False,  True, False,  True, False,  True, False,  True, False,
        True])

In [30]:
arr[arr % 2 == 1]

array([1, 3, 5, 7, 9])

### 5. How to replace items that satisfy a condition with another value in numpy array?
Q. Replace all odd numbers in arr with -1

In [32]:
arr = np.arange(10)
arr[arr % 2 == 1] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

### 6. How to replace items that satisfy a condition without affecting the original array?
Q. Replace all odd numbers in arr with -1 without changing arr

In [35]:
arr = np.arange(10)
out = np.where(arr % 2 == 1, -1, arr)
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

### 7. How to reshape an array?

Q. Convert a 1D array to a 2D array with 2 rows

In [38]:
np.arange(10).reshape(2, 5)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [39]:
np.arange(10).reshape(2, -1)  # setting to -1 automatically decides the number of cols

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

### 8. How to stack two arrays vertically?

Q. Stack arrays a and b vertically

In [42]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

In [47]:
# method 1:

np.vstack([a, b])  # 坚直 
# np.hstack([a, b]) # 水平

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [48]:
# method 2:

np.concatenate([a, b], axis=0)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [49]:
# method 3:
np.r_[a, b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

### 9. How to stack two arrays horizontally?

Q. Stack the arrays a and b horizontally.

In [50]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

In [54]:
# method 1:

np.hstack([a, b])

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [55]:
# method 2:

np.concatenate([a, b], axis=1)

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [56]:
# method 3:

np.c_[a, b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

### 10. How to generate custom sequences in numpy without hardcoding?


Q. Create the following pattern without hardcoding. Use only numpy functions and the below input array a.

In [59]:
a = np.array([1, 2, 3])
a

array([1, 2, 3])

In [63]:
np.repeat(a, 3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

In [64]:
np.tile(a, 3)

array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [65]:
np.r_[np.repeat(a, 3), np.tile(a, 3)]

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

### 11. How to get the common items between two python numpy arrays?

Q. Get the common items between a and b

In [66]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

In [72]:
# intersect : 相交

np.intersect1d(a, b)

array([2, 4])

### 12. How to remove from one array those items that exist in another?


Q. From array a remove all items present in array b

In [73]:
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

In [77]:
a[a != np.intersect1d(a, b)]

array([1, 2, 3, 4])

In [78]:
np.setdiff1d(a, b)

array([1, 2, 3, 4])

### 13. How to get the positions where elements of two arrays match?

Q. Get the positions where elements of a and b match

In [79]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

In [81]:
np.where(a == b)

(array([1, 3, 5, 7], dtype=int64),)

### 14. How to extract all numbers between a given range from a numpy array?


Q. Get all items between 5 and 10 from a.

In [82]:
a = np.array([2, 6, 1, 9, 10, 3, 27])

In [95]:
index = np.where((a >= 5) & (a <= 10))
a[index]

array([ 6,  9, 10])

In [101]:
# np.logical_and

index = np.where(np.logical_and(a >= 5, a <= 10))
a[index]

array([ 6,  9, 10])

In [104]:
a[(a >= 5) & (a <= 10)]

array([ 6,  9, 10])

### 15. How to make a python function that handles scalars to work on numpy arrays?

Q. Convert the function maxx that works on two scalars, to work on two arrays.

In [106]:
def maxx(x, y):
    """Get the maximum of two items"""
    if x >= y:
        return x
    else:
        return y

In [107]:
a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

In [111]:
# method 1: 

for i, j in zip(a, b):
    print(max(i, j), end=' ')

6 7 9 8 9 7 5 

In [113]:
# method 2:

pair_max = np.vectorize(maxx, otypes=[float])

pair_max(a, b)

array([6., 7., 9., 8., 9., 7., 5.])

### 16. How to swap two columns in a 2d numpy array?


Q. Swap columns 1 and 2 in the array arr.

In [114]:
arr = np.arange(9).reshape(3, 3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [117]:
arr[ : , [1, 0, 2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [122]:
arr[ : , [2, 0]]  # 显示第2行与第0行

array([[2, 0],
       [5, 3],
       [8, 6]])

### 17. How to swap two rows in a 2d numpy array?


Q. Swap rows 1 and 2 in the array arr:

In [123]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [127]:
# method 1:

arr[[2, 1, 0]]

# arr[[2, 1, 0], : ]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [129]:
arr[[2, 1, 0], : ]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

### 18. How to reverse the rows of a 2D array?


Q. Reverse the rows of a 2D array arr.

In [130]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [131]:
arr[ : : -1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

### 19. How to reverse the columns of a 2D array?


Q. Reverse the columns of a 2D array arr.

In [132]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [134]:
arr[ : , [2, 1, 0]]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [136]:
arr[ : , : : -1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

### 20. How to create a 2D array containing random floats between 5 and 10?


Q. Create a 2D array of shape 5x3 to contain random decimal numbers between 5 and 10.

In [156]:
# method 1: 

rand_arr = np.random.randint(5, 10, (5, 3)) + np.random.random((5, 3))
rand_arr

array([[7.95864993, 6.88336004, 7.15552662],
       [9.16694863, 5.62140361, 9.34176571],
       [5.35172687, 6.45319133, 8.223013  ],
       [9.40752719, 5.5692627 , 7.91106618],
       [6.96340278, 5.55548901, 9.38178456]])

In [158]:
# method 2:

np.random.uniform(5, 10, (5, 3))

array([[7.2507635 , 9.21156897, 6.14371406],
       [7.30644138, 9.15348389, 9.06296335],
       [7.46612705, 9.75773713, 9.22488925],
       [6.12535014, 6.20858407, 6.53439729],
       [7.88311319, 8.22941359, 6.7181375 ]])

### 21. How to print only 3 decimal places in python numpy array?


Q. Print or show only 3 decimal places of the numpy array rand_arr.

In [159]:
rand_arr = np.random.random((5, 3))

In [163]:
rand_arr

array([[0.899, 0.609, 0.097],
       [0.235, 0.321, 0.856],
       [0.349, 0.251, 0.801],
       [0.657, 0.561, 0.708],
       [0.955, 0.827, 0.076]])

In [165]:
np.set_printoptions(precision=3)
rand_arr[ : 4]

array([[0.899, 0.609, 0.097],
       [0.235, 0.321, 0.856],
       [0.349, 0.251, 0.801],
       [0.657, 0.561, 0.708]])

### 22. How to pretty print a numpy array by suppressing the scientific notation (like 1e10)?


Q. Pretty print rand_arr by suppressing the scientific notation (like 1e10)

In [171]:
# reset printoptions to default

np.set_printoptions(suppress=False)

# Create the random array

np.random.seed(100)
rand_arr = np.random.random([3, 3]) / 1e3
rand_arr

array([[5.434e-04, 2.784e-04, 4.245e-04],
       [8.448e-04, 4.719e-06, 1.216e-04],
       [6.707e-04, 8.259e-04, 1.367e-04]])

In [172]:
np.set_printoptions(suppress=True, precision=6)  # precision is optional
rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

### 23. How to limit the number of items printed in output of numpy array?


Q. Limit the number of items printed in python numpy array a to a maximum of 6 elements.

In [173]:
a = np.arange(15)

In [175]:
np.set_printoptions(threshold=6)
a

array([ 0,  1,  2, ..., 12, 13, 14])

### 24. How to print the full numpy array without truncating


Q. Print the full numpy array a without truncating.

In [176]:
np.set_printoptions(threshold=6)
a = np.arange(15)
a

array([ 0,  1,  2, ..., 12, 13, 14])

### 25. How to import a dataset with numbers and texts keeping the text intact in python numpy?


Q. Import the iris dataset keeping the text intact.

In [182]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Print the first 3 rows
iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

## 26. How to extract a particular column from 1D array of tuples?

Q. Extract the text column species from the 1D iris imported in previous question.

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_ld = np.genfromtxt(url, delimiter=',', dtype=None, encoding='utf-8')

In [22]:
iris_ld.shape

(150,)

In [26]:
# Soltuion
species = np.array([row[4] for row in iris_ld])
species[ : 5]

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

## 27. How to convert a 1d array of tuples to a 2d numpy array?

Q. Convert the 1D iris to 2D array iris_2d by omitting the species text field.

In [33]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding='utf-8')

In [38]:
# Method 1 : Convert each row to a list and get the first 4 items

iris_2d = np.array([row.tolist()[ : 4] for row in iris_ld])  # 截取前 4 个元素
iris_2d[ : 4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [40]:
# Method 2: Import only the first 4 columns from source url

iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])
iris_2d[ : 4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

## 28. How to compute the mean, median, standard deviation of a numpy array?


Q. Find the mean, median, standard deviation of iris's `sepallength` (1st column)

In [46]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object', encoding='utf-8')

In [51]:
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
sepallength

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

In [54]:
mu, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(f"mu: {mu}, med: {med}, sd: {sd}")

mu: 5.843333333333334, med: 5.8, sd: 0.8253012917851409


## 29. How to normalize an array so the values range exactly between 0 and 1?


Q. Create a normalized form of iris's sepallength whose values range exactly between 0 and 1 so that the minimum has value 0 and maximum has value 1.

In [55]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

In [61]:
# Method 1

Smax, Smin = sepallength.max(), sepallength.min()
S = (sepallength - Smin) / (Smax - Smin)
S

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444,
       0.30555556, 0.08333333, 0.19444444, 0.02777778, 0.16666667,
       0.30555556, 0.13888889, 0.13888889, 0.        , 0.41666667,
       0.38888889, 0.30555556, 0.22222222, 0.38888889, 0.22222222,
       0.30555556, 0.22222222, 0.08333333, 0.22222222, 0.13888889,
       0.19444444, 0.19444444, 0.25      , 0.25      , 0.11111111,
       0.13888889, 0.30555556, 0.25      , 0.33333333, 0.16666667,
       0.19444444, 0.33333333, 0.16666667, 0.02777778, 0.22222222,
       0.19444444, 0.05555556, 0.02777778, 0.19444444, 0.22222222,
       0.13888889, 0.22222222, 0.08333333, 0.27777778, 0.19444444,
       0.75      , 0.58333333, 0.72222222, 0.33333333, 0.61111111,
       0.38888889, 0.55555556, 0.16666667, 0.63888889, 0.25      ,
       0.19444444, 0.44444444, 0.47222222, 0.5       , 0.36111111,
       0.66666667, 0.36111111, 0.41666667, 0.52777778, 0.36111111,
       0.44444444, 0.5       , 0.55555556, 0.5       , 0.58333

In [63]:
# Method 2 ：

S = (sepallength - Smin) / sepallength.ptp()
S

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444,
       0.30555556, 0.08333333, 0.19444444, 0.02777778, 0.16666667,
       0.30555556, 0.13888889, 0.13888889, 0.        , 0.41666667,
       0.38888889, 0.30555556, 0.22222222, 0.38888889, 0.22222222,
       0.30555556, 0.22222222, 0.08333333, 0.22222222, 0.13888889,
       0.19444444, 0.19444444, 0.25      , 0.25      , 0.11111111,
       0.13888889, 0.30555556, 0.25      , 0.33333333, 0.16666667,
       0.19444444, 0.33333333, 0.16666667, 0.02777778, 0.22222222,
       0.19444444, 0.05555556, 0.02777778, 0.19444444, 0.22222222,
       0.13888889, 0.22222222, 0.08333333, 0.27777778, 0.19444444,
       0.75      , 0.58333333, 0.72222222, 0.33333333, 0.61111111,
       0.38888889, 0.55555556, 0.16666667, 0.63888889, 0.25      ,
       0.19444444, 0.44444444, 0.47222222, 0.5       , 0.36111111,
       0.66666667, 0.36111111, 0.41666667, 0.52777778, 0.36111111,
       0.44444444, 0.5       , 0.55555556, 0.5       , 0.58333

## 30. How to compute the softmax score?

Q. Compute the softmax score of sepallength.

In [68]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

In [69]:
sepallength

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

In [71]:
def softmax(x):
    """Compute softmax values for each sets of scores in x.
    https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

softmax(sepallength)

array([0.00221959, 0.00181724, 0.00148783, 0.00134625, 0.00200836,
       0.00299613, 0.00134625, 0.00200836, 0.00110221, 0.00181724,
       0.00299613, 0.00164431, 0.00164431, 0.00099732, 0.0044697 ,
       0.00404435, 0.00299613, 0.00221959, 0.00404435, 0.00221959,
       0.00299613, 0.00221959, 0.00134625, 0.00221959, 0.00164431,
       0.00200836, 0.00200836, 0.00245302, 0.00245302, 0.00148783,
       0.00164431, 0.00299613, 0.00245302, 0.00331123, 0.00181724,
       0.00200836, 0.00331123, 0.00181724, 0.00110221, 0.00221959,
       0.00200836, 0.00121813, 0.00110221, 0.00200836, 0.00221959,
       0.00164431, 0.00221959, 0.00134625, 0.00271101, 0.00200836,
       0.01483991, 0.00814432, 0.01342771, 0.00331123, 0.00900086,
       0.00404435, 0.00736928, 0.00181724, 0.00994749, 0.00245302,
       0.00200836, 0.00493978, 0.0054593 , 0.00603346, 0.00365948,
       0.01099368, 0.00365948, 0.0044697 , 0.006668  , 0.00365948,
       0.00493978, 0.00603346, 0.00736928, 0.00603346, 0.00814

## 31. How to find the percentile scores of a numpy array?

Q. Find the 5th and 95th percentile of iris's sepallength

In [72]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

In [73]:
np.percentile(sepallength, q=[5, 95])

array([4.6  , 7.255])

## 32. How to insert values at random positions in an array?

Q. Insert **np.nan** values at 20 random positions in iris_2d dataset

In [75]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

In [91]:
# Method 1:

i, j = np.where(iris_2d)

# i, j contain the row numbers and column numbers of 600 elements of iris_x
np.random.seed(100)
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan

# print first 10 rows

iris_2d[ : 10]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', nan, b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa']], dtype=object)

In [92]:
# Method 2 :

np.random.seed(100)
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# print first 10 rows

iris_2d[ : 10]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', nan, b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa']], dtype=object)

## 33. How to find the position of missing values in numpy array?


Q. Find the number and position of missing values in iris_2d's sepallength (1st column)

In [93]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float')
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

In [97]:
# Solution :
print("Number of missing values:", np.isnan(iris_2d[ : , 0]).sum())
print("Position of missing values:", np.where(np.isnan(iris_2d[ : , 0])))

Number of missing values: 5
Position of missing values: (array([ 38,  80, 106, 113, 121]),)


## 34. How to filter a numpy array based on two or more conditions?

Q. Filter the rows of iris_2d that has petallength (3rd column) > 1.5 and sepallength (1st column) < 5.0

In [98]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

In [108]:
condition = (iris_2d[ : , 2] > 1.5) & (iris_2d[ : , 0] < 5.0)
iris_2d[condition]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

## 35. How to drop rows that contain a missing value from a numpy array?

Q. Select the rows of iris_2d that does not have any nan value.

In [116]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

In [119]:
# Solution
# No direct numpy function for this
# Method 1 :

any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[any_nan_in_row][ : 5]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [4.6, 3.4, 1.4, 0.3]])

In [120]:
# Method 2 :

iris_2d[np.sum(np.isnan(iris_2d), axis = 1) == 0][ : 5]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [4.6, 3.4, 1.4, 0.3]])

## 36. How to find the correlation between two columns of a numpy array?

Q. Find the correlation between SepalLength(1st column) and PetalLength(3rd column) in iris_2d

In [129]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

In [130]:
# Solution 1 :

np.corrcoef(iris[ : , 0], iris[ : , 2])[0, 1]

0.8717541573048718

In [131]:
# Solution 2 :

from scipy.stats import pearsonr

corr, p_value = pearsonr(iris[ : , 0], iris[ : , 2])
corr

0.8717541573048712

## 7. How to find if a given array has any null values?


Q. Find out if iris_2d has any missing values.

In [132]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

In [133]:
np.isnan(iris_2d).any()

False

## 38. How to replace all missing values with 0 in a numpy array?

Q. Replace all ccurrences of nan with 0 in numpy array

In [134]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

In [139]:
iris_2d[np.isnan(iris_2d)] = 0

iris_2d[ : 4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

## 39. How to find the count of unique values in a numpy array?


Q. Find the unique values and the count of unique values in iris's species

In [140]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

In [141]:
# Extract the species columns as an array
species = np.array([row.tolist()[4] for row in iris])

# Get the unique values and counts
np.unique(species, return_counts=True)

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'), array([50, 50, 50]))

## 40. How to convert a numeric to a categorical (text) array?


Q. Bin the petal length (3rd) column of iris_2d to form a text array, such that if petal length is:

-   Less than 3 --> 'small'
-   3-5 --> 'medium'
-  '>=5 --> 'large'

In [142]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

In [144]:
# Bin petallength
petal_length_bin = np.digitize(iris[ : , 2].astype('float'), [0, 3, 5, 10])

# Map it to respective category
label_map = {1: "small", 2: "median", 3: "large", 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]

# view 
petal_length_cat[ : 4]

['small', 'small', 'small', 'small']