# 101 Numpy Exercises

### https://www.machinelearningplus.com/python/101-numpy-exercises-python/

In [1]:
import numpy as np

In [2]:
print(np.__version__)

1.18.4


In [3]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
# 3x3 array of true
np.full((3,3), True)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [5]:
# alt method
np.ones((3,3), dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [6]:
# obtain every other number starting at index 1
arr[1::2]

array([1, 3, 5, 7, 9])

In [7]:
# obtain only odds
arr[arr % 2 == 1]

array([1, 3, 5, 7, 9])

In [8]:
# set odd numbers to -1
arr[arr % 2 == 1] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [9]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
out = np.where(arr % 2 == 1, -1, arr)
out

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [11]:
np.reshape(arr, (2, -1))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [12]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1, 10).reshape(2,-1)

In [13]:
a + b

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [14]:
# concat arrays vertically Method 1
np.concatenate((a, b))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [15]:
# concat arrays vertically Method 2
np.vstack((a,b))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [16]:
# concat arrays horizonally, maintaining row count Method 1
np.concatenate((a,b),axis = 1)

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [17]:
# concat arrays horizonally, maintaining row count Method 2
np.hstack((a,b))

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [18]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

In [19]:
# find all points of intersection
a[a == b]

array([2, 2, 4, 4])

In [20]:
# find unique intersects or arrays
np.intersect1d(a,b)

array([2, 4])

In [21]:
#remove elements of b from a
np.setdiff1d(a,b)

array([1, 3, 5, 6])

In [22]:
# gives indexes where elements match
np.where(a == b)

(array([1, 3, 5, 7]),)

In [23]:
# number between 5 and 10 incl

a[(a >= 5) & (a <= 10)]

array([5, 6])

In [24]:
def maxx(x,y):
    if x >= y:
        return x
    else:
        return y

In [25]:
# Vectorize uses a function, otypes sets data type to return
pair_max = np.vectorize(maxx, otypes=[float])
a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])
pair_max(a,b)

array([6., 7., 9., 8., 9., 7., 5.])

In [26]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [27]:
# swap first two columns
arr[:,[1,0,2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [28]:
# swap last two columns
arr[:,[0,2,1]]

array([[0, 2, 1],
       [3, 5, 4],
       [6, 8, 7]])

In [29]:
# swap first two rows
arr[[1,0,2],:]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [30]:
# reverse the rows
arr[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [31]:
# reverse the columns
arr[:,::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [32]:
# create an array with random ints
rand_ints = np.random.randint(low=5, high=10, size=(5,3))
rand_ints

array([[8, 7, 5],
       [9, 5, 6],
       [9, 9, 5],
       [9, 7, 7],
       [6, 5, 9]])

In [33]:
# create an array with random floats
rand_floats = np.random.random((5,3))
rand_floats

array([[0.03544827, 0.15211371, 0.91024465],
       [0.78909288, 0.78221137, 0.38749298],
       [0.59168483, 0.23610797, 0.80913487],
       [0.92381268, 0.60089591, 0.10688098],
       [0.51185431, 0.02312751, 0.00394536]])

In [34]:
# combine the arrays
rand_ints + rand_floats

array([[8.03544827, 7.15211371, 5.91024465],
       [9.78909288, 5.78221137, 6.38749298],
       [9.59168483, 9.23610797, 5.80913487],
       [9.92381268, 7.60089591, 7.10688098],
       [6.51185431, 5.02312751, 9.00394536]])

In [35]:
# Alt method
rand_arr = np.random.uniform(5,10, size=(5,3))
rand_arr

array([[7.82624121, 8.55385754, 6.70243397],
       [6.95724202, 7.34819885, 8.98239881],
       [6.0372766 , 8.66877101, 7.85707581],
       [8.26301099, 6.71738392, 7.44995898],
       [5.00267204, 5.45064714, 8.46751301]])

In [36]:
# only 3 decimal places universal setting
np.set_printoptions(precision=3)
rand_arr

array([[7.826, 8.554, 6.702],
       [6.957, 7.348, 8.982],
       [6.037, 8.669, 7.857],
       [8.263, 6.717, 7.45 ],
       [5.003, 5.451, 8.468]])

In [37]:
# Reset printoptions to default
np.set_printoptions(precision=8, suppress=False)
# Create the random array
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
rand_arr

array([[5.43404942e-04, 2.78369385e-04, 4.24517591e-04],
       [8.44776132e-04, 4.71885619e-06, 1.21569121e-04],
       [6.70749085e-04, 8.25852755e-04, 1.36706590e-04]])

In [38]:
# suppress scientific notation
np.set_printoptions(suppress=True, precision=8)  # precision is optional
rand_arr

array([[0.0005434 , 0.00027837, 0.00042452],
       [0.00084478, 0.00000472, 0.00012157],
       [0.00067075, 0.00082585, 0.00013671]])

In [39]:
# limit number of items shown in array
np.set_printoptions(threshold=6)
np.arange(15)

array([ 0,  1,  2, ..., 12, 13, 14])

In [40]:
# import the iris dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

In [41]:
iris

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       ...,
       [b'6.5', b'3.0', b'5.2', b'2.0', b'Iris-virginica'],
       [b'6.2', b'3.4', b'5.4', b'2.3', b'Iris-virginica'],
       [b'5.9', b'3.0', b'5.1', b'1.8', b'Iris-virginica']], dtype=object)

In [42]:
iris.shape

(150, 5)

In [43]:
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding=None)
iris_1d.shape

(150,)

In [44]:
species = np.array([row[4] for row in iris_1d])
species[:5]

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

In [45]:
# convert each row to a list and get only first 4 items to get only first 4 columns

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
iris_2d[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [46]:
# import only first 4 columns from source
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[:5]


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [47]:
# get mean, median and std of column 0
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
mu, med, std = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
mu, med, std


(5.843333333333334, 5.8, 0.8253012917851409)

In [48]:
#normalize array between 0 and 1
Smin, Smax = sepallength.min(), sepallength.max()
S = (sepallength - Smin)/(Smax - Smin)
S

array([0.22222222, 0.16666667, 0.11111111, ..., 0.61111111, 0.52777778,
       0.44444444])

In [49]:
# compute softmax score
def softmax(x):
    """Compute softmax values for each sets of scores in x.
    https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python"""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

softmax(sepallength)

array([0.00221959, 0.00181724, 0.00148783, ..., 0.00900086, 0.006668  ,
       0.00493978])

In [50]:
sepallength

array([5.1, 4.9, 4.7, ..., 6.5, 6.2, 5.9])

In [51]:
np.exp(sepallength - np.max(sepallength))

array([0.06081006, 0.04978707, 0.0407622 , ..., 0.24659696, 0.18268352,
       0.13533528])

In [52]:
# find the 5 and 95 percentile scores of an array
np.percentile(sepallength, q=[5,95])

array([4.6  , 7.255])

In [53]:
# insert 20 random nan values in iris dataset
i, j = np.where(iris_2d) #row and column numbers
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan
iris_2d

array([[nan, 3.5, 1.4, 0.2],
       [nan, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [54]:
# find number of nan values in first column
print('Number of missing values: \n', np.isnan(iris_2d[:,0]).sum())

Number of missing values: 
 6


In [55]:
# find positions of nan values in first column
print('Positions of missing values: \n', np.where(np.isnan(iris_2d[:,0])))

Positions of missing values: 
 (array([ 0,  1, 15, 23, 84, 96]),)


In [56]:
# filter the rows with 3rd column > 1.5 and 1st column < 5.0
condition = (iris_2d[:,2]>1.5) & (iris_2d[:,0]<5.0)
iris_2d[condition]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, nan],
       [4.9, 2.5, 4.5, 1.7]])

In [57]:
iris_2d[:5]

array([[nan, 3.5, 1.4, 0.2],
       [nan, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [58]:
# drop rows with missing values
any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[any_nan_in_row][:5]

array([[4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3]])

In [59]:
# alt method
iris_2d[np.sum(np.isnan(iris_2d), axis = 1) == 0][:5]

array([[4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3]])

In [60]:
# find correlation between columns 1 and 3
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
np.corrcoef(iris[:, 0], iris[:, 2])[0, 1]

0.8717541573048718

In [61]:
# find if any null values in an array
np.isnan(iris_2d).any()

True

In [62]:
# replace all missing values with 0
iris_2d[np.isnan(iris_2d)] = 0
np.isnan(iris_2d).any()

False

In [63]:
# find unique value count in an array
iris = np.genfromtxt(url, delimiter=',', dtype='object')
species = np.array([row.tolist()[4] for row in iris]) # extract species column as array
np.unique(species, return_counts=True)


(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'),
 array([50, 50, 50]))

In [64]:
# convert numeric to categorical bins
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]
petal_length_cat[:4]

['small', 'small', 'small', 'small']

In [65]:
# create new column from existing column
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
sepallength = iris_2d[:,0].astype('float')
petallength = iris_2d[:, 2].astype('float')
volume = (np.pi * petallength * (sepallength**2))/3
# Introduce new dimension to match iris_2d's
volume = volume[:, np.newaxis]
# Add the new column
out = np.hstack([iris_2d, volume])
out[:4]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

In [66]:
# probabilistic sampling
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])
species_out

array(['Iris-setosa', 'Iris-setosa', 'Iris-virginica', ..., 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor'], dtype='<U15')

In [67]:
# Approach 2: Probablistic Sampling (preferred)
np.random.seed(100)
probs = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, .750, num=50), np.linspace(.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
species_out = species[index]
print(np.unique(species_out, return_counts=True))

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype='|S15'), array([77, 37, 36]))


In [68]:
# Get value of second longest petallength of species setosa

# Get the species and petal length columns
petal_len_setosa = iris[iris[:, 4] == b'Iris-setosa', [2]].astype('float')
# Get the second last value
np.unique(np.sort(petal_len_setosa))[-2]

1.7

In [69]:
# sort by column 0 (Sepallength)
iris[iris[:,0].argsort()][:10]


array([[b'4.3', b'3.0', b'1.1', b'0.1', b'Iris-setosa'],
       [b'4.4', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.4', b'3.0', b'1.3', b'0.2', b'Iris-setosa'],
       ...,
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'4.6', b'3.2', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

In [70]:
# find most frequent value in numpy array
vals, counts = np.unique(iris[:, 2], return_counts=True)
vals[np.argmax(counts)]

b'1.5'

In [71]:
# 46