## Working with Numpy

Import package numpy

Exercises questions are taken from https://www.machinelearningplus.com/python/101-numpy-exercises-python/

Solutions are my own unless specified. 

In [1]:
import numpy as np

Create an array


In [2]:
list1 = [1,2,3,4]
array1 = np.array(list1)
array1

array([1, 2, 3, 4])

Creating a boolean array


In [3]:
arr_boolean = np.random.choice([True,False], size=[3,3], p= [1,0])
arr_boolean

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

Other way of creating all elemets as true


In [4]:
arr_boolean = np.full((3,3), True, dtype=bool)
arr_boolean

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

Extracting specific elements from an array 

In [5]:

arr_test = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr_test


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
## extracts odd elements
arr_test[np.where(arr_test%2 != 0)]

array([1, 3, 5, 7, 9])

Replacing elements with other values based on a condition

In [7]:
arr_test[arr_test % 2 != 0] = -1
arr_test


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

Replacting element of the array without modifying original array


In [8]:
arr_test = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr_test

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [9]:
## copying elemens of array
arr_copy = arr_test.copy()

In [10]:
## replacting just the copied array
arr_copy[arr_copy%2 != 0]=-1
print(arr_copy)
print(arr_test)

[ 0 -1  2 -1  4 -1  6 -1  8 -1]
[0 1 2 3 4 5 6 7 8 9]


RESHAPING an array

In [11]:
arr_test

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
##changing it to (2,5) array
arr_test.reshape(2,5)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

Stacking two arrays

In [13]:
array1 = np.zeros([2,5])
array2 = np.ones([2,5])
print(array1)
print(array2)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [14]:
np.concatenate([array1,array2], axis =0)

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [15]:
## another way
np.r_[array1,array2]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [16]:
##for column wide stack, we can use eitehr of following
np.concatenate([array1,array2], axis =1)
np.c_[array1,array2]

array([[0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]])

**Repeating** elements of an array 

In [17]:
arr_test = np.array([1,2,3])
print(arr_test)
print(np.repeat(arr_test,  3))


[1 2 3]
[1 1 1 2 2 2 3 3 3]


In [18]:
print(np.tile(arr_test,3))

[1 2 3 1 2 3 1 2 3]


**Common elements in 2 arrays**

In [19]:
arr1 = np.array([1,2,4,5,6,67,8,8,9])
arr2 = np.array([2,3,4,55,7,7,8,9,9])
common_list =[]
for element in arr1:
    if element in arr2:
        common_list.append(element)
        
print(np.array(common_list))

[2 4 8 8 9]


In [20]:
## using numpy function
print(np.intersect1d(arr1,arr2))

[2 4 8 9]


Removing common eleements from array1 

In [21]:
np.setdiff1d(arr1,arr2)

array([ 1,  5,  6, 67])

**Index of common** elements in array

In [22]:
np.where(arr1 == arr2)

(array([2, 6, 8], dtype=int64),)

Extracting **elements within a range** in array

In [23]:
print(arr1)

## extracting elements b/w 5 and 10

arr1[(arr1>=5)  & (arr1<=10)]

[ 1  2  4  5  6 67  8  8  9]


array([5, 6, 8, 8, 9])

**Vectorizing functions**

In [24]:
def maxx(x, y):
    
    if x >= y:
        return x
    else:
        return y

maxx(1, 5)

5

In [25]:
maxx(arr1,arr2)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
## vectorizing
maxx_v = np.vectorize(maxx)
maxx_v(arr1,arr2)

**Swappning columns**

In [None]:
arr1 = arr1.reshape(3,3)
arr1

In [None]:
arr1[:, (0,2,1)]

In [None]:
## for swapping rows
arr1[[0,2,1], :]


**Reversing rows/columns** of an array 

In [None]:
arr1

In [None]:
arr1[: , ::-1]


**Arrays with random numbers**

In [None]:
np.random.randint(low =5 , high=10, size= [3,3])

Setting print options

In [None]:
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=6) ## setting thresholds on number of elements displayed in an array

**Importing** dataset

In [None]:
##np.genfromtxt('<weblink>', delimiter = ',',dtype ='object')

**Mean/Median/Std**

In [None]:
print(np.mean(arr1), np.median(arr1), np.std(arr1))

Inserting values at random positions in array

In [None]:

arr1[np.random.choice([0,1,2], size=1), np.random.choice([0,1,2], size =1)]= 55
arr1


**Finding index of null values** in array **(np.isnan) function**

In [None]:
arr1 =arr1.astype('float')
arr1[1,1],arr1[2,1]= np.nan, np.nan
arr1

In [None]:
indexes =np.where(np.isnan(arr1))
print(indexes)
arr1[indexes] 

**Dropping rows** containing **null values**


Without numpy functions

In [None]:
rows_with_nulls =np.array([np.any(np.isnan(row)) for row in arr1])
print(rows_with_nulls)
## getting non null rows
arr1[~rows_with_nulls]

**Replacing nas** with 0

In [None]:
print(arr1)
arr1[indexes] =0
arr1

**Filtering** based on muliple column values

In [None]:
arr1[(arr1[:,1]<=2) & (arr1[:,0]>2)]

Finding **correlation** between two columns in an array/matrix


Using in built numpy function - **np.corrcoef()**

In [None]:
np.random.seed(1232)
arr_rand = np.random.randint(low =0, high =10, size = [3,3])
print (arr_rand)
np.corrcoef(arr_rand[:,1], arr_rand[:,2])

## just getting value of correlation b/w columns with index 1 and 2
print("correlation is ", np.corrcoef(arr_rand[:,1], arr_rand[:,2])[0,1])

Correlation using **scipy** package

In [None]:
from scipy.stats.stats import pearsonr
corr, p_val = pearsonr(arr_rand[:,1], arr_rand[:,2])
print("corr is ", corr, " and p value is ", p_val)

so result is not statisicially significant which makes sense as numbers are chose radomly and each column of sample space just have one element

Getting **count of unique values** in an array

In [None]:
np.unique(arr_rand, return_counts=True)

**Converting data types**of array

In [None]:
list1 = [1,2,3,4]
array1 = np.array(list1)
array1

In [None]:
array1.astype('str')


**Create new column from existing cols in array**

In [None]:
np.random.seed(50)
array1 = np.random.randint(low= 0, high =10, size = [3,3])
print(array1)


In [None]:
##lets say new column is sum of second and third col
co = array1[:,1]+array1[:,2]
print(co)

In [None]:
## adding this column to original array
np.c_[array1, co]

**Random sammpling based on probabilities**

In [None]:
list2=  ['a','b','c','d']
array2 = np.array(list2)
print(array2)

In [None]:
np.random.choice(array2, size = 10, p =[0.7,0.1,0.1,0.1])

**Sort an array by a column**


In [None]:
np.random.seed(121)
np.set_printoptions(threshold =20)
big_array = np.random.randint(0,10, [10,3])
print(big_array)

In [None]:
## lets sort the array by column2(index1)
element_indexes_based_on_sort_col2 = np.argsort(big_array[:, 1])
big_array[element_indexes_based_on_sort_col2,:]
## array is now sorted by col2

In [None]:
## sorting in one line of code
big_array[big_array[:,1].argsort(),:]

**Finding most frequent** element in numpy array

In [None]:
my_tuple = np.unique(big_array,return_counts = True)
print(my_tuple)
print(np.where(my_tuple[1] == np.max(my_tuple[1])))## max count index
print (my_tuple[0][np.where(my_tuple[1] == np.max(my_tuple[1]))]) ## value corresponding to maximum count




In [None]:
## other way
values , counts = np.unique(big_array, return_counts=True)
print(values[np.argmax(counts)])

**Finding first occurence** of a particular element based on a condition

In [27]:
print(big_array)
print(np.where(big_array[:,1]==0)[0])


NameError: name 'big_array' is not defined

**Row-wise count of all elements in array**

In [214]:
np.random.seed(14)
arr_test =np.random.randint(0,10, [5,10])
print(arr_test)


num_counts =[np.unique(row, return_counts=True) for row in arr_test]

##for row in num_counts:
  ##  print(row[1][np.where(row[0] ==6)])
arr_counts=[[   int(row[1][row[0]==i])  if i in row[0] else 0 for i in range(10)]for row in num_counts]
                
print('arr_counts')           
print(arr_counts)


[[8 6 7 9 6 0 8 9 7 6]
 [0 7 4 0 6 4 5 5 8 5]
 [9 8 7 3 4 8 1 6 0 9]
 [5 6 2 1 7 8 3 1 9 1]
 [5 8 9 5 3 9 9 4 2 0]]
arr_counts
[[1, 0, 0, 0, 0, 0, 3, 2, 2, 2], [2, 0, 0, 0, 2, 3, 1, 1, 1, 0], [1, 1, 0, 1, 1, 0, 1, 1, 2, 2], [0, 3, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 2, 0, 0, 1, 3]]


Array - arr_counts shows/tells  how many  times [0,1,2,3,4,5,6,7,8,9] occurred in each row respectively.


**Flattening an array**

In [217]:
np.reshape(arr_test, [1, arr_test.shape[0]*arr_test.shape[1]])

array([[8, 6, 7, 9, 6, 0, 8, 9, 7, 6, 0, 7, 4, 0, 6, 4, 5, 5, 8, 5, 9, 8,
        7, 3, 4, 8, 1, 6, 0, 9, 5, 6, 2, 1, 7, 8, 3, 1, 9, 1, 5, 8, 9, 5,
        3, 9, 9, 4, 2, 0]])

In [219]:
## using numpy functions
np.concatenate(arr_test)

array([8, 6, 7, 9, 6, 0, 8, 9, 7, 6, 0, 7, 4, 0, 6, 4, 5, 5, 8, 5, 9, 8,
       7, 3, 4, 8, 1, 6, 0, 9, 5, 6, 2, 1, 7, 8, 3, 1, 9, 1, 5, 8, 9, 5,
       3, 9, 9, 4, 2, 0])

**One hot encoding** for a numpy array

In [222]:
np.random.seed(12)
test_array = np.random.randint(0,4, [10])
print(test_array)

[3 3 2 1 1 2 3 3 0 0]


In [234]:



[ [1. if element == test_array[i] else 0. for element in np.unique(test_array)  ]    for i in range(test_array.size)]


[[0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0]]

**Changing categories in array to numeric values**

In [281]:
np.random.seed(12)
test_array = np.random.choice(['a','b','c','d','e','f'], [15])
print(test_array)

['d' 'd' 'f' 'b' 'c' 'd' 'd' 'e' 'a' 'b' 'e' 'f' 'f' 'b' 'c']


In [282]:
[np.where(np.unique(test_array)==element)[0][0] for element in test_array]


[3, 3, 5, 1, 2, 3, 3, 4, 0, 1, 4, 5, 5, 1, 2]

**Ranking elements in numpy** argsort().argsortz

In [296]:
np.random.seed(15)
test_array = np.random.randint(10,20,[10])
print(test_array)

print(test_array.argsort().argsort())


[18 15 15 17 10 17 15 16 11 17]
[9 2 3 6 0 7 4 5 1 8]


In [299]:
## finding rank in 2d array
np.random.seed(15)
test_array2 = np.random.randint(10,20,[2,5])
test_array2

array([[18, 15, 15, 17, 10],
       [17, 15, 16, 11, 17]])

In [300]:
np.ravel(test_array2).argsort().argsort().reshape(2,5)

array([[9, 2, 3, 6, 0],
       [7, 4, 5, 1, 8]], dtype=int64)

**Maximum value row-wiise**

In [303]:
print(test_array2)
np.max(test_array2, axis =1)

[[18 15 15 17 10]
 [17 15 16 11 17]]


array([18, 17])

In [304]:
## other way
np.apply_along_axis(np.max, arr=test_array2, axis=1)

array([18, 17])

In [305]:
## other operations
## min/max of each row

np.apply_along_axis(np.min, arr=test_array2, axis=1)/np.apply_along_axis(np.max, arr=test_array2, axis=1)

array([0.556, 0.647])

In [309]:
## other way
np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=test_array2, axis=1)

array([0.556, 0.647])

**Finding duplicate records** in a numpy array

In [336]:
print(test_array)
np.random.seed(13)
dummy_array = np.random.choice([True], [10])
print(dummy_array)
counts = np.unique(test_array, return_index=True)
counts

[18 15 15 17 10 17 15 16 11 17]
[ True  True  True  True  True  True  True  True  True  True]


(array([10, 11, 15, 16, 17, 18]), array([4, 8, 1, 7, 3, 0], dtype=int64))

In [338]:
##unique positions 
counts[1]

array([4, 8, 1, 7, 3, 0], dtype=int64)

In [339]:
##setting those unique positions as False
dummy_array[counts[1]]= False
dummy_array

array([False, False,  True, False, False,  True,  True, False, False,
        True])

**Mean of a numeric column grouped by a categorical column in a 2D numpy array**

In [367]:
np.random.seed(12)
test_array1 = np.random.choice(['a','b','c','d','e','f'], [15])
print(test_array1)
np.random.seed(12)
test_array = np.random.randint(1,10, [15])
print(test_array)

## combine both matrices
array_combined = np.c_[test_array1, test_array]
print(array_combined)


['d' 'd' 'f' 'b' 'c' 'd' 'd' 'e' 'a' 'b' 'e' 'f' 'f' 'b' 'c']
[7 2 3 4 4 1 7 2 5 6 3 7 1 6 9]
[['d' '7']
 ['d' '2']
 ['f' '3']
 ['b' '4']
 ['c' '4']
 ['d' '1']
 ['d' '7']
 ['e' '2']
 ['a' '5']
 ['b' '6']
 ['e' '3']
 ['f' '7']
 ['f' '1']
 ['b' '6']
 ['c' '9']]


In [387]:
means =[np.mean(array_combined[array_combined[:,0]==element,1].astype('int')) for element in np.unique(array_combined[:,0])]

## combining with categories\
print(np.c_[np.unique(array_combined[:,0]), means])

[['a' '5.0']
 ['b' '5.333333333333333']
 ['c' '6.5']
 ['d' '4.25']
 ['e' '2.5']
 ['f' '3.6666666666666665']]


**WORKING WITH IMAGES**

In [389]:
## solution taken from https://www.machinelearningplus.com/python/101-numpy-exercises-python/
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL
URL = 'https://upload.wikimedia.org/wikipedia/commons/b/ba/Data_visualization_process_v1.png'
response = requests.get(URL)

# Read it as Image
I = Image.open(BytesIO(response.content))

# Optionally resize
I = I.resize([150,150])

# Convert to numpy array
arr = np.asarray(I)

# Optionaly Convert it back to an image and show
im = PIL.Image.fromarray(np.uint8(arr))
Image.Image.show(im)

**Finding local maxima in an array**

Local maxima is the value an array which is surrounded by two smaller values

In [406]:
print(test_array)
list_to_hold_values =[]

for i in range(1,len(test_array)-1):
    if (test_array[i] >test_array[i-1] and test_array[i] >test_array[i+1]):
        list_to_hold_values.append(i)

np.array(list_to_hold_values)
    
    

[7 2 3 4 4 1 7 2 5 6 3 7 1 6 9]


array([ 6,  9, 11])

In [407]:

## using numpy functions (source - https://www.machinelearningplus.com/python/101-numpy-exercises-python/)
print(test_array)
print(np.diff(test_array)) ## calculate differenc from next element
print(np.sign(np.diff(test_array))) ## -1 for negative values , +1 for positive values
print(np.diff(np.sign(np.diff(test_array))))

doublediff = np.diff(np.sign(np.diff(test_array)))
peak_locations = np.where(doublediff == -2)[0] + 1
peak_locations


[7 2 3 4 4 1 7 2 5 6 3 7 1 6 9]
[-5  1  1  0 -3  6 -5  3  1 -3  4 -6  5  3]
[-1  1  1  0 -1  1 -1  1  1 -1  1 -1  1  1]
[ 2  0 -1 -1  2 -2  2  0 -2  2 -2  2  0]


array([ 6,  9, 11], dtype=int64)

**Subtracting 1d array from 2d array**

In [422]:
a_2d = np.array([[3,3,3],[4,4,4],[5,5,5]])
b_1d = np.array([1,2,3])

## subtracting each element of second array from corresponding row in first array
print([a_2d[i,:]-b_1d[i] for i in range(a_2d.shape[1])])

[array([2, 2, 2]), array([2, 2, 2]), array([2, 2, 2])]


In [424]:
## other way
print(a_2d-b_1d.reshape(3,1))

## or
a_2d-b_1d[:,None]

[[2 2 2]
 [2 2 2]
 [2 2 2]]


array([[2, 2, 2],
       [2, 2, 2],
       [2, 2, 2]])

find the index of n'th repetition of an item in an array

In [426]:
#Find the index of 5th repetition of number 1 in x.
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])

In [432]:
np.where(x==1)[0][5-1]## 5 th repition will be at index 4 so  5-1

8

**np.arrange() function**

In [434]:
np.arange(5,30,3) ##np.arrange(start,end, stepsize)

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29])