# ***Vectorization***

***Vectorizing code is a technique that will typically enable you to create faster and more readable code. Vectorization is the process of performing computation on a set of values at once instead of explicitly looping through individual elements one at a time. ***

In [1]:
import numpy as np

# create our data
a = np.array([1, 2, 3])
b = np.array([2, 3, 4])
c = np.array([[2, 3, 4]])

In [2]:
type(a)
print(a.shape)
print(c.shape)

(3,)
(1, 3)


In [3]:
# nonvectorized approach
c = []  # Start with an empty array that we will populate in the loop
for i in range(len(a)):
    c.append(a[i] + b[i])

print("Nonvectorized approach ->", c)

Nonvectorized approach -> [3, 5, 7]


In [4]:
# vectorized approach
c = a + b

print("Vectorized approach ->", c)

Vectorized approach -> [3 5 7]


*** the notation of the vectorized approach is far clearer than the non-vectorized approach. It says exactly what is happening - an element-wise addition of vectors. This approach is even clearer with 2D matrices since 2D matrices would require nested loops. In data science we are often working with vectors or matrices of data and need to perform element-wise operations on them as we discussed before, so vectorized notation is often preferable for clarity of the code.***

In [5]:
def double_nonvectorized(array):
    doubled = array.copy()
    for i in range(len(array)):
        doubled[i] = array[i] * 2
    return doubled

In [6]:
def double_vectorized(array):
    return array * 2

In [7]:
array = np.array([1, 2, 3, 4])
print("Nonvectorized = ", double_nonvectorized(array))
print("Vectorized    = ", double_vectorized(array))

Nonvectorized =  [2 4 6 8]
Vectorized    =  [2 4 6 8]


In [8]:
import time


def timer(function, argument, num_runs):
    total_time = 0
    # Rerun the code num_runs times
    for i in range(num_runs):
        t0 = time.time()  # Capture the initial time
        function(argument)  # Run the function we're timing
        t1 = time.time()  # Capture the final time
        total_time += t1 - t0
    return total_time / num_runs  # Return the average across the runs

In [9]:
big_array = np.arange(1000000)
num_runs = 5

time_nonvectorized = timer(double_nonvectorized, big_array, num_runs)
time_vectorized = timer(double_vectorized, big_array, num_runs)

print("Nonvectorized code took ", time_nonvectorized, "s")
print("Vectorized code took    ", time_vectorized, "s")
print("Vectorized code was ", time_nonvectorized / time_vectorized, " times faster")

Nonvectorized code took  0.5241479873657227 s
Vectorized code took     0.001785421371459961 s
Vectorized code was  293.57102796250297  times faster


# ***Loop Vectorizification***

In [10]:
import time
import numpy
import array
# a and b is an array of array with int of 8 bytes size
a = array.array('q')
for i in range(50000):
  a.append(i);
  b = array.array('q')
for i in range(50000, 100000):
  b.append(i)
  # classic dot product of vectors implementation
start_time = time.process_time()
classic_dot_product = 0.0;
for i in range(len(a)):
  classic_dot_product += a[i] * b[i]
end_time = time.process_time()
print('classic_dot_product = '+ str(classic_dot_product));
print('Computation time using loops = ' + str(1000*(end_time-start_time)) + 'ms')
vectorised_start_time = time.process_time()
vectorised_dot_product = numpy.dot(a, b)
vectorised_end_time = time.process_time()
print('\nvectorised_dot_product = '+str(vectorised_dot_product))
print('Computation time using vectorization = '+str(1000*(vectorised_end_time-vectorised_start_time))+ 'ms')

classic_dot_product = 104164166675000.0
Computation time using loops = 16.798377000000198ms

vectorised_dot_product = 104164166675000
Computation time using vectorization = 1.2374350000001755ms


***Outer Product***

In [11]:
import time
import numpy
import array
# a and b is an array of array with int of 2 bytes size
a = array.array('i')
for i in range(300):
  a.append(i);
  b = array.array('i')
for i in range(300, 600):
  b.append(i)
# classic outer product of vectors implementation
start_time = time.process_time()
classic_outer_product = numpy.zeros((300, 300))
for i in range(len(a)):
  for j in range(len(b)):
    classic_outer_product[i][j]= a[i]*b[j]
end_time = time.process_time()
print('outer_product = '+ str(classic_outer_product));
print('Computation time using loops = ' + str(1000*(end_time - start_time )) + 'ms')
vectorised_start_time = time.process_time()
vectorised_outer_product = numpy.outer(a, b)
vectorised_end_time = time.process_time()
print('vectorised_outer_product = '+str(vectorised_outer_product));
print('Computation time using vectorization = '+str(1000*(vectorised_end_time - vectorised_start_time ))+'ms')


outer_product = [[     0.      0.      0. ...      0.      0.      0.]
 [   300.    301.    302. ...    597.    598.    599.]
 [   600.    602.    604. ...   1194.   1196.   1198.]
 ...
 [ 89100.  89397.  89694. ... 177309. 177606. 177903.]
 [ 89400.  89698.  89996. ... 177906. 178204. 178502.]
 [ 89700.  89999.  90298. ... 178503. 178802. 179101.]]
Computation time using loops = 46.209578999999664ms
vectorised_outer_product = [[     0      0      0 ...      0      0      0]
 [   300    301    302 ...    597    598    599]
 [   600    602    604 ...   1194   1196   1198]
 ...
 [ 89100  89397  89694 ... 177309 177606 177903]
 [ 89400  89698  89996 ... 177906 178204 178502]
 [ 89700  89999  90298 ... 178503 178802 179101]]
Computation time using vectorization = 1.4567709999999678ms


***Element wise Product***

In [12]:
import time
import numpy
import array
# a and b is an array of array with int of 2 bytes size
a = array.array('i')
for i in range(20000):
  a.append(i);
  b = array.array('i')
for i in range(20000, 40000):
  b.append(i)
# classic element wise product of vectors implementation
vector = numpy.zeros((20000))
start_time = time.process_time()
for i in range(len(a)):
  vector[i]= a[i]*b[i]
end_time = time.process_time()
print('Element wise Product = '+ str(vector));
print('Computation time using loops = ' + str(1000*(end_time - start_time )) + 'ms')
vectorised_end_time = time.process_time()
vector = numpy.multiply(a, b)
vectorised_start_time = time.process_time()
print('Element wise Product = '+str(vector));
print('Computation time using vectorization = '+str(1000*(vectorised_end_time - vectorised_start_time ))+'ms')

Element wise Product = [0.00000000e+00 2.00010000e+04 4.00040000e+04 ... 7.99820009e+08
 7.99880004e+08 7.99940001e+08]
Computation time using loops = 6.911990999999951ms
Element wise Product = [        0     20001     40004 ... 799820009 799880004 799940001]
Computation time using vectorization = -0.15551499999943985ms


***Sum of numbers***

In [13]:
import time
import numpy as np
start = time.time()
total = 0
# sum using loop
for item in range(0, 300):
  total = total + item
print('Sum:' + str(total))
end = time.time()
print('Computation time using loop is:' +str(end - start))
# using vectorization
vectorised_start_time = time.time()
print('Sum:' + str(np.sum(np.arange(300))))
vectorised_end_time= time.time()
print('Computation time using vectorization is:' +str(vectorised_end_time - vectorised_start_time ))

Sum:44850
Computation time using loop is:0.0002243518829345703
Sum:44850
Computation time using vectorization is:0.00019407272338867188


# ***For Loops***

***Without Vectorization***

In [14]:
import time
start = time.time()

# iterative sum
total = 0
# iterating through 1.5 Million numbers
for item in range(0, 1500000):
    total = total + item

print('sum is:' + str(total))
end = time.time()
print(end - start)
#1124999250000
#0.14 Seconds

sum is:1124999250000
0.20457816123962402


# ***Vectorization***

In [15]:
import numpy as np

start = time.time()
# vectorized sum - using numpy for vectorization
# np.arange create the sequence of numbers from 0 to 1499999
print(np.sum(np.arange(1500000)))
end = time.time()
print(end - start)

##1124999250000
##0.008 Seconds

1124999250000
0.013508796691894531


***For loops on Pandas***

In [16]:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randint(0, 50, size=(50000, 4)), columns=('a','b','c','d'))
df.shape
# (5000000, 5)
df.head()

Unnamed: 0,a,b,c,d
0,49,4,1,25
1,30,13,25,30
2,2,2,1,1
3,17,13,6,1
4,9,30,25,26


In [17]:
df.shape

(50000, 4)

In [18]:
out = np.array(df)
np.var(out)

207.661864906775

In [None]:
np.linalg.det(out)

In [20]:
np.correlate([1, 2, 3], [0, 1, 0.5], "full")

array([0.5, 2. , 3.5, 3. , 0. ])

In [24]:
import numpy as np
eigenvalues, eigenvectors = np.linalg.eig(np.diag((10, 20, 15)))

In [25]:
eigenvectors

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [22]:
eigenvalues

array([1., 2., 3.])

# ***Without Vectorization***

In [None]:
import time
start = time.time()

# Iterating through DataFrame using iterrows
for idx, row in df.iterrows():
    # creating a new column
    df.at[idx,'ratio'] = 100 * (row["d"] / row["c"]+1)
end = time.time()
print(end - start)
### 325.6 Seconds

**Vectorization**

In [None]:
start = time.time()
df["ratio"] = 100 * (df["d"] / df["c"])

end = time.time()
print(end - start)
### 0.12 seconds

***ُSolving ML problem***

In [None]:
import numpy as np
# setting initial values of m
m = np.random.rand(1,5)

# input values for 5 million rows
x = np.random.rand(5000000,5)

***ٌWithout Vectorization***

In [None]:
import numpy as np
m = np.random.rand(1,5)
x = np.random.rand(5000000,5)
zer =  np.random.rand(5000000,1)
total = 0
tic = time.process_time()
for i in range(0,5000000):
    total = 0
    for j in range(0,5):
        total = total + x[i][j]*m[0][j]

    zer[i] = total
toc = time.process_time()
print ("Computation time = " + str((toc - tic)) + "seconds")
####Computation time = 24.5 seconds

In [None]:
tic = time.process_time()

#dot product
np.dot(x,m.T)
toc = time.process_time()
print ("Computation time = " + str((toc - tic)) + "seconds")
####Computation time = 0.107 seconds

***Broadcasting***

In [None]:
import numpy as np

# Create a NumPy array and add a scalar
a = np.array([1, 2, 3, 4])
b = 2

# Perform broadcasting
result = a + b

print(result)

# ***Universal Functions (ufuncs)***

In [None]:
import numpy as np

# Create a NumPy array
a = np.array([1, 2, 3, 4])

# Calculate the square of each element
result = np.square(a)

print(result)

# ***Vectorized Indexing and Slicing***

In [None]:
import numpy as np

# Create a NumPy array
a = np.array([1, 2, 3, 4, 5])

# Use fancy indexing to select specific elements
indices = np.array([0, 2, 4])
result = a[indices]

print(result)

In [None]:
import numpy as np

# Create a NumPy array
a = np.array([1, 2, 3, 4, 5])

# Use boolean indexing to filter elements
condition = a > 2
result = a[condition]

print(result)

***Conditional Operations***

In [26]:
import numpy as np

# Create a NumPy array
a = np.array([1, 2, 3, 4, 5])

# Use np.where to apply a condition
result = np.where(a > 4, 'high', 'low')

print(result)

['low' 'low' 'low' 'low' 'high']


***Custom ufuncs***

In [None]:
import numpy as np

# Define a custom function
def my_func(x):
    return x /0.5

# Convert the custom function to a ufunc
my_ufunc = np.frompyfunc(my_func, 1, 1)

# Create a NumPy array
a = np.array([1, 2, 3, 4])

# Apply the custom ufunc
result = my_ufunc(a)

print(result)