## Improving Python code examples

In [2]:
import time
import numpy as np
import pandas as pd
import pyworms
import math

In [1]:
for x in range(500):
    # this line will be executed 500 times
    for y in range(300):
        # this lines will be executed 150,000 times!
        for z in range(200):
            # this line will be executed 30,000,000 times!
            3 + 7
# Time complexity is O(xyz) (matches O(n^3) pattern)

In [12]:
samples = 1000000

count = 0
start = time.perf_counter()
for x in range(samples):
    count += 1
end = time.perf_counter()
print(f'loop sum: {round(end - start, 4)} s')

start = time.perf_counter()
sum(range(samples))
end = time.perf_counter()
print(f'python sum: {round(end - start, 4)} s')

start = time.perf_counter()
np.sum(np.arange(samples))
end = time.perf_counter()
print(f'numpy sum: {round(end - start, 4)} s')

# Make sure to use numpy functions otherwise performance gains are negated!
start = time.perf_counter()
np.sum(np.array(range(samples)))
end = time.perf_counter()
print(f'wrong numpy sum: {round(end - start, 4)} s')

loop sum: 0.0654 s
python sum: 0.0247 s
numpy sum: 0.002 s
wrong numpy sum: 0.0688 s


In [5]:
samples = 1000000

result = [None] * samples
start = time.perf_counter()
for i in range(1, samples):
    result[i] = math.log(i)
end = time.perf_counter()
print(f'python natural log: {round(end - start, 4)} s')
print(result[:5])

vals = np.arange(1, samples)
start = time.perf_counter()

result = np.log(vals)

end = time.perf_counter()
print(f'numpy natural log: {round(end - start, 4)} s')
print(result[:5])

python natural log: 0.1765 s
[None, 0.0, 0.6931471805599453, 1.0986122886681098, 1.3862943611198906]
numpy natural log: 0.0152 s
[0.         0.69314718 1.09861229 1.38629436 1.60943791]


Code was running slow so first identified the slow portion.
Then simplified it using a built in function for faster runtime, and ease of use

In [2]:
taxa: list = ['Carteria marina', 'Coccopterum labyrinthus', 'Dunaliella tertiolecta', 'Halosphaera minor', 'Halosphaera viridis']
worms: list = []

start = time.perf_counter()
for species in taxa:  # iterate through DF for real version
        result: list = pyworms.aphiaRecordsByMatchNames(species)[0]  # full taxa records from WoRMS
        if len(result) > 0:
            worms.append(result)
end = time.perf_counter()
print(f'pyworms many calls: {(end - start)} s')

start = time.perf_counter()
worms = pyworms.aphiaRecordsByMatchNames(taxa)
end = time.perf_counter()
print(f'pyworms one call: {(end - start)} s')

pyworms many calls: 37.696094700018875 s
pyworms one call: 6.6481369999819435 s


In [None]:
# a function
data: list
def calc_val():
    for x in range(500):
        for y in range(500):
            # any lines executed here will run 250,000 times!
            if mld_calc(data[x][y]) < 0.3:
                # do something
                3 + 4
            elif mld_calc(data[x][y]) < 0.7:
                # do something
                4 + 3
            else:
                # do something
                4 + 4

Problems
- Nested loop
    - Sometimes ok but there is probably a better way
- mld_calc is getting called 500,000 times!
- data[x][y] is getting called 500,000 times!

In [None]:
# a function
data: list
result: list
def calc_val():
    for x in range(500):
        for y in range(500):
            # any lines executed here will run 250,000 times!
            mld_val = mld_calc(data[x][y])
            if mld_val < 0.3:
                # do something
                3 + 4
            elif mld_val < 0.7:
                # do something
                4 + 3
            else:
                # do something
                4 + 4