<br><br><br><br><br>

# Numpy and Pandas

In [12]:
# Start with a plausible problem: analyze a dataset of daily Newark temperatures since 1883.

import glob

for filename in glob.glob("data/newark-temperature-*.txt"):
    print("-------------------------------")
    print(filename)
    with open(filename) as file:
        print(file.read())

-------------------------------
data/newark-temperature-maxi.txt
52
43
32
23
27
30
28
28
32
32
16
15
18
16
19
17
20
21
24
24
29
29
32
39
44
39
37
31
43
37
40
36
36
43
20
19
43
46
25
38
54
38
36
35
45
49
48
20
29
36
15
25
32
37
38
37
36
40
33
40
38
31
33
29
36
41
45
44
40
40
53
51
49
39
31
34
43
39
42
52
46
35
48
55
48
44
41
43
55
55
74
51
55
75
62
48
38
63
58
46
59
51
55
62
50
67
52
61
59
41
58
56
55
60
52
55
50
71
62
58
48
72
47
55
65
63
70
69
72
79
84
74
60
65
76
61
61
62
64
76
87
85
89
71
80
66
58
62
70
70
71
70
72
78
76
90
90
74
76
75
89
92
74
72
79
85
68
64
86
92
97
92
97
77
81
78
60
61
68
77
81
83
84
81
80
85
83
84
87
83
78
82
86
92
76
80
89
88
90
86
88
87
92
82
83
89
96
79
83
79
87
85
85
84
84
76
85
86
82
86
92
88
89
91
81
78
86
88
81
84
79
69
79
82
76
79
92
89
87
83
83
79
75
72
75
72
77
83
77
79
72
73
79
68
72
74
76
75
72
69
65
82
75
70
78
71
72
75
55
58
65
57
61
70
72
70
57
69
63
73
71
78
70
67
67
70
74
59
49
62
64
56
60
65
68
63
69
63
55
65
60
50
45
47
54
61
60
45
49
61
58
50

In [13]:
# Read the averages into arrays.

avrg = []
with open("data/newark-temperature-avrg.txt") as file:
    for line in file.readlines():
        avrg.append(float(line))

mini = []
with open("data/newark-temperature-mini.txt") as file:
    for line in file.readlines():
        mini.append(float(line))

maxi = []
with open("data/newark-temperature-maxi.txt") as file:
    for line in file.readlines():
        maxi.append(float(line))

print("how many?", len(avrg), len(mini), len(maxi))    # having the same length is essential!
print("starts with", avrg[:3], mini[:3], maxi[:3])
print("ends with  ", avrg[-3:], mini[-3:], maxi[-3:])

how many? 42019 42019 42019
starts with [nan, nan, nan] [26.0, 34.0, 17.0] [52.0, 43.0, 32.0]
ends with   [68.0, 59.0, 47.0] [65.0, 46.0, nan] [73.0, 67.0, nan]


In [17]:
# The minima and maxima are more complete than the averages.

import math

print("fraction good in avrg:", sum(0 if math.isnan(x) else 1 for x in avrg) / len(avrg))
print("fraction good in mini:", sum(0 if math.isnan(x) else 1 for x in mini) / len(mini))
print("fraction good in maxi:", sum(0 if math.isnan(x) else 1 for x in maxi) / len(maxi))

fraction good in avrg: 0.11228253885147195
fraction good in mini: 0.9977629167757444
fraction good in maxi: 0.9978819105642686


In [25]:
%%timeit

# So let's "impute" averages: the measured average is ideal, but take (mini + maxi)/2 if unavailable.

imputed = []
for average, minimum, maximum in zip(avrg, mini, maxi):
    if math.isnan(average):
        imputed.append(0.5*(minimum + maximum))
    else:
        imputed.append(average)

6.73 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
# Same thing in Numpy: load the data and impute missing averages.

import numpy

np_avrg = numpy.array(avrg)
np_mini = numpy.array(mini)
np_maxi = numpy.array(maxi)

print("how many?", len(np_avrg), len(np_mini), len(np_maxi))
print("starts with", np_avrg[:3], np_mini[:3], np_maxi[:3])
print("ends with  ", np_avrg[-3:], np_mini[-3:], np_maxi[-3:])

print()
print("fraction good in avrg:", numpy.isnan(np_avrg).sum() / len(np_avrg))
print("fraction good in mini:", numpy.isnan(np_mini).sum() / len(np_mini))
print("fraction good in maxi:", numpy.isnan(np_maxi).sum() / len(np_maxi))

how many? 42019 42019 42019
starts with [nan nan nan] [26. 34. 17.] [52. 43. 32.]
ends with   [68. 59. 47.] [65. 46. nan] [73. 67. nan]

fraction good in avrg: 0.887717461148528
fraction good in mini: 0.002237083224255694
fraction good in maxi: 0.002118089435731455


In [26]:
%%timeit

#                        condition             if true                  if false
np_imputed = numpy.where(numpy.isnan(np_avrg), 0.5*(np_mini + np_maxi), np_avrg)

79.1 µs ± 1.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<br><br><br><br><br>

Your milage may vary, but <tt>6.73 ms</tt> versus <tt>79.1 µs</tt> is a factor of 85!

Factor 100‒1000 speedups from pure Python → Numpy are common, and they make the difference between 5 minutes (bathroom break) and 8 hours (overnight).

### Fundamentally different code order

Also notice that we had to change the code from

```python
imputed = []
for average, minimum, maximum in zip(avrg, mini, maxi):
    if math.isnan(average):
        imputed.append(0.5*(minimum + maximum))
    else:
        imputed.append(average)
```

to

```python
np_imputed = numpy.where(numpy.isnan(np_avrg), 0.5*(np_mini + np_maxi), np_avrg)
```

Pure Python is step-by-step, which can be good or bad. Numpy is all-at-once, which can be good or bad.

**Step-by-step:**

   * is **good** because you can insert breakpoints and watch variables to debug the code; it's like a microscopic view with no abstraction;
   * is **bad** because the bigger picture can be lost when spread out among so many lines. (This is why I use list comprehensions.)

**All-at-once:**

   * is **good** because the composition of functions often reads like an English description of the problem to be solved;
   * is **bad** because many indexes need to align; it's hard to break the process apart to debug it. (I usually get a line of Numpy right on the fifth try. Error messages are your friend.)

**Trade-offs:**

Pure Python is generally easier to _write,_ making it good for prototyping. Numpy is often easier to _read_.

And, of course, Numpy is faster.

In [34]:
# Pure Python code order: acts on one DATUM at a time.
# Numpy code order: acts on one ATTRIBUTE at a time.

a = numpy.random.uniform(5, 10, 10000)
b = numpy.random.uniform(10, 20, 10000)
c = numpy.random.uniform(-0.1, 0.1, 10000)

# Computes one quadratic formula on ai, bi, ci before moving on to the next one.
roots1 = numpy.empty(10000, dtype=a.dtype)
for i in range(10000):
    roots1[i] = (-b[i] + math.sqrt(b[i]**2 - 4*a[i]*c[i])) / (2*a[i])

# Computes one step in the quadratic formula for all 10000 before moving on to the next step.
roots2 = (-b + numpy.sqrt(b**2 - 4*a*c)) / (2*a)

print(roots1[:10])
print(roots2[:10])

[-2.82926022e-03  2.12493732e-03  6.57279745e-04 -3.26031700e-03
 -4.97285725e-05  4.06599597e-03 -5.09504351e-03  8.70318282e-03
  1.02989470e-03  2.38153635e-03]
[-2.82926022e-03  2.12493732e-03  6.57279745e-04 -3.26031700e-03
 -4.97285725e-05  4.06599597e-03 -5.09504351e-03  8.70318282e-03
  1.02989470e-03  2.38153635e-03]


In [35]:
# The Numpy expression (-b + numpy.sqrt(b**2 - 4*a*c)) / (2*a) actually computes something like:

tmp1 = numpy.negative(b)            # -b
tmp2 = numpy.square(b)              # b**2
tmp3 = numpy.multiply(4, a)         # 4*a
tmp4 = numpy.multiply(tmp3, c)      # tmp3*c
tmp5 = numpy.subtract(tmp2, tmp4)   # tmp2 - tmp4
tmp6 = numpy.sqrt(tmp5)             # sqrt(tmp5)
tmp7 = numpy.add(tmp1, tmp6)        # tmp1 + tmp6
tmp8 = numpy.multiply(2, a)         # 2*a
roots3 = numpy.divide(tmp7, tmp8)   # tmp7 / tmp8

print(roots1[:10])
print(roots2[:10])
print(roots3[:10])

[-2.82926022e-03  2.12493732e-03  6.57279745e-04 -3.26031700e-03
 -4.97285725e-05  4.06599597e-03 -5.09504351e-03  8.70318282e-03
  1.02989470e-03  2.38153635e-03]
[-2.82926022e-03  2.12493732e-03  6.57279745e-04 -3.26031700e-03
 -4.97285725e-05  4.06599597e-03 -5.09504351e-03  8.70318282e-03
  1.02989470e-03  2.38153635e-03]
[-2.82926022e-03  2.12493732e-03  6.57279745e-04 -3.26031700e-03
 -4.97285725e-05  4.06599597e-03 -5.09504351e-03  8.70318282e-03
  1.02989470e-03  2.38153635e-03]
