# Review: Data Types and Summary Statistics

|Aggregate Stat | Quantitative Continuous | Quantitative Discrete | Qualitative Ordinal | Qualitative Nominal |
|------|------|------|------|------|
| unique values | yes* | yes | yes | yes |
| min | yes | yes | yes | no |
| max | yes | yes | yes | no |
| range |  |  |  |  |
| mean |  |  |  |  |
| median | | | |  |
| mode |  |  |  |  |
| variance |  |  |  |  |


## Copying numpy arrays

This is a reminder to use "deep copy" rather than "shallow copy"

In [1]:
import numpy as np

# let's try the obvious thing
nparray = np.array([[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]])
nparray2 = nparray
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

nparray
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
nparray2
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]


In [2]:
nparray[0,0] = 200
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

# whaaat just happened?

nparray
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


In [3]:
# how do we stop that happening?? hint, what are we doing? we are *copying*
nparray2 = nparray.copy()
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

nparray
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


In [4]:
nparray[0,0] = 0
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

nparray
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


## Doing things to whole numpy arrays (broadcasting)

In [5]:
import numpy as np

nparray = np.array([[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]])
print("nparray\n", nparray)
print("nparray shape\n", nparray.shape)

nparray
 [[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
nparray shape
 (4, 4)


In [9]:
# what if I want every element in nparray * 2?
print(nparray)
nparray = nparray*2
print(nparray)

[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
[[ 0  2  4  6]
 [20 22 24 26]
 [40 42 44 46]
 [60 62 64 66]]


In [10]:
print(nparray)

[[ 0  2  4  6]
 [20 22 24 26]
 [40 42 44 46]
 [60 62 64 66]]


In [11]:
# what if I want every element in nparray / 2?
print(nparray/2)
# watch out!!
(nparray/2).dtype

[[ 0.  1.  2.  3.]
 [10. 11. 12. 13.]
 [20. 21. 22. 23.]
 [30. 31. 32. 33.]]


dtype('float64')

In [13]:
nparray = nparray/2
nparray = nparray.astype(int)
print(nparray)

[[ 0  0  1  1]
 [ 5  5  6  6]
 [10 10 11 11]
 [15 15 16 16]]


In [15]:
# let's get some summary statistics

data = np.genfromtxt('data/vehiclesNumeric.csv', dtype=int, delimiter=',', skip_header=1, encoding='utf8')
print(data[0:10]) # all the rows and the first column: data[:, 1]
print(data.shape)
# no for loops!
print("max", data.max(axis=0), "\nmin", data.min(axis=0), "\nmean", data.mean(axis=0, dtype=int))


[[7314278078       9988       2014     121259]
 [7313594854       5500       2010     114988]
 [7313219085       9500       2013     125000]
 [7311830338       9988       2014     121259]
 [7311365259      10888       2014     121259]
 [7307029274      11500       2012      64000]
 [7306472445       9800       2012      76117]
 [7304888907      19900       2018      35823]
 [7304871378       7500       2011      51000]
 [7314677292       4200       2009      93000]]
(3158, 4)
max [7317070740      55000       2021    9999999] 
min [7301645993          0       1999          0] 
mean [7311728502       7978       2012     112440]


In [26]:
# (review!) how do we assign value(s) to a row or column?
nparray[:1] = np.zeros(nparray.shape[1])
print(nparray)

# if we want the first column to be zeros
nparray[:, :2] = 0
print(nparray)

[[ 0  0  0  0]
 [ 0  5  6  6]
 [ 0 10 11 11]
 [ 0 15 16 16]]
[[ 0  0  0  0]
 [ 0  0  6  6]
 [ 0  0 11 11]
 [ 0  0 16 16]]


In [23]:
print(nparray[:, :1].shape)
print(nparray[:, :1])

(4, 1)
[[0]
 [0]
 [0]
 [0]]


In [24]:
print(nparray[:, 0].shape)
print(nparray[:, 0])

(4,)
[0 0 0 0]


In [27]:
# let's sum across each column
np.sum(nparray, axis=0)

array([ 0,  0, 33, 33])

In [28]:
# how would we sum across each row?
np.sum(nparray, axis=1)

array([ 0, 12, 22, 32])

In [33]:
# what if we had a tensor?
nptensorFloat = np.ones([3, 4, 5])
print(nptensorFloat)

np.sum(nptensorFloat, axis=2)
print(np.sum(nptensorFloat, axis=2).shape)

[[[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]]
(3, 4)


In [34]:
# what if we don't specify an axis?
print(np.sum(nptensorFloat))

60.0


In [None]:
# what other functions can we apply across axes?

In [None]:
# let's take it up a notch

nparrayRandomInt = np.random.randint(low=0, high=10, size=(3,4))
print(nparrayRandomInt)

print(nparrayRandomInt - np.min(nparrayRandomInt, axis=0))

# whaaat just happened? let's look at the shapes


In [None]:
# what if we try to do the subtract-min thing across axis 1?
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1))


In [None]:
# how can we fix that? make the arrays shape-compatible!
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1)[:, np.newaxis])

In [None]:
# is there another way to achieve this?
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1, keepdims=True))


## Why numpy?

Numpy is space efficient
(reference: https://www.geeksforgeeks.org/python-lists-vs-numpy-arrays/)

- very space efficient because it's based on C

In [35]:
# importing numpy package
import numpy as np
  
# importing system module
import sys
  
# declaring a list of 1000 elements 
S= range(1000)
  
# printing size of each element of the list
print("Size of each element of list in bytes: ",sys.getsizeof(S))
  
# printing size of the whole list
print("Size of the whole list in bytes: ",sys.getsizeof(S)*len(S))
  
# declaring a Numpy array of 1000 elements 
D= np.arange(1000)
  
# printing size of each element of the Numpy array
print("Size of each element of the Numpy array in bytes: ",D.itemsize)
  
# printing size of the whole Numpy array
print("Size of the whole Numpy array in bytes: ",D.size*D.itemsize)

Size of each element of list in bytes:  48
Size of the whole list in bytes:  48000
Size of each element of the Numpy array in bytes:  8
Size of the whole Numpy array in bytes:  8000


Numpy *can be* more time efficient (reference: https://stackoverflow.com/questions/9708783/numpy-vs-list-comprehension-which-is-faster)

In [36]:
import sys, numpy
import timeit #times things, use numpy for fast programming

def numpysum(n):
    a = numpy.arange(n) ** 2
    b = numpy.arange(n) ** 3
    return a + b

def pythonsum(n):
    a = [i ** 2 for i in range(n)]
    b = [i ** 3 for i in range(n)]
    return [a[i] + b[i] for i in range(n)]

for size in [10, 100, 1000]:
    print("size", size)
    print("time with python", timeit.timeit(lambda: pythonsum(size)))
    print("time with numpy", timeit.timeit(lambda: numpysum(size)))

size 10
time with python 8.557022005999897
time with numpy 4.4259566390001055
size 100
time with python 69.8246585930001
time with numpy 4.723551878000308
size 1000
time with python 770.5584602989998
time with numpy 10.63126297999952
