In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Numpy
https://numpy.org/doc/stable/
- Implemented by C
- Based on **CPU** (Vectorization)
- Doesn't support Python generic, DL/ML functions

## + DL + GPU
- Pytorch(Facebook) -> Tensor based on NUMPY
- Tensorflow(Google) -> based on NUMPY
- SKlearn -> based on NUMPY, but doesn't support GPU
  - 

## NDArray
```Block memory + Indexing Scheme + Data Type Descriptor```
- Row-major
- Homogeneous Type
  - All elements in ndarray are the same type
  - Faster
- Implemented by **Array**, not **Linked List**
  - Python List is implemented by Linked List
- If objects have ```\_\_array\_\_``` method, it can be compatible with NDArray
  - ```np.array``` method is used for converting object to NDArray
  
### dType
- It's important for the time and space performance
- Both the types of C and the types of Python are supported
  - C is faster
  - **Smaller type is better than larger**
- Since NDArray is homogeneous, type is adjusted as the most general type
- Specify dType is ```dtype = np.Type```
- Check dType is ```object.dtype```
- Convert dType is ```object.astype(np.Type)```
- Re-interpret dType ```object.view(np.Type)```. **The memory remains**

In [78]:
# Check supported type
print(np.sctypeDict.keys())

# Performance
x = np.ones((int(1e8),), dtype=np.int64)
y = np.zeros((int(1e8),), dtype=np.int16)

%timeit x*x
%timeit y*y

x = np.array([1,2,3], dtype=np.int64)
y = x.astype(np.int32)
z = x.view(np.int32)
print(x)
print(y)
print(z)

dict_keys(['?', 0, 'byte', 'b', 1, 'ubyte', 'B', 2, 'short', 'h', 3, 'ushort', 'H', 4, 'i', 5, 'uint', 'I', 6, 'intp', 'p', 7, 'uintp', 'P', 8, 'long', 'l', 'L', 'longlong', 'q', 9, 'ulonglong', 'Q', 10, 'half', 'e', 23, 'f', 11, 'double', 'd', 12, 'longdouble', 'g', 13, 'cfloat', 'F', 14, 'cdouble', 'D', 15, 'clongdouble', 'G', 16, 'O', 17, 'S', 18, 'unicode', 'U', 19, 'void', 'V', 20, 'M', 21, 'm', 22, 'bool8', 'b1', 'int64', 'i8', 'uint64', 'u8', 'float16', 'f2', 'float32', 'f4', 'float64', 'f8', 'complex64', 'c8', 'complex128', 'c16', 'object0', 'bytes0', 'str0', 'void0', 'datetime64', 'M8', 'timedelta64', 'm8', 'int32', 'i4', 'uint32', 'u4', 'int16', 'i2', 'uint16', 'u2', 'int8', 'i1', 'uint8', 'u1', 'complex_', 'int0', 'uint0', 'single', 'csingle', 'singlecomplex', 'float_', 'intc', 'uintc', 'int_', 'longfloat', 'clongfloat', 'longcomplex', 'bool_', 'bytes_', 'string_', 'str_', 'unicode_', 'object_', 'int', 'float', 'complex', 'bool', 'object', 'str', 'bytes', 'a'])
117 ms ± 12.6

## Vectorization
- Arrays enable to express **batch operations** on data without any loop 'cause **sequential**
- Vectorization means the operations of sequential data on elementwise
- Numpy's ndarray is array-like. So, it can do parallels but it's important that the numpy is optimied at Big-Data. Thus, in small data, basic python list can be faster



## Shape, Size and Stride

### Shape
- ```(z, y, x)```
- Each ```vector``` has x elements
- Each ```matrix``` has y vectors
- Each ```3D array``` has z matrics
- **NDIM** is equals to ```len(shape)```
- Each ndarray is called by **NDIM** Tensor
  - shape = (n,) is 1st tensor, same as vector
  - shape = (m,n,) is 2nd tensor, same as matrix
  - shape = (p,m,n,) is 3rd tensor
- Examples(Dataset structure)
  - ```Vector Data```
    - Simple Dataset
    - 2D tensor(samples, features)
  - ```Timeseries or Sequential Data```
    - NL or Audio
    - 3D tensor(samples, timesteps, features)
  - ```Images Data```
    - 4D tensor(samples, height, width, channels) or (samples, channels, height, width)
  - ```Videos Data```
    - 5D tensor(samples, frames, Image)
    
### Size
- The size of the object
  - ```size = The products of all the shape of ndarray```
- ```object.itemsize``` shows element's bytes
  - The real size of objects is ```object.itemsize * object.size```
- ```np.resize``` changes the size of objects, so can loss its data
  - ```obj.resize``` can onnly change its own data

### Stride
- ```(matrix's stride, vector's stride, element's stride)```
- stride is based on **octet**
- ```transpose``` changes the strides of ndarray and set flag **transpose** so just **O(1)**


### Memory block
- Memory block maybe **shared**
- ```ndarray.data``` shows the memory block of objects, sequential
- ```ndarray.base``` refers to base object if ndarray is created by other ndarray
- **COPY-ON-DEMAND** isn't supported. For using independently, use **COPY**


### Byte order
- **=** := native
- **<** := little-endian
- **>** := big-endian
- **|** := not-relevant(endian doesn't matter)

### Row-major(C-style) vs Column-major(Fortran-style)
- The stride is applied by row/column
- Numpy is defaultly row-major. The ```order='C'|'F'``` specifies the major

### Axis
- The axis of an array describes the position of indexing
```
[[1,2], 
 [3,4]]
 
 arr[x,:] is [1,2] or [3,4] <-- axis = 0
 arr[:, y] is [1,3] or [2,4] <-- axis = 1
```

In [66]:
tensor = np.arange(100, dtype=np.int32).reshape((5, 5, 4, 1))
assert tensor.ndim == 4 == len(tensor.shape)
assert tensor.strides == (4*20, 4*4, 4*1, 4*1)
assert tensor.transpose().strides == (4*1, 4*1, 4*4, 4*20)

# Axis
print(np.mean(np.arange(1, 101).reshape(20,5), axis=0)) # The elements in axis=0 are reduced by mean operation
print(np.mean(np.arange(1, 101).reshape(20,5), axis=1)) # The elemetns in axis=1 are reduced by mean operation

x = np.resize(tensor, (3,4))
assert x.size == 3*4
print("real memory size:", x.size * x.itemsize)

[48.5 49.5 50.5 51.5 52.5]
[ 3.  8. 13. 18. 23. 28. 33. 38. 43. 48. 53. 58. 63. 68. 73. 78. 83. 88.
 93. 98.]
real memory size: 48


## Broadcasting
- Operation with **different-shape** arrays
- Basic numeric operation aren't applied at different shapes
- By **PULLING** tensor, the operation can be done
- **PULLING** doesn't copy memory. Just re-use if it is fulfilled

### Pulling Condition
- The paired dimension is same
- Pairing can't be transposed
- Pulled demension **must** be ```1```
- Pulling follows tensor definition
  - 1D -> 2D -> 3D -> ...
  - If operands are the same demension, Pulling is possible if Pulled demension is **1**

In [68]:
x = np.ones((10,))
y = np.ones((1,))
x + y
z = np.ones((2,))
x + z # Error

ValueError: operands could not be broadcast together with shapes (10,) (2,) 

In [73]:
x = np.ones((2, 3)) # shape = (2, 3)
y = np.ones((3,)) # shape = (1, 3)
x + y

z = np.ones((2, 1)) # shape = (2,1)
x + z

w = np.ones((3, 1)) # shape = (3, 1)
x + w # Error

ValueError: operands could not be broadcast together with shapes (2,3) (3,1) 

In [84]:
x = np.ones((3, 4, 5)) # shape = (3, 4, 5)
y = np.ones((5, )) # shape = (1, 5)
x + y

z = np.ones((4, 5)) # shape = (1, 4, 5)
x + z 

w = np.ones((3, 4)) # shape = (3, 4)
x + w # Error 'cause (3, 4) is higher dimension in 3D tensor

ValueError: operands could not be broadcast together with shapes (3,4,5) (3,4) 

## ufunc(universal Functions)
- Numpy supports vectorized operations already compiled
- The operations apply elementwise defaultly
- It is **UFUNC**

### Benefits
- Simple and Optimistic
- How to do is better, faster than What to do
  - By using dtype, the performance can be upgraded
- Vectorization is better than loop
- **DON't** mix default python functions

In [91]:
# ufunc for one-tensor
x = np.arange(-20, 0)
np.abs(x)

# ufunc for more tensors
y = np.arange(1, 2)
np.add(x, y) # broadcasting and casting is supported
np.floor_divide(x, y)

array([-20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10,  -9,  -8,
        -7,  -6,  -5,  -4,  -3,  -2,  -1])

## Creation

In [37]:
import numpy as np

# Constructor
x = np.ndarray(shape=(4, 3), dtype=np.uint8) # Fill random elements
print(x)

# Convert sequence objects to ndarray
x = np.array([1,2,3], dtype=np.uint8) # Copy default
y = np.asarray(x) # If x is ndarray, then share it. Otherwise copy it
assert x is y

# Like builtin range but return ndarray
x = np.arange(10, dtype=np.uint8)

# Fill ones

x = np.ones(shape = (3,4), dtype=np.uint8)

y = np.ones_like(x, dtype=np.float32) # Fill ones like x's shape
assert x.shape == y.shape

# Fill zeros

x = np.zeros(shape = (3,4), dtype=np.uint8)
y = np.zeros_like(x)
assert x.shape == y.shape

# Allocate memory but don't populate with any values
x = np.empty(shape=(3,4))
y = np.empty_like(x)
assert x.shape == y.shape

# identical matrix, 2D tensor
x = np.eye(3)
y = np.identity(3)
print(x)

# Create with interval
x = np.linspace(0, 10, num=10, dtype=np.uint8)
print(x)
x = np.logspace(0, 10, base=2, num=10, dtype=np.uint16) # logscale
print(x)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[ 0  1  2  3  4  5  6  7  8 10]
[   1    2    4   10   21   47  101  219  474 1024]


## NDArray vs Matrix(MATLAB support)
||NDArray|Matrix|
|------|---|---|
|Demension|Tensor|2D|
|\* operator|Elementwise|Matrix|
|numpy.multiply|Elementwise|Elementwise|
|numpy.dot|Matrix|Matrix

In [351]:
import numpy as np

mat = np.matrix([1,2])
assert matX.shape == (1,2)

mat1 = np.matrix([[1,2],
                 [3,4]])
mat2 = np.matrix([[1,2],
                 [3,4]])

tensor1 = np.array([[1,2],
                   [3,4]])

tensor2 = np.array([[1,2],
                   [3,4]])

print(mat1*mat2)
print(tensor1 * tensor2)

# tensor multiplication
# Saclar, 1D, 2D tensor is same as matrix
x = np.array([1,2])
assert np.all(x*2 == np.dot(x, 2))
y = np.array([3,4])
assert np.all(np.dot(x, y) == 11)

x = np.array([[1, 2],
             [3, 4]])
assert np.all(np.dot(x, x) == np.array([[7, 10],
                                       [15, 22]]))

# Ndarray, is calcuated by
x = np.arange(150).reshape(2, 3, 5, 5)
r = np.dot(x, x)
assert r.shape == (2, 3, 5, 2, 3, 5)
assert r[0,0,0,0,0,3] == np.sum(x[0,0,0,:] * x[0,0,:,3])

# The last sub-tensor of first tensor and the second last sub-tensor of second tensor is calculated by the sum-production

[[ 7 10]
 [15 22]]
[[ 1  4]
 [ 9 16]]


## Casting&Indexing&Slicing&Searching

### Casting
- Casting is only types, **not value**
- Casting largest type is safe
- ```astype```, ```view```

### Indexing & Slicing & Searching
- If indexing or Slicing returns ndarray, they share base object
- A single ellipsis can occur
- Searching is done by ```np.where()``` or statical methods like ```np.max/min```
  - ```np.where(condition, [x, y])```
    - If condition == True then yield x. Otherwise yield y

In [244]:
import numpy as np

x = np.array([1,2,3])
y = x[0:1]

assert x.base is None
assert y.base is x

# But not the same object
assert not x is y

x = np.arange(100).reshape((10,5,2))

# Slicing with referneces
# refArr is 1D tensor and the shape must be same with corresponding axis
x = np.arange(20).reshape((5,4))
refAxis0 = np.array(['A', 'B', 'A', 'C', 'C'])
refAxis1 = np.array(['A', 'B', 'C', 'D'])
x[refAxis0=='A']
x[refAxis0=='A', refAxis1=='C']

# Copy sliced array(Fancy indexing)
# Sliced array remains its original dimension
# Wrap the copied subarray with the square brakets
# The order in brakets can be arbitrary
y = x[[1, 0]]
assert len(x.shape) == 2
assert len(y.shape) == 2


# Negative Index refers from end to start
x[-1,-1]

# Searching
print(x.max(axis=0)) # Search maximum in axis-0
print(x.argmax(axis=1)) # Search maximum, default axis is 0 and at then, just return scalar

x = np.array([1,2,3,4,5])
np.where(x > 3) # Search the indice of fulfilling conditions
# Search the indice and then find the elements
assert np.all(x[np.where(x > 3)] == x[x > 3] )

# Ellipsis
x = np.arange(100).reshape((2,2,5,5))
assert x[...,0].shape == (2, 2, 5) # Ellipsis skip all axis before explicitly denoted

# Advanced

# In python list, ::n means per interval
assert [1,2,3][::-1] == [3,2,1]
assert [1,2,3,4,5][::2] == [1,3,5]

# wrap array one more dimension
# If None is indexed, the dimension is increased
x = np.arange(10).reshape(2, 5)
assert np.newaxis == None
assert x[np.newaxis].shape == (1, 2, 5)
assert x[:,np.newaxis].shape == (2, 1, 5)
assert x[...,np.newaxis].shape == (2, 5, 1)

[16 17 18 19]
[3 3 3 3 3]


In [234]:
x = np.array([1,2,3])
x[None]

array([[1, 2, 3]])

### Examples

#### Substitude negative values as zero

In [149]:
import numpy as np

x = np.arange(-5, 5)

#1. List comprehension
y = [datum if datum >= 0 else 0 for datum in x]
print(y)

#2. Indexing
y = np.array(x)
y[y < 0] = 0
print(y)

#3. where
y = np.where(x < 0, 0, x)
print(y)

#4. Clip
# np.clip(array-like, min, max) clips the array in [min, max]
y = np.clip(x, 0, 4)
print(y)
print(x.clip(0)) # clips the current array with interval

[0, 0, 0, 0, 0, 0, 1, 2, 3, 4]
[0 0 0 0 0 0 1 2 3 4]
[0 0 0 0 0 0 1 2 3 4]
[0 0 0 0 0 0 1 2 3 4]
[0 0 0 0 0 0 1 2 3 4]


## Shape

In [202]:
import numpy as np
x = np.arange(1, 13) # (12,)
print(x)
y = x.reshape((3,4), order='C')
print(y)

# Return to the flatten array
# To flatten original x, the order MUST be same
x = y.ravel(order='C') # Return the "view" if possible, share memory
x = y.flatten('C') # Return the "copy"
print(x)

# Reshape with negative, interpreted by maximum
y = x.reshape((3, -100))
assert y.shape == (3,4) # 3 * x = 12; x = 4

[ 1  2  3  4  5  6  7  8  9 10 11 12]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[ 1  2  3  4  5  6  7  8  9 10 11 12]


## Stacking

In [245]:
# Stack at Axis, the stacked dimension must be same
# The shape of returned array is (..., the number of stacked arr(at axis), ...)
x = np.arange(1, 13)
y = np.arange(13, 25)
z = np.stack((x, y)) # Default axis is 0
print(z)
z = np.stack((x, y), axis = 1)
print(z)

x = x.reshape((3,4))
y = y.reshape((3,4))
assert np.stack((x, y)).shape[0] == 2
assert np.stack((x,y), axis=1).shape[1] == 2

[[ 1  2  3  4  5  6  7  8  9 10 11 12]
 [13 14 15 16 17 18 19 20 21 22 23 24]]
[[ 1 13]
 [ 2 14]
 [ 3 15]
 [ 4 16]
 [ 5 17]
 [ 6 18]
 [ 7 19]
 [ 8 20]
 [ 9 21]
 [10 22]
 [11 23]
 [12 24]]


## Split

```numpy.split(array-like, indices or sections, axis = 0)```
- indices or sections is ```int or 1D tensor```
  - integer means that the array-like is divided by n-equal subarries
  - 1D tensor means that the array-like is divided by slicing
  
- ```numpy.hsplit == numpy.split(axis = 1)```
- ```numpy.vsplit == numpy.split(axis = 0)```

In [246]:
import numpy as np
x = np.arange(18).reshape(6, 3)
y = np.split(x, 6, axis = 0)
assert len(y) == 6

try:
    y = np.split(x, 4) # Error. array split doesn't result in an equal division
except ValueError:
    pass

y = np.split(x, [1,2], axis = 1) #split to x[, :1], x[, 1:2], x[, 2:] 
for arr in y:
    print(arr.shape)

(6, 1)
(6, 1)
(6, 1)


## Concatenate
```
numpy.concatenate((x, y), axis=0)
```
- Concatenate arraies at axis's **tensorwise**
```
x = [1,2,3], y = [4,5,6] ==> [ [1,2,3],
                               [4,5,6] ]
```
``` 
numpy.concatenate((x,y), axis = 0) ==
numpy.vstack([x,y])
```

```
x = [1,2,3], y = [4,5,6] ==> [ 1, 2, 3, 4, 5, 6 ]
```
```
numpy.concatenate(axis = 1) == 
numpy.hstack([x, y])
```

```
x = [1,2,3], y = [4,5,6] ==> [ [1, 4],
                               [2, 5],
                               [3, 6] ]
```
```
numpy.concatenate((x.T, y.T), axis=1)
```

In [276]:
import numpy as np

x = np.array([1,2,3]).reshape(1,3)
y = np.array([4,5,6]).reshape(1,3)

z = np.concatenate((x, y), axis = 0) # Default = 0
w = np.vstack((x,y))
r = np.r_[x, y]
print(z)
assert np.all(z == w)
assert np.all(z == r)
print("=====================")
z = np.concatenate((x, y), axis = 1)
w = np.hstack((x, y))
print(z)
assert np.all(z == w)

print("=====================")
z = np.concatenate((x.T, y.T), axis = 1)
print(z)

[[1 2 3]
 [4 5 6]]
[[1 2 3 4 5 6]]
[[1 4]
 [2 5]
 [3 6]]


## Transpose and Swap axis
- Transpose changes **the stride** and set **transpose flag**

In [296]:
import numpy as np
x = np.arange(210).reshape(2,3,5,7)

assert np.all(x.T[0,0,0,:] == x[:,0,0,0])
assert np.all(x.T[0,:,0,0] == x[0,0,:,0])

# Swap axis

x = np.array([[1,2,3],
             [4,5,6]])
y = np.swapaxes(x, 0, 1)
print(y)

[[1 4]
 [2 5]
 [3 6]]


## Adding new axis

In [305]:
import numpy as np
x = np.arange(100).reshape(2,2,5,5)

assert x[np.newaxis,...].shape == (1, 2, 2, 5, 5)
assert x[:,np.newaxis,...].shape == (2, 1, 2, 5, 5)

# Repeat an array
# np.tile(x, (repeated axis))
x = np.arange(6).reshape(2, 3)
y = np.tile(x, (2, 3))
assert y.shape == (2*2, 3*3)

## Sorting

In [312]:
import numpy as np
x = np.random.random((3,4))

y = np.sort(x, axis = 0) # Sorting each column 'cause the baseline is axis=0
y = np.sort(x, axis = 1) # Sorting each row 'cause the baseline is axis=1

# Reverse sorting
y = np.sort(x, axis = 0)[::-1]

[[0.15095565 0.0119836  0.12172796 0.25271014]
 [0.32672276 0.88939201 0.37612024 0.38790659]
 [0.66201004 0.92629705 0.81595164 0.55727674]]
[[0.12172796 0.15095565 0.25271014 0.88939201]
 [0.0119836  0.32672276 0.37612024 0.38790659]
 [0.55727674 0.66201004 0.81595164 0.92629705]]


## Save and Load

In [329]:
import numpy as np

x = np.array([1,2,3])
y = np.array([4,5,6])

# Single array
np.save(r"./single.npy", x)
z = np.load(r'./single.npy')
print(z)

# Several arrays
np.savez(r"./several.npz", x=x, y=y) # For indexing, specify the identifiers
with np.load(r"./several.npz") as l:
    print(l['x'], l['y'])

# Compressed Several arrays
np.savez_compressed(r"./compressed.npz", x=x, y=y) # For indexing, specify the identifiers
with np.load(r"./compressed.npz") as l:
    print(l['x'], l['y'])
    
# Save textfile
# 1D or 2D only
np.savetxt(r"arr.txt", x)
z = np.loadtxt(r'arr.txt')
print(z)

[1 2 3]
[1 2 3] [4 5 6]
[1 2 3] [4 5 6]
[1. 2. 3.]
