# NUMPY

In [1]:
import numpy as np

In [2]:
a1 = np.array([1, 2, 3, 4])
a2 = np.array((0.1, 0.2, 0.3))

a1, a2

(array([1, 2, 3, 4]), array([0.1, 0.2, 0.3]))

In [3]:
type(a1), type(a2)

(numpy.ndarray, numpy.ndarray)

In [4]:
a1.dtype, a2.dtype

(dtype('int64'), dtype('float64'))

In [5]:
import sys

#fixed overhead storing shape, dtype, ndim, pointer to actual data buffer, strides and reference counts.
# nbytes gives actual raw bytes of data (len(a1) * 8 bytes/64 bits = 32 bytes
sys.getsizeof(a1), a1.nbytes

(144, 32)

In [6]:
a1.dtype=np.uint8
a2.dtype=np.float16

type(a1), a1.dtype, sys.getsizeof(a1), a1.nbytes
# actual data not converted, just how NumPy interprets the existing bytes

(numpy.ndarray, dtype('uint8'), 144, 32)

In [7]:
# Create a new array to reduce the size
a3 = np.array([1, 2, 3, 4], dtype=np.uint8)
a3.dtype, sys.getsizeof(a3), a3.nbytes

(dtype('uint8'), 116, 4)

In [8]:
a3[2] = 255
a3

array([  1,   2, 255,   4], dtype=uint8)

In [9]:
a3[2] = 256
a3

OverflowError: Python integer 256 out of bounds for uint8

In [10]:
a3 = a3.astype(np.int16)
a3.dtype, sys.getsizeof(a3), a3.nbytes

(dtype('int16'), 120, 8)

In [11]:
a3[2] = 25600
a3

array([    1,     2, 25600,     4], dtype=int16)

In [12]:
a3[2] = 999_999_999_999_9999
a3

OverflowError: Python integer 9999999999999999 out of bounds for int16

In [13]:
a3

array([    1,     2, 25600,     4], dtype=int16)

# if we change the dtype and a larger number is found it wraps around since range of int8 is 255, the number 25600 mod 255 = 0 and hence changes to 0

In [15]:
a3 = a3.astype(np.int8)

a3

array([1, 2, 0, 4], dtype=int8)

In [16]:
a3.dtype, sys.getsizeof(a3), a3.nbytes

(dtype('int8'), 116, 4)

## shapes and sizes and dimensions

In [25]:
a1 = np.array([1, 2, 3, 4], dtype=np.uint8)
m1 = np.array([[1, 2, 3], 
               [4, 5, 6], 
               [7, 8, 9],
               [10, 11, 12]], dtype=np.int16)

In [26]:
len(a1), len(m1)

(4, 4)

In [27]:
a1.size, m1.size

(4, 12)

In [28]:
a1.shape, m1.shape

((4,), (4, 3))

In [29]:
a1.ndim, m1.ndim

(1, 2)

# creating np.arrays from scratch

In [30]:
np.zeros((2,3), dtype=np.float16)

array([[0., 0., 0.],
       [0., 0., 0.]], dtype=float16)

In [31]:
np.ones((3,2), dtype=np.int8)

array([[1, 1],
       [1, 1],
       [1, 1]], dtype=int8)

In [32]:
np.full((4,3), 3.14, np.float16)

array([[3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14]], dtype=float16)

In [33]:
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [34]:
np.eye(4, 5, dtype=np.int8)

array([[1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]], dtype=int8)

In [38]:
np.eye(3, k=1, dtype=np.uint8)

array([[0, 1, 0],
       [0, 0, 1],
       [0, 0, 0]], dtype=uint8)

In [39]:
np.eye(3, k=-2, dtype=np.uint8)

array([[0, 0, 0],
       [0, 0, 0],
       [1, 0, 0]], dtype=uint8)

In [41]:
np.arange(0, 10, 2), np.arange(1,5,1)

(array([0, 2, 4, 6, 8]), array([1, 2, 3, 4]))

In [49]:
np.linspace(0, 100, 11), np.linspace(0, 100, 10)

(array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.]),
 array([  0.        ,  11.11111111,  22.22222222,  33.33333333,
         44.44444444,  55.55555556,  66.66666667,  77.77777778,
         88.88888889, 100.        ]))

In [50]:
import math

In [52]:
x_coords = np.linspace(-2 * math.pi, 2 * math.pi)
x_coords

array([-6.28318531, -6.02672876, -5.77027222, -5.51381568, -5.25735913,
       -5.00090259, -4.74444605, -4.48798951, -4.23153296, -3.97507642,
       -3.71861988, -3.46216333, -3.20570679, -2.94925025, -2.6927937 ,
       -2.43633716, -2.17988062, -1.92342407, -1.66696753, -1.41051099,
       -1.15405444, -0.8975979 , -0.64114136, -0.38468481, -0.12822827,
        0.12822827,  0.38468481,  0.64114136,  0.8975979 ,  1.15405444,
        1.41051099,  1.66696753,  1.92342407,  2.17988062,  2.43633716,
        2.6927937 ,  2.94925025,  3.20570679,  3.46216333,  3.71861988,
        3.97507642,  4.23153296,  4.48798951,  4.74444605,  5.00090259,
        5.25735913,  5.51381568,  5.77027222,  6.02672876,  6.28318531])

In [53]:
type(x_coords)

numpy.ndarray

In [55]:
y_coords = np.sin(x_coords)
y_coords

array([ 2.44929360e-16,  2.53654584e-01,  4.90717552e-01,  6.95682551e-01,
        8.55142763e-01,  9.58667853e-01,  9.99486216e-01,  9.74927912e-01,
        8.86599306e-01,  7.40277997e-01,  5.45534901e-01,  3.15108218e-01,
        6.40702200e-02, -1.91158629e-01, -4.33883739e-01, -6.48228395e-01,
       -8.20172255e-01, -9.38468422e-01, -9.95379113e-01, -9.87181783e-01,
       -9.14412623e-01, -7.81831482e-01, -5.98110530e-01, -3.75267005e-01,
       -1.27877162e-01,  1.27877162e-01,  3.75267005e-01,  5.98110530e-01,
        7.81831482e-01,  9.14412623e-01,  9.87181783e-01,  9.95379113e-01,
        9.38468422e-01,  8.20172255e-01,  6.48228395e-01,  4.33883739e-01,
        1.91158629e-01, -6.40702200e-02, -3.15108218e-01, -5.45534901e-01,
       -7.40277997e-01, -8.86599306e-01, -9.74927912e-01, -9.99486216e-01,
       -9.58667853e-01, -8.55142763e-01, -6.95682551e-01, -4.90717552e-01,
       -2.53654584e-01, -2.44929360e-16])

In [57]:
[math.sin(x) for x in x_coords]

[2.4492935982947064e-16,
 0.2536545839095075,
 0.49071755200393785,
 0.6956825506034863,
 0.855142763005346,
 0.9586678530366608,
 0.9994862162006879,
 0.9749279121818236,
 0.886599306373,
 0.7402779970753156,
 0.5455349012105488,
 0.3151082180236206,
 0.064070219980713,
 -0.1911586287013721,
 -0.43388373911755823,
 -0.6482283953077884,
 -0.8201722545969558,
 -0.9384684220497603,
 -0.9953791129491981,
 -0.9871817834144502,
 -0.9144126230158127,
 -0.7818314824680298,
 -0.5981105304912161,
 -0.37526700487937437,
 -0.12787716168450639,
 0.1278771616845055,
 0.37526700487937353,
 0.5981105304912153,
 0.7818314824680298,
 0.9144126230158124,
 0.9871817834144501,
 0.9953791129491982,
 0.9384684220497606,
 0.8201722545969558,
 0.6482283953077891,
 0.43388373911755823,
 0.1911586287013734,
 -0.06407021998071255,
 -0.3151082180236194,
 -0.5455349012105482,
 -0.7402779970753144,
 -0.8865993063729997,
 -0.9749279121818236,
 -0.9994862162006879,
 -0.9586678530366608,
 -0.855142763005347,
 -0.69568

In [58]:
np.random.random(5)

array([0.91301539, 0.35642954, 0.31306983, 0.35517757, 0.48765907])

In [60]:
np.random.seed(0)
np.random.random((5,3))

array([[0.5488135 , 0.71518937, 0.60276338],
       [0.54488318, 0.4236548 , 0.64589411],
       [0.43758721, 0.891773  , 0.96366276],
       [0.38344152, 0.79172504, 0.52889492],
       [0.56804456, 0.92559664, 0.07103606]])

In [63]:
# simulate 10 dice rolls
np.random.randint(1, 6+1, (10,2))

array([[3, 6],
       [6, 6],
       [1, 2],
       [6, 2],
       [4, 1],
       [6, 1],
       [2, 3],
       [5, 3],
       [1, 6],
       [4, 3]])

# reshaping arrays

In [64]:
a = np.array([1, 2, 3, 4, 5, 6], dtype=np.uint8)
a

array([1, 2, 3, 4, 5, 6], dtype=uint8)

In [65]:
a.shape

(6,)

In [66]:
a.reshape((3,2))

array([[1, 2],
       [3, 4],
       [5, 6]], dtype=uint8)

In [67]:
a.reshape((3,3))

ValueError: cannot reshape array of size 6 into shape (3,3)

In [76]:
a.reshape((6,1))

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6]], dtype=uint8)

# reshaping arrays

In [78]:
a = np.arange(12)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [79]:
m = a.reshape((3,4))
m

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [80]:
m is a

False

## both objects are different but elements are still same 

In [81]:
a[0] = 100
a, m

(array([100,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11]),
 array([[100,   1,   2,   3],
        [  4,   5,   6,   7],
        [  8,   9,  10,  11]]))

In [82]:
m[2][3] = 200
a, m

(array([100,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10, 200]),
 array([[100,   1,   2,   3],
        [  4,   5,   6,   7],
        [  8,   9,  10, 200]]))

## for independent copy create a .copy()

In [83]:
a = np.arange(12)
m = a.reshape((3,4)).copy()
a[0] = 100
a , m

(array([100,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11]),
 array([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]]))

# stacking

In [84]:
a1 = np.arange(1, 6)
a1

array([1, 2, 3, 4, 5])

In [91]:
a2 = np.arange(10, 20). reshape(2, 5)
a2

array([[10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [92]:
s1 = np.vstack((a1, a2))
s1

array([[ 1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [93]:
np.vstack((np.arange(3), a2))

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 3 and the array at index 1 has size 5

# would accomodate the most broader dtype

In [95]:
a1 = np.array([1, 3, 4, 5], dtype = np.uint8)
a2 = np.array([3.14, 2, 20, 30], dtype = np.float16)
s1 = np.vstack((a1, a2))
s1

array([[ 1.  ,  3.  ,  4.  ,  5.  ],
       [ 3.14,  2.  , 20.  , 30.  ]], dtype=float16)

# unline reshaped arrays stacked arrays do not share elements

In [97]:
a1[0] = 99
a1, s1

(array([99,  3,  4,  5], dtype=uint8),
 array([[ 1.  ,  3.  ,  4.  ,  5.  ],
        [ 3.14,  2.  , 20.  , 30.  ]], dtype=float16))

In [99]:
s1[1][1] = 99
a1, s1

(array([99,  3,  4,  5], dtype=uint8),
 array([[ 1.  ,  3.  ,  4.  ,  5.  ],
        [ 3.14, 99.  , 20.  , 30.  ]], dtype=float16))

In [126]:
a1 = np.linspace(0, 10, 10, dtype = np.uint8).reshape(5, 2)
a2 = np.random.randint(10, 50, 25).reshape(5, 5)

In [127]:
a1, a2

(array([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8, 10]], dtype=uint8),
 array([[12, 37, 31, 49, 49],
        [21, 32, 40, 27, 16],
        [17, 28, 38, 29, 39],
        [31, 19, 35, 42, 37],
        [19, 38, 27, 10, 32]]))

In [128]:
np.hstack((a1, a2))

array([[ 0,  1, 12, 37, 31, 49, 49],
       [ 2,  3, 21, 32, 40, 27, 16],
       [ 4,  5, 17, 28, 38, 29, 39],
       [ 6,  7, 31, 19, 35, 42, 37],
       [ 8, 10, 19, 38, 27, 10, 32]])

# indexing

In [129]:
a2 = np.random.randint(10, 50, 25).reshape(5, 5)
a2

array([[26, 46, 40, 34, 13],
       [18, 37, 39, 33, 42],
       [29, 18, 17, 33, 23],
       [27, 10, 21, 38, 46],
       [35, 42, 24, 32, 38]])

In [132]:
a2[2][3]

np.int64(33)

# we can specify these indices as tupple as well

In [133]:
a2[(2,3)]

np.int64(33)

# we can omit the braces in tupple in numpy

In [134]:
a2[2,3]

np.int64(33)

In [135]:
a2[3,1] = 100
a2

array([[ 26,  46,  40,  34,  13],
       [ 18,  37,  39,  33,  42],
       [ 29,  18,  17,  33,  23],
       [ 27, 100,  21,  38,  46],
       [ 35,  42,  24,  32,  38]])

# as type allows changing the dtype by returns a new array

In [141]:
a2.astype(np.uint8)

array([[ 26,  46,  40,  34,  13],
       [ 18,  37,  39,  33,  42],
       [ 29,  18,  17,  33,  23],
       [ 27, 100,  21,  38,  46],
       [ 35,  42,  24,  32,  38]], dtype=uint8)

In [144]:
a2.dtype # original stayed same ! 

dtype('int64')

# slicing

In [146]:
a = np.arange(0, 25).reshape(5, 5)
a

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [148]:
a[1:3, 2:]

array([[ 7,  8,  9],
       [12, 13, 14]])

In [156]:
a[::2, 1::2]

array([[ 1,  3],
       [11, 13],
       [21, 23]])

# broadcast: usually if we replace the replacement has to be of same size if not then elements are boadcasted

In [158]:
a[::2, 1::2] = 99
a

array([[ 0, 99,  2, 99,  4],
       [ 5,  6,  7,  8,  9],
       [10, 99, 12, 99, 14],
       [15, 16, 17, 18, 19],
       [20, 99, 22, 99, 24]])

In [159]:
a[::2, 1::2] = [[91, 92], [93, 94], [95, 96]]
a

array([[ 0, 91,  2, 92,  4],
       [ 5,  6,  7,  8,  9],
       [10, 93, 12, 94, 14],
       [15, 16, 17, 18, 19],
       [20, 95, 22, 96, 24]])

In [160]:
b = a[::2, 1::2]
a, b

(array([[ 0, 91,  2, 92,  4],
        [ 5,  6,  7,  8,  9],
        [10, 93, 12, 94, 14],
        [15, 16, 17, 18, 19],
        [20, 95, 22, 96, 24]]),
 array([[91, 92],
        [93, 94],
        [95, 96]]))

# elements of slices are linked just as rehaped arrays' elements are linked

In [162]:
b[1:, 1:] = 100
a, b

(array([[  0,  91,   2,  92,   4],
        [  5,   6,   7,   8,   9],
        [ 10,  93,  12, 100,  14],
        [ 15,  16,  17,  18,  19],
        [ 20,  95,  22, 100,  24]]),
 array([[ 91,  92],
        [ 93, 100],
        [ 95, 100]]))

# fancy indexing
### use np.array([]) to fancy index. can be mixed with usual way of indexing as well

In [165]:
a = np.arange(1, 26)

In [164]:
a

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])

In [169]:
a[np.array([3, 4, 8, 13])]

array([ 4,  5,  9, 14])

# fancy indexing shapes after the index and not original array

In [171]:
a[np.array([[3, 4], [8, 13]])]

array([[ 4,  5],
       [ 9, 14]])

In [174]:
a = a.reshape(5, 5)

In [175]:
a

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25]])

In [176]:
a[3, np.array([0, 3, 4])]

array([16, 19, 20])

In [179]:
a[np.array([1, 2]), 2:]

array([[ 8,  9, 10],
       [13, 14, 15]])

In [182]:
a[np.array([1, 2]), 2:].reshape(6)

array([ 8,  9, 10, 13, 14, 15])

In [183]:
a[np.array([1, 2]), 2:].reshape(1, 6)

array([[ 8,  9, 10, 13, 14, 15]])

# when we use fancy indexing on both axeses, it works different. 
# think of this as zipping the incides from the two axis

In [184]:
a

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25]])

In [187]:
# gives elements: (1,1), (3,2), (4,0)
a[np.array([1, 3, 4]), np.array([1, 2, 0])]

array([ 7, 18, 21])

In [225]:
import csv
import datetime

In [226]:
headers = []
diff_data = []
with open('data/AAPL.csv') as f:
    reader = csv.reader(f)
    headers = next(reader)
    diff_data = list(reader)

In [227]:
headers

['Symbol', ' Date', ' Close', ' Volume', ' Open', ' High', ' Low']

In [228]:
diff_data

[['AAPL',
  ' 10/29/2020',
  ' 115.32',
  ' 146129200',
  ' 112.37',
  ' 116.93',
  ' 112.2'],
 ['AAPL',
  ' 10/28/2020',
  ' 111.2',
  ' 143937800',
  ' 115.05',
  ' 115.43',
  ' 111.1'],
 ['AAPL',
  ' 10/27/2020',
  ' 116.6',
  ' 92276770',
  ' 115.49',
  ' 117.28',
  ' 114.5399'],
 ['AAPL',
  ' 10/26/2020',
  ' 115.05',
  ' 111850700',
  ' 114.01',
  ' 116.55',
  ' 112.88'],
 ['AAPL',
  ' 10/23/2020',
  ' 115.04',
  ' 82572650',
  ' 116.39',
  ' 116.55',
  ' 114.28'],
 ['AAPL',
  ' 10/22/2020',
  ' 115.75',
  ' 101988000',
  ' 117.45',
  ' 118.04',
  ' 114.59'],
 ['AAPL',
  ' 10/21/2020',
  ' 116.87',
  ' 89945980',
  ' 116.67',
  ' 118.705',
  ' 116.45'],
 ['AAPL',
  ' 10/20/2020',
  ' 117.51',
  ' 124423700',
  ' 116.2',
  ' 118.98',
  ' 115.63'],
 ['AAPL',
  ' 10/19/2020',
  ' 115.98',
  ' 120639300',
  ' 119.96',
  ' 120.419',
  ' 115.66'],
 ['AAPL',
  ' 10/16/2020',
  ' 119.02',
  ' 115393800',
  ' 121.28',
  ' 121.548',
  ' 118.81'],
 ['AAPL',
  ' 10/15/2020',
  ' 120.71',
  '

In [229]:
diff_data = np.array(diff_data)
diff_data

array([['AAPL', ' 10/29/2020', ' 115.32', ' 146129200', ' 112.37',
        ' 116.93', ' 112.2'],
       ['AAPL', ' 10/28/2020', ' 111.2', ' 143937800', ' 115.05',
        ' 115.43', ' 111.1'],
       ['AAPL', ' 10/27/2020', ' 116.6', ' 92276770', ' 115.49',
        ' 117.28', ' 114.5399'],
       ['AAPL', ' 10/26/2020', ' 115.05', ' 111850700', ' 114.01',
        ' 116.55', ' 112.88'],
       ['AAPL', ' 10/23/2020', ' 115.04', ' 82572650', ' 116.39',
        ' 116.55', ' 114.28'],
       ['AAPL', ' 10/22/2020', ' 115.75', ' 101988000', ' 117.45',
        ' 118.04', ' 114.59'],
       ['AAPL', ' 10/21/2020', ' 116.87', ' 89945980', ' 116.67',
        ' 118.705', ' 116.45'],
       ['AAPL', ' 10/20/2020', ' 117.51', ' 124423700', ' 116.2',
        ' 118.98', ' 115.63'],
       ['AAPL', ' 10/19/2020', ' 115.98', ' 120639300', ' 119.96',
        ' 120.419', ' 115.66'],
       ['AAPL', ' 10/16/2020', ' 119.02', ' 115393800', ' 121.28',
        ' 121.548', ' 118.81'],
       ['AAPL', ' 10/15

In [230]:
diff_data[:, 1]

array([' 10/29/2020', ' 10/28/2020', ' 10/27/2020', ' 10/26/2020',
       ' 10/23/2020', ' 10/22/2020', ' 10/21/2020', ' 10/20/2020',
       ' 10/19/2020', ' 10/16/2020', ' 10/15/2020', ' 10/14/2020',
       ' 10/13/2020', ' 10/12/2020', ' 10/09/2020', ' 10/08/2020',
       ' 10/07/2020', ' 10/06/2020', ' 10/05/2020', ' 10/02/2020',
       ' 10/01/2020', ' 09/30/2020', ' 09/29/2020'], dtype='<U11')

In [231]:
date_data = [datetime.datetime.strptime(x.strip(), "%m/%d/%Y").date() for x in diff_data[:, 1]]
date_data

[datetime.date(2020, 10, 29),
 datetime.date(2020, 10, 28),
 datetime.date(2020, 10, 27),
 datetime.date(2020, 10, 26),
 datetime.date(2020, 10, 23),
 datetime.date(2020, 10, 22),
 datetime.date(2020, 10, 21),
 datetime.date(2020, 10, 20),
 datetime.date(2020, 10, 19),
 datetime.date(2020, 10, 16),
 datetime.date(2020, 10, 15),
 datetime.date(2020, 10, 14),
 datetime.date(2020, 10, 13),
 datetime.date(2020, 10, 12),
 datetime.date(2020, 10, 9),
 datetime.date(2020, 10, 8),
 datetime.date(2020, 10, 7),
 datetime.date(2020, 10, 6),
 datetime.date(2020, 10, 5),
 datetime.date(2020, 10, 2),
 datetime.date(2020, 10, 1),
 datetime.date(2020, 9, 30),
 datetime.date(2020, 9, 29)]

In [237]:
diff_data[:, 2].astype(np.float16)

array([115.3 , 111.2 , 116.6 , 115.06, 115.06, 115.75, 116.9 , 117.5 ,
       116.  , 119.  , 120.7 , 121.2 , 121.1 , 124.4 , 117.  , 115.  ,
       115.06, 113.2 , 116.5 , 113.  , 116.8 , 115.8 , 114.06],
      dtype=float16)

In [238]:
diff_data[:, 4].astype(np.float16)

array([112.4 , 115.06, 115.5 , 114.  , 116.4 , 117.44, 116.7 , 116.2 ,
       119.94, 121.25, 118.75, 121.  , 125.25, 120.06, 115.25, 116.25,
       114.6 , 115.7 , 113.94, 112.9 , 117.6 , 113.8 , 114.56],
      dtype=float16)

In [241]:
diff_data = (
    (diff_data[:, 2].astype(np.float16) - diff_data[:, 4].astype(np.float16)) 
    / 
    diff_data[:, 4].astype(np.float16)
) * 100

In [242]:
diff_data

array([ 2.613 , -3.37  ,  0.974 ,  0.932 , -1.128 , -1.437 ,  0.1606,
        1.13  , -3.283 , -1.855 ,  1.631 ,  0.155 , -3.293 ,  3.592 ,
        1.519 , -1.075 ,  0.3816, -2.16  ,  2.25  ,  0.1107, -0.691 ,
        1.758 , -0.4365], dtype=float16)

In [243]:
[(x,y) for x, y in zip(date_data ,diff_data)]

[(datetime.date(2020, 10, 29), np.float16(2.613)),
 (datetime.date(2020, 10, 28), np.float16(-3.37)),
 (datetime.date(2020, 10, 27), np.float16(0.974)),
 (datetime.date(2020, 10, 26), np.float16(0.932)),
 (datetime.date(2020, 10, 23), np.float16(-1.128)),
 (datetime.date(2020, 10, 22), np.float16(-1.437)),
 (datetime.date(2020, 10, 21), np.float16(0.1606)),
 (datetime.date(2020, 10, 20), np.float16(1.13)),
 (datetime.date(2020, 10, 19), np.float16(-3.283)),
 (datetime.date(2020, 10, 16), np.float16(-1.855)),
 (datetime.date(2020, 10, 15), np.float16(1.631)),
 (datetime.date(2020, 10, 14), np.float16(0.155)),
 (datetime.date(2020, 10, 13), np.float16(-3.293)),
 (datetime.date(2020, 10, 12), np.float16(3.592)),
 (datetime.date(2020, 10, 9), np.float16(1.519)),
 (datetime.date(2020, 10, 8), np.float16(-1.075)),
 (datetime.date(2020, 10, 7), np.float16(0.3816)),
 (datetime.date(2020, 10, 6), np.float16(-2.16)),
 (datetime.date(2020, 10, 5), np.float16(2.25)),
 (datetime.date(2020, 10, 2), 

# Masking
### Boolean masking uses an expressing that evals to a boolean for each element
### and make an array of T/F and uses that array to filter elements in another array

In [244]:
a = np.arange(1, 11)
a

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [246]:
mask = a > 3
mask

array([False, False, False,  True,  True,  True,  True,  True,  True,
        True])

In [248]:
a[np.array(mask)]

array([ 4,  5,  6,  7,  8,  9, 10])

In [249]:
a[a > 3]

array([ 4,  5,  6,  7,  8,  9, 10])

In [250]:
a [ a % 2 != 0 ] 

array([1, 3, 5, 7, 9])

## masking a > 1 dimention array will return a 1D array (as we can mask variable elements and numpy arrays should be all of same dimention

In [251]:
a[(a % 2 == 0) & (a > 5)]

array([ 6,  8, 10])

In [253]:
a[(a % 2 == 0) | (a == 5)]

array([ 2,  4,  5,  6,  8, 10])

# some universal functions which behave differently

In [254]:
np.amin(np.array([10, 5, 20]))

np.int64(5)

In [255]:
np.amax(np.array([10, 5, 20]))

np.int64(20)

In [257]:
m = np.array([[10, 3, 2], [4, 50, 6], [7, 8, 90]])
m

array([[10,  3,  2],
       [ 4, 50,  6],
       [ 7,  8, 90]])

In [258]:
np.amin(m)

np.int64(2)

In [259]:
np.amax(m)

np.int64(90)

In [263]:
# across rows, across columns
np.amin(m, axis=0), np.amin(m, axis=1)

(array([4, 3, 2]), array([2, 4, 7]))

In [264]:
np.amax(m, axis=0), np.amax(m, axis=1)

(array([10, 50, 90]), array([10, 50, 90]))

In [268]:
# across the rows
np.mean(m, axis = 0)

array([ 7.        , 20.33333333, 32.66666667])

In [269]:
(10+4+7)/3 , (3+50+8)/3

(7.0, 20.333333333333332)

In [270]:
np.sum(m)

np.int64(180)

In [272]:
# across columns
np.sum(m, axis = 1)

array([ 15,  60, 105])

# PANDAS (built on top of numpy)

1. series ( 1D array)
2. DataFrame (2D, collection of series)
3. Index: used to index series and DataFrame objects.
4. one of the key differences between and numpy is Index
5. Numpy array elements are indexed (implicitly) by position
6. In pandas we can assign our own (explicity) labels

In [273]:
import pandas as pd

In [274]:
idx = pd.Index([10, 20, 30])
idx

Index([10, 20, 30], dtype='int64')

In [275]:
idx = pd.Index(['element1', 'element2'])
idx

Index(['element1', 'element2'], dtype='object')

In [276]:
idx[0]

'element1'

# index is just a numpy array , we can index, fancy index and mask

In [277]:
idx[idx == 'element1']

Index(['element1'], dtype='object')

In [279]:
idx[1:]

Index(['element2'], dtype='object')

# indexes are immutable

In [280]:
idx[0] = 'element3'

TypeError: Index does not support mutable operations

# pandas index also have set like ops

In [284]:
idx_1 = pd.Index(['a', 'b', 'c'])
idx_2 = pd.Index(['c', 'd', 'f'])

idx_1.intersection(idx_2), idx_1.union(idx_2), idx_1.difference(idx_2), idx_1.symmetric_difference(idx_2)

(Index(['c'], dtype='object'),
 Index(['a', 'b', 'c', 'd', 'f'], dtype='object'),
 Index(['a', 'b'], dtype='object'),
 Index(['a', 'b', 'd', 'f'], dtype='object'))

# range index

In [287]:
idx = pd.Index(range(2, 10, 2))

In [289]:
idx[0]

2

In [291]:
idx[3]

8

In [292]:
list(idx)

[2, 4, 6, 8]

# do not exhaust

In [293]:
list(idx)

[2, 4, 6, 8]

In [294]:
2 in idx

True

# index need not be unique 

In [295]:
idx = pd.Index([1, 2, 2, 3, 4 ])

In [296]:
idx

Index([1, 2, 2, 3, 4], dtype='int64')

# series - 1D with explicit index

In [297]:
s = pd.Series([10, 20, 30], index = ['a', 'b', 'c'])
s

a    10
b    20
c    30
dtype: int64

In [298]:
s['a']

np.int64(10)

In [299]:
s['d'] = 100
s

a     10
b     20
c     30
d    100
dtype: int64

In [300]:
my_dictionary = {
    'a': 42,
    'b': 3.14,
    'c': 2.1616
}

s = pd.Series(my_dictionary)
s

a    42.0000
b     3.1400
c     2.1616
dtype: float64

In [301]:
s.index

Index(['a', 'b', 'c'], dtype='object')

In [302]:
s.values

array([42.    ,  3.14  ,  2.1616])

In [303]:
s.items()

<zip at 0x1205dec00>

In [304]:
list(s.items())

[('a', 42.0), ('b', 3.14), ('c', 2.1616)]

In [305]:
list(s.keys())

['a', 'b', 'c']

In [312]:
my_dictionary = {
    'country' : 'USA',
    'city': 'london',
    'country': 'France',
    'city' : 'paris',
    'country': 'UK'
}

s = pd.Series(['USA', 'UK', 'london', 'berlin', 'paris'], index=['country', 'country', 'city', 'city', 'city'])
s

country       USA
country        UK
city       london
city       berlin
city        paris
dtype: object

In [313]:
s.index, s.values

(Index(['country', 'country', 'city', 'city', 'city'], dtype='object'),
 array(['USA', 'UK', 'london', 'berlin', 'paris'], dtype=object))

In [314]:
s['country']

country    USA
country     UK
dtype: object

In [315]:
s['city']

city    london
city    berlin
city     paris
dtype: object

In [316]:
s['city'] = 'budapest'
s

country         USA
country          UK
city       budapest
city       budapest
city       budapest
dtype: object

In [318]:
s = pd.Series(['USA', 'UK', 'london', 'berlin', 'paris'], 
              index=['country', 'country', 'city', 'city', 'city'])
s

country       USA
country        UK
city       london
city       berlin
city        paris
dtype: object

In [320]:
s.iloc[1]

'UK'

In [321]:
s.iloc[1:-1]

country        UK
city       london
city       berlin
dtype: object

In [324]:
s = pd.Series([100, 200, 300, 400], index=[10, 20, 30, 40])
s

10    100
20    200
30    300
40    400
dtype: int64

# never use explicit index to slice and positionally index things - use iloc and loc

In [325]:
s[20:30]

Series([], dtype: int64)

In [331]:
s.loc[20:30] # inclusive of end range

20    200
30    300
dtype: int64

In [332]:
s.iloc[1:3] # exclusive of end range

20    200
30    300
dtype: int64

In [333]:
s = pd.Series(['USA', 'UK', 'london', 'berlin', 'paris'], 
              index=['country', 'country', 'city', 'city', 'city'],
             name= 'areas')
s

country       USA
country        UK
city       london
city       berlin
city        paris
Name: areas, dtype: object

In [335]:
s[s != 'UK']

country       USA
city       london
city       berlin
city        paris
Name: areas, dtype: object