# NumPy

## Creating Arrays

### `arange()`

In [1]:
import numpy as np

# One argument: integers from 0 up to (but not including) the stop value.
a = np.arange(10)
# start, stop, step: every second integer starting at 0 and stopping before 10.
b = np.arange(0, 10, 2)

# Show both arrays with a blank line between them.
print(a, b, sep="\n\n")

[0 1 2 3 4 5 6 7 8 9]

[0 2 4 6 8]


### `zeros()`

In [2]:
import numpy as np

# An integer shape gives a rank 1 array; a tuple gives one axis per entry.
a = np.zeros(shape=5)
b = np.zeros(shape=(2, 3))

# Show both arrays with a blank line between them.
print(a, b, sep="\n\n")

[0. 0. 0. 0. 0.]

[[0. 0. 0.]
 [0. 0. 0.]]


### `full()`

In [3]:
import numpy as np

# A 2x3 array in which every element is 8.
a = np.full(shape=(2, 3), fill_value=8)
print(a)

[[8 8 8]
 [8 8 8]]


### `eye()`

In [4]:
import numpy as np

# 4x4 array with ones on the main diagonal and zeros elsewhere.
a = np.eye(N=4)
print(a)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


### `random()`

Values are drawn from the half-open interval [0, 1): 0 is inclusive, 1 is exclusive.

In [5]:
import numpy as np

# 2x4 array of random floats; values differ on every run because no seed is set.
a = np.random.random(size=(2, 4))
print(a)

[[0.97852653 0.19975697 0.77728757 0.30968452]
 [0.61081188 0.15399567 0.99342223 0.0154658 ]]


### `array()`

In [6]:
import numpy as np

listy = list(range(1, 6))

# A flat list becomes a rank 1 array.
a = np.array(listy)
print(a)

[1 2 3 4 5]


In [7]:
import numpy as np

list1 = list(range(1, 6))
list2 = [6, 7, 8, 9, 0]

# Two equal-length lists stacked together become the rows of a rank 2 array.
a = np.array((list1, list2))
print(a)

[[1 2 3 4 5]
 [6 7 8 9 0]]


## Array Indexing

In [8]:
import numpy as np

# Build an ndarray so this section demonstrates NumPy indexing
# rather than indexing of a plain Python list.
a = np.array([1, 2, 3, 4, 5])

# An integer index on a rank 1 array returns the element at that position.
print(a[0])

1


In [9]:
import numpy as np

a = list(range(1, 6))
b = [6, 7, 8, 9, 0]
c = np.array((a, b))

# Rank 2 arrays take one index per axis: row 0, column 0.
print(c[0, 0])

1


In [10]:
import numpy as np

a = list(range(1, 6))
c = np.array(a)

# Indexing with a list of positions picks out exactly those elements.
print(c[[2, 4]])

[3 5]


### Boolean Indexing

In [11]:
import numpy as np

a = list(range(1, 6))
c = np.array(a)

# Comparing an array with a scalar yields a boolean mask; indexing with that
# mask keeps only the elements where the mask is True.
print(c > 2, c[c > 2], sep="\n\n")

[False False  True  True  True]

[3 4 5]


### Slicing Arrays

In [12]:
import numpy as np

a = np.arange(20)
# Keep the elements whose remainder modulo 2 is non-zero, i.e. the odd numbers.
odd_nums = a[a % 2 != 0]

print(odd_nums)

[ 1  3  5  7  9 11 13 15 17 19]


In [13]:
import numpy as np

a = np.array([
    [1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8],
    [9, 8, 7, 6, 5],
])
# Rows 1 and 2, first three columns.
b = a[1:3, :3]
# Negative bounds count from the end: last two rows, last two columns.
c = a[-2:, -2:]

print(b, c, sep="\n\n")

[[4 5 6]
 [9 8 7]]

[[7 8]
 [6 5]]


Slicing returns a view of the original array, not a copy, so writing to the slice also changes the original array.

In [14]:
import numpy as np

a = np.array([
    [1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8],
    [9, 8, 7, 6, 5],
])
# Rows 1 onward, columns 2 onward.
b = a[1:, 2:]

print(b, end="\n\n")

# Writing through the slice: b[0, 2] is the same element as a[1, 4].
b[0, 2] = 88

print(a)

[[6 7 8]
 [7 6 5]]

[[ 1  2  3  4  5]
 [ 4  5  6  7 88]
 [ 9  8  7  6  5]]


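The shape of the result depends on how the array is indexed: a slice on an axis (`a[2:, :]`) keeps that axis, while an integer index (`a[2, :]`) drops it.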

In [15]:
import numpy as np

a = np.array([
    [1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8],
    [9, 8, 7, 6, 5],
])
# A slice on the row axis keeps that axis, so the result stays rank 2.
b = a[2:, :]
# An integer on the row axis removes that axis, so the result is rank 1.
c = a[2, :]

print(b, b.shape, sep="\n", end="\n\n")
print(c, c.shape, sep="\n")

[[9 8 7 6 5]]
(1, 5)

[9 8 7 6 5]
(5,)


## Reshaping Arrays

### `reshape()`

In [16]:
# convert rank 1 array to rank 2

import numpy as np

a = np.array([
    [1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8],
    [9, 8, 7, 6, 5],
])
b = a[2, :]

print(b, end="\n\n")

# A dimension given as -1 is worked out from the number of elements,
# so (1, -1) produces the same result as (1, 5) here.
b = b.reshape((1, -1))
print(b)

[9 8 7 6 5]

[[9 8 7 6 5]]


In [17]:
# convert rank 2 array to rank 1

import numpy as np

a = np.array([
    [1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8],
    [9, 8, 7, 6, 5],
])
b = a[2:, :]

print(b, end="\n\n")

# A single -1 flattens the array into one dimension.
b = b.reshape(-1)
print(b)

[[9 8 7 6 5]]

[9 8 7 6 5]


## Array Math

### Addition / Subtraction / Multiplication / Division

In [18]:
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]])
y = np.array([[7, 8, 9], [2, 3, 4]])

# Each call is the function form of the operator named in its comment;
# all four work element by element.
print(np.add(x, y), end="\n\n")       # same as x + y
print(np.subtract(x, y), end="\n\n")  # same as x - y
print(np.multiply(x, y), end="\n\n")  # same as x * y
print(np.divide(x, y))                # same as x / y

[[ 8 10 12]
 [ 6  8 10]]

[[-6 -6 -6]
 [ 2  2  2]]

[[ 7 16 27]
 [ 8 15 24]]

[[0.14285714 0.25       0.33333333]
 [2.         1.66666667 1.5       ]]


### Dot Product

### `dot()`

In [19]:
import numpy as np

x = np.array([2, 3])
y = np.array([4, 2])

# Method form of np.dot(x, y): multiply matching elements and add them up.
print(x.dot(y))  # 2*4 + 3*2

14


A dot product of two rank 2 arrays is the same as matrix multiplication

In [20]:
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]])
y = np.array([[7, 8], [9, 10], [11, 12]])

# (2, 3) dot (3, 2) gives a (2, 2) result; method form of np.dot(x, y).
print(x.dot(y))

[[ 58  64]
 [139 154]]


## Matrix

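A `matrix` is strictly two-dimensional, while an `ndarray` can have any number of dimensions. (NumPy's documentation recommends regular `ndarray`s over `np.matrix` for new code.)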

In [21]:
import numpy as np

# Build a 2x2 matrix object from nested lists.
x = np.matrix(data=[[1, 2], [4, 5]])

print(x)

[[1 2]
 [4 5]]


In [22]:
# convert NumPy array to matrix

import numpy as np

x = np.array(((1, 2), (3, 4)))
# Interpret the existing array as a matrix object.
y = np.asmatrix(x)

print(y)

[[1 2]
 [3 4]]


When multiplying two `ndarray`s with `*`, the result is element-wise multiplication.

When multiplying two matrices with `*`, the result is the matrix product (the same result as `np.dot`).

In [23]:
import numpy as np

x1 = np.array([[1, 2], [4, 5]])
y1 = np.array([[7, 8], [2, 3]])

# `*` on ndarrays multiplies element by element.
# Uses x1/y1 defined in this cell, so the result does not depend on
# variables left over from other cells.
print(x1*y1)

print()

x2 = np.matrix([[1, 2], [4, 5]])
y2 = np.matrix([[7, 8], [2, 3]])

# `*` on np.matrix objects performs matrix multiplication.
print(x2*y2)

[[ 7 16]
 [ 8 15]]

[[11 14]
 [38 47]]


## Cumulative Sum

### `cumsum()`

In [24]:
import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Without an axis the running total is taken over the flattened array.
print(a, np.cumsum(a), sep="\n\n")

[[1 2 3]
 [4 5 6]
 [7 8 9]]

[ 1  3  6 10 15 21 28 36 45]


In [25]:
import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# axis=0: the running total moves down the rows, one total per column.
print(a, np.cumsum(a, axis=0), sep="\n\n")

[[1 2 3]
 [4 5 6]
 [7 8 9]]

[[ 1  2  3]
 [ 5  7  9]
 [12 15 18]]


In [26]:
import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# axis=1: the running total moves across the columns, one total per row.
print(a, np.cumsum(a, axis=1), sep="\n\n")

[[1 2 3]
 [4 5 6]
 [7 8 9]]

[[ 1  3  6]
 [ 4  9 15]
 [ 7 15 24]]


## Sorting

### `sort()`

In [27]:
import numpy as np

a = np.array([34, 12, 37, 5, 13])

# np.sort returns a sorted copy, so `a` is printed afterwards in its
# original order.
print(np.sort(a), a, sep="\n\n")

[ 5 12 13 34 37]

[34 12 37  5 13]


### `argsort()`

In [28]:
import numpy as np

a = np.array([34, 12, 37, 5, 13])

# Function form of a.argsort(): the positions that would put `a` in ascending order.
print(np.argsort(a))

[3 1 4 0 2]


In [29]:
import numpy as np

persons = np.array(['Fourth', 'Second', 'Fifth', 'First', 'Third'])
ages = np.array([34, 12, 37, 5, 13])

# Positions that order `ages` ascending; indexing `persons` with them
# lists the names from youngest to oldest.
sort_indices = np.argsort(ages)
# The same positions read backwards give oldest to youngest.
reverse_sort_indices = sort_indices[::-1]

print(persons[sort_indices], persons[reverse_sort_indices], sep="\n\n")

['First' 'Second' 'Third' 'Fourth' 'Fifth']

['Fifth' 'Fourth' 'Third' 'Second' 'First']


## Array Assignment

### Copy by Reference

In [30]:
import numpy as np

a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

# Plain assignment binds a second name to the same array object;
# nothing is copied.
b = a

### Copy by View (Shallow Copy)

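A view is a new array object that shares the same underlying data as the original, so changing the shape of one does not change the shape of the other.

Because the data is shared, a value changed through either array is still visible in both.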

### `view()`

In [31]:
import numpy as np

a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

# `b` is a separate array object that looks at the same data as `a`.
b = a.view()

# Reshape `a` in place, then write one element through it.
a.shape = (1, -1)
a[0, 0] = 9

# `b` keeps its own 2x4 shape but shows the changed value.
print(a, b, sep="\n\n")

[[9 2 3 4 5 6 7 8]]

[[9 2 3 4]
 [5 6 7 8]]


### Copy by Value (Deep Copy)

### `copy()`

In [32]:
import numpy as np

a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

# copy() duplicates the data, so `b` is independent of `a`.
b = a.copy()

a[0, 0] = 9

# Only `a` shows the change.
print(a, b, sep="\n\n")

[[9 2 3 4]
 [5 6 7 8]]

[[1 2 3 4]
 [5 6 7 8]]


# Pandas

## Series

A Series behaves like a dictionary: each value is stored under an index label, and values can be looked up by that label.

### Creating a Series

In [33]:
import pandas as pd

# With no index given, the labels are the positions 0..4.
series = pd.Series(data=[1, 2, 3, 4, 5])

print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [34]:
import pandas as pd

# Custom labels; note that 'c' is used twice, which a Series allows.
series = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'c'],
)

print(series)

a    1
b    2
c    3
d    4
c    5
dtype: int64


### Accessing Elements in a Series

In [35]:
import pandas as pd

series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'c'])

# Positional access is spelled out explicitly. A bare integer key such as
# `series[2]` on a label-indexed Series relies on a positional fallback that
# newer pandas releases deprecate, so the explicit accessors are used instead.
print(series.iloc[2])  # integer-location indexer
print(series.iat[2])   # scalar access by integer position

3
3


In [36]:
import pandas as pd

series = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'c'],
)

# Two spellings of label-based lookup; 'd' occurs once, so each gives a scalar.
print(series['d'], series.loc['d'], sep="\n")

4
4


In [37]:
import pandas as pd

series = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'c'],
)

# 'c' labels two entries, so the lookup returns a Series holding both.
print(series['c'])

c    3
c    5
dtype: int64


In [38]:
import pandas as pd

series = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'c'],
)

# Everything from position 2 to the end (explicit positional form of series[2:]).
print(series.iloc[2:])

c    3
d    4
c    5
dtype: int64


## Datetime Range

### `date_range()`

In [39]:
import pandas as pd

# Twelve consecutive timestamps starting on 2019-05-25.
dates = pd.date_range(start='20190525', periods=12)

print(dates) # D for 'daily'

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='D')


In [40]:
# assign range of dates as index of Series

import pandas as pd

dates = pd.date_range(start='20190525', periods=12)

# Values 1..12, labelled with the dates at construction time.
series = pd.Series(list(range(1, 13)), index=dates)

print(series)

2019-05-25     1
2019-05-26     2
2019-05-27     3
2019-05-28     4
2019-05-29     5
2019-05-30     6
2019-05-31     7
2019-06-01     8
2019-06-02     9
2019-06-03    10
2019-06-04    11
2019-06-05    12
Freq: D, dtype: int64


In [41]:
import pandas as pd

# Month-end frequency. The MonthEnd offset object is passed instead of the
# 'M' string alias: newer pandas releases deprecate that alias (renamed 'ME'),
# whereas the offset object is accepted by both older and newer versions.
dates = pd.date_range('2019-05-01', periods=12, freq=pd.offsets.MonthEnd())

print(dates)

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')


In [42]:
import pandas as pd

# 'MS' = month start ('M' for 'monthly', 'S' for 'start'): the first day of
# each of twelve consecutive months.
dates = pd.date_range(start='2019-05-01', periods=12, freq='MS')

print(dates)

DatetimeIndex(['2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
              dtype='datetime64[ns]', freq='MS')


In [43]:
import pandas as pd

# Hourly frequency. The Hour offset object is passed instead of the 'H'
# string alias: newer pandas releases deprecate the uppercase alias (renamed
# 'h'), whereas the offset object is accepted by both older and newer versions.
dates = pd.date_range('2019/05/17 09:00:00', periods=8, freq=pd.offsets.Hour())

print(dates)

DatetimeIndex(['2019-05-17 09:00:00', '2019-05-17 10:00:00',
               '2019-05-17 11:00:00', '2019-05-17 12:00:00',
               '2019-05-17 13:00:00', '2019-05-17 14:00:00',
               '2019-05-17 15:00:00', '2019-05-17 16:00:00'],
              dtype='datetime64[ns]', freq='H')


## DataFrame

In [44]:
import pandas as pd
import numpy as np

# 10 rows x 4 columns of random floats; values change on every run because no
# seed is set.
df = pd.DataFrame(
    data=np.random.randn(10, 4),
    columns=['A', 'B', 'C', 'D'],
)

print(df)

          A         B         C         D
0 -0.628577 -0.047730 -2.120651 -0.345224
1  0.818082 -0.218897 -0.336446  0.190480
2  0.769533  0.668049  1.542633  0.424238
3 -1.580455  0.876036 -0.421257  0.669471
4  1.291061  2.156422  0.252665  0.056647
5 -0.132923 -0.531289 -0.383515  1.286995
6 -0.570255 -0.040065  0.910904 -0.346261
7 -0.928025  2.422382  0.393385 -0.703196
8  0.681338  0.522273 -0.682317  0.675320
9 -0.518571 -0.194937 -2.149291  1.471668


### `read_csv()`

In [45]:
import pandas as pd

# Example of loading a DataFrame from a CSV file. Left commented out,
# presumably because no 'data.csv' accompanies this notebook (confirm).
# df = pd.read_csv('data.csv')

### Index and Values

In [46]:
import pandas as pd
import numpy as np

# Ten consecutive days used as row labels.
days = pd.date_range(start='20190525', periods=10)
df = pd.DataFrame(np.random.randn(10, 4), index=days, columns=list('ABCD'))

# The frame, then its row labels, then the underlying NumPy array of values.
print(df, df.index, df.values, sep="\n\n")

                   A         B         C         D
2019-05-25 -0.225464 -0.527627 -1.816776  0.152401
2019-05-26  0.426616  0.266348  1.411340 -0.776213
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-28 -0.192639 -0.223570  0.921626  0.012514
2019-05-29 -0.368741  1.214521  0.733380  0.294213
2019-05-30 -0.065906  2.269507  0.242459  0.243003
2019-05-31 -0.309328 -0.046671  0.315767 -1.222598
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458
2019-06-02  0.659919 -1.501245  0.628645 -1.330315
2019-06-03  0.515195 -0.155877 -0.564847 -0.502733

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03'],
              dtype='datetime64[ns]', freq='D')

[[-0.22546377 -0.52762726 -1.81677581  0.15240091]
 [ 0.42661642  0.26634781  1.41133987 -0.77621309]
 [-0.4747256  -0.1606474   0.65751788  2.80733998]
 [-0.19263868 -0.22357034  0.92162556  0.01251359]

### Descriptive Statistics

### `describe()`

In [47]:
# Per-column summary table produced by describe().
summary = df.describe()
print(summary)

               A          B          C          D
count  10.000000  10.000000  10.000000  10.000000
mean   -0.061527   0.096079   0.133537  -0.095185
std     0.438558   1.014835   1.012783   1.177356
min    -0.580200  -1.501245  -1.816776  -1.330315
25%    -0.353888  -0.211165  -0.363020  -0.739524
50%    -0.209051  -0.158262   0.472206  -0.245110
75%     0.303486   0.188093   0.714414   0.220353
max     0.659919   2.269507   1.411340   2.807340


### `mean()`

In [48]:
print(df.mean(axis=0), end="\n\n")  # axis 0: one mean per column
print(df.mean(axis=1))              # axis 1: one mean per row

A   -0.061527
B    0.096079
C    0.133537
D   -0.095185
dtype: float64

2019-05-25   -0.604366
2019-05-26    0.332023
2019-05-27    0.707371
2019-05-28    0.129483
2019-05-29    0.468343
2019-05-30    0.672266
2019-05-31   -0.315707
2019-06-01   -0.644337
2019-06-02   -0.385749
2019-06-03   -0.177066
Freq: D, dtype: float64


### Extracting from DataFrames

### `head()`

In [49]:
# n=5 is what head() uses when called without arguments: the first 5 rows.
print(df.head(n=5))

                   A         B         C         D
2019-05-25 -0.225464 -0.527627 -1.816776  0.152401
2019-05-26  0.426616  0.266348  1.411340 -0.776213
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-28 -0.192639 -0.223570  0.921626  0.012514
2019-05-29 -0.368741  1.214521  0.733380  0.294213


In [50]:
# First 8 rows.
print(df.head(n=8))

                   A         B         C         D
2019-05-25 -0.225464 -0.527627 -1.816776  0.152401
2019-05-26  0.426616  0.266348  1.411340 -0.776213
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-28 -0.192639 -0.223570  0.921626  0.012514
2019-05-29 -0.368741  1.214521  0.733380  0.294213
2019-05-30 -0.065906  2.269507  0.242459  0.243003
2019-05-31 -0.309328 -0.046671  0.315767 -1.222598
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458


### `tail()`

In [51]:
# n=5 is what tail() uses when called without arguments: the last 5 rows.
print(df.tail(n=5))

                   A         B         C         D
2019-05-30 -0.065906  2.269507  0.242459  0.243003
2019-05-31 -0.309328 -0.046671  0.315767 -1.222598
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458
2019-06-02  0.659919 -1.501245  0.628645 -1.330315
2019-06-03  0.515195 -0.155877 -0.564847 -0.502733


In [52]:
# Last 8 rows.
print(df.tail(n=8))

                   A         B         C         D
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-28 -0.192639 -0.223570  0.921626  0.012514
2019-05-29 -0.368741  1.214521  0.733380  0.294213
2019-05-30 -0.065906  2.269507  0.242459  0.243003
2019-05-31 -0.309328 -0.046671  0.315767 -1.222598
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458
2019-06-02  0.659919 -1.501245  0.628645 -1.330315
2019-06-03  0.515195 -0.155877 -0.564847 -0.502733


#### Selecting Specific Column

In [53]:
# A single column comes back as a Series; same as df['A'] or df.A.
print(df.loc[:, 'A'])

2019-05-25   -0.225464
2019-05-26    0.426616
2019-05-27   -0.474726
2019-05-28   -0.192639
2019-05-29   -0.368741
2019-05-30   -0.065906
2019-05-31   -0.309328
2019-06-01   -0.580200
2019-06-02    0.659919
2019-06-03    0.515195
Freq: D, Name: A, dtype: float64


In [54]:
# A list of column names gives a DataFrame; same as df[['A', 'B']].
print(df.loc[:, ['A', 'B']])

                   A         B
2019-05-25 -0.225464 -0.527627
2019-05-26  0.426616  0.266348
2019-05-27 -0.474726 -0.160647
2019-05-28 -0.192639 -0.223570
2019-05-29 -0.368741  1.214521
2019-05-30 -0.065906  2.269507
2019-05-31 -0.309328 -0.046671
2019-06-01 -0.580200 -0.173948
2019-06-02  0.659919 -1.501245
2019-06-03  0.515195 -0.155877


#### Selecting Specific Row

In [55]:
# Rows at positions 2 and 3 (the end position is exclusive).
print(df.iloc[2:4]) # same as print(df[2:4])

                   A         B         C         D
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-28 -0.192639 -0.223570  0.921626  0.012514


In [56]:
# A list of positions picks exactly those rows; all columns are kept.
print(df.iloc[[2, 4], :])

                   A         B         C         D
2019-05-27 -0.474726 -0.160647  0.657518  2.807340
2019-05-29 -0.368741  1.214521  0.733380  0.294213


In [57]:
# A single integer position returns that row as a Series.
print(df.iloc[2, :])

A   -0.474726
B   -0.160647
C    0.657518
D    2.807340
Name: 2019-05-27 00:00:00, dtype: float64


#### Selecting Row and Column

In [58]:
# Rows at positions 2-3 and columns at positions 1-3 (end positions are exclusive).
selected = df.iloc[2:4, 1:4]
print(selected)

                   B         C         D
2019-05-27 -0.160647  0.657518  2.807340
2019-05-28 -0.223570  0.921626  0.012514


In [59]:
# Rows at positions 2 and 4, columns at positions 1 and 3.
selected = df.iloc[[2, 4], [1, 3]]
print(selected)

                   B         D
2019-05-27 -0.160647  2.807340
2019-05-29  1.214521  0.294213


#### Slicing Based on Labels

In [60]:
# Label slices include both end points; explicit .loc form of
# df['20190601':'20190603'].
print(df.loc['20190601':'20190603'])

                   A         B         C         D
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458
2019-06-02  0.659919 -1.501245  0.628645 -1.330315
2019-06-03  0.515195 -0.155877 -0.564847 -0.502733


In [61]:
# Label slices on both axes; both end points are included on each.
selected = df.loc['20190601':'20190603', 'A':'C']
print(selected)

                   A         B         C
2019-06-01 -0.580200 -0.173948 -1.193743
2019-06-02  0.659919 -1.501245  0.628645
2019-06-03  0.515195 -0.155877 -0.564847


In [62]:
# A label slice for the rows combined with an explicit list of column names.
selected = df.loc['20190601':'20190603', ['A', 'C']]
print(selected)

                   A         C
2019-06-01 -0.580200 -1.193743
2019-06-02  0.659919  0.628645
2019-06-03  0.515195 -0.564847


In [63]:
# A single date label returns that row as a Series.
selected = df.loc['20190601']
print(selected)

A   -0.580200
B   -0.173948
C   -1.193743
D   -0.629458
Name: 2019-06-01 00:00:00, dtype: float64


In [64]:
# extract specific rows with datetime as index

from datetime import datetime

# Midnight timestamps (the time fields default to zero) for two index labels.
date1 = datetime(2019, 6, 1)
date2 = datetime(2019, 6, 3)

# A list of labels selects exactly those rows; all columns are kept.
print(df.loc[[date1, date2], :])

                   A         B         C         D
2019-06-01 -0.580200 -0.173948 -1.193743 -0.629458
2019-06-03  0.515195 -0.155877 -0.564847 -0.502733


In [65]:
# One row label plus a list of column names returns a Series of those columns.
selected = df.loc[date1, ['A', 'C']]
print(selected)

A   -0.580200
C   -1.193743
Name: 2019-06-01 00:00:00, dtype: float64
