In [40]:
import numpy as np

## Create numpy arrays
```python
>>> np.zeros((3,4))
>>> np.ones((2,3,4),dtype=np.int16) 
>>> d = np.arange(10,25,5)
>>> np.linspace(0,2,9)
```

In [41]:
# From a list
a = np.array([1, 2, 3, 4])
a

array([1, 2, 3, 4])

### Random arrays

In [42]:
ranstat = np.random.RandomState(1988)
tc = ranstat.normal(25.0, 2.0, size=(5,2))
print(
tc.shape, '\n',
tc[0], '\n',
tc[:,0], '\n',
tc.std()
)

(5, 2) 
 [22.34458463 23.87612186] 
 [22.34458463 26.03863924 26.20676545 27.5857451  24.56599398] 
 1.7257306953985907


## Array math

In [43]:
# direct math
tf = (9/5) * tc + 32
tf

array([[72.22025234, 74.97701935],
       [78.86955063, 74.25682371],
       [79.17217782, 72.05395819],
       [81.65434119, 75.39951728],
       [76.21878917, 72.44794676]])

In [44]:
# add two arrays of same shape
tc + tf

array([[ 94.56483698,  98.85314121],
       [104.90818986,  97.73283688],
       [105.37894327,  94.30615719],
       [109.24008629,  99.51036021],
       [100.78478315,  94.91902829]])

In [45]:
# map a function
c_to_f = lambda x: (9/5)*x+32

tf2 = np.array(list(map(c_to_f, tc)))
tf2

array([[72.22025234, 74.97701935],
       [78.86955063, 74.25682371],
       [79.17217782, 72.05395819],
       [81.65434119, 75.39951728],
       [76.21878917, 72.44794676]])

In [46]:
# Are these two arrays equal?
np.array_equal(tf, tf2)

True

In [47]:
np.info(np.array_equal)

 array_equal(*args, **kwargs)

True if two arrays have the same shape and elements, False otherwise.

Parameters
----------
a1, a2 : array_like
    Input arrays.

Returns
-------
b : bool
    Returns True if the arrays are equal.

See Also
--------
allclose: Returns True if two arrays are element-wise equal within a
          tolerance.
array_equiv: Returns True if input arrays are shape consistent and all
             elements equal.

Examples
--------
>>> np.array_equal([1, 2], [1, 2])
True
>>> np.array_equal(np.array([1, 2]), np.array([1, 2]))
True
>>> np.array_equal([1, 2], [1, 2, 3])
False
>>> np.array_equal([1, 2], [1, 4])
False


In [48]:
# vectorize a function


## Array manipulation

In [49]:
t = np.hstack((tc, tf))
t

array([[22.34458463, 23.87612186, 72.22025234, 74.97701935],
       [26.03863924, 23.47601317, 78.86955063, 74.25682371],
       [26.20676545, 22.252199  , 79.17217782, 72.05395819],
       [27.5857451 , 24.11084293, 81.65434119, 75.39951728],
       [24.56599398, 22.47108153, 76.21878917, 72.44794676]])

## Load data from file

Using `np.genfromtxt()`

In [50]:
# skip first row
da = np.genfromtxt('data.csv', delimiter=',',skip_header=1,dtype=float)

In [51]:
da.shape

(293, 14)

In [52]:
da[:5,:]

array([[ 28.,   1.,   2., 130., 132.,   0.,   2., 185.,   0.,   0.,  nan,
         nan,  nan,   0.],
       [ 29.,   1.,   2., 120., 243.,   0.,   0., 160.,   0.,   0.,  nan,
         nan,  nan,   0.],
       [ 29.,   1.,   2., 140.,  nan,   0.,   0., 170.,   0.,   0.,  nan,
         nan,  nan,   0.],
       [ 30.,   0.,   1., 170., 237.,   0.,   1., 170.,   0.,   0.,  nan,
         nan,   6.,   0.],
       [ 31.,   0.,   2., 100., 219.,   0.,   1., 150.,   0.,   0.,  nan,
         nan,  nan,   0.]])

In [53]:
print(da[:5,:])

[[ 28.   1.   2. 130. 132.   0.   2. 185.   0.   0.  nan  nan  nan   0.]
 [ 29.   1.   2. 120. 243.   0.   0. 160.   0.   0.  nan  nan  nan   0.]
 [ 29.   1.   2. 140.  nan   0.   0. 170.   0.   0.  nan  nan  nan   0.]
 [ 30.   0.   1. 170. 237.   0.   1. 170.   0.   0.  nan  nan   6.   0.]
 [ 31.   0.   2. 100. 219.   0.   1. 150.   0.   0.  nan  nan  nan   0.]]


In [54]:
with np.printoptions(precision=3):
    print(da[:5,:])

[[ 28.   1.   2. 130. 132.   0.   2. 185.   0.   0.  nan  nan  nan   0.]
 [ 29.   1.   2. 120. 243.   0.   0. 160.   0.   0.  nan  nan  nan   0.]
 [ 29.   1.   2. 140.  nan   0.   0. 170.   0.   0.  nan  nan  nan   0.]
 [ 30.   0.   1. 170. 237.   0.   1. 170.   0.   0.  nan  nan   6.   0.]
 [ 31.   0.   2. 100. 219.   0.   1. 150.   0.   0.  nan  nan  nan   0.]]


In [55]:
with np.printoptions(formatter={'float': '{: 0.1f}'.format}):
    print(da[:5,:])

[[ 28.0  1.0  2.0  130.0  132.0  0.0  2.0  185.0  0.0  0.0  nan  nan  nan
   0.0]
 [ 29.0  1.0  2.0  120.0  243.0  0.0  0.0  160.0  0.0  0.0  nan  nan  nan
   0.0]
 [ 29.0  1.0  2.0  140.0  nan  0.0  0.0  170.0  0.0  0.0  nan  nan  nan
   0.0]
 [ 30.0  0.0  1.0  170.0  237.0  0.0  1.0  170.0  0.0  0.0  nan  nan  6.0
   0.0]
 [ 31.0  0.0  2.0  100.0  219.0  0.0  1.0  150.0  0.0  0.0  nan  nan  nan
   0.0]]


In [56]:
# Use first row as column names
da2 = np.genfromtxt('data.csv', delimiter=',',names=True,dtype=float)

In [57]:
da2.dtype.names

('age',
 'gender',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'num')

In [58]:
da2['age']

array([28., 29., 29., 30., 31., 32., 32., 32., 33., 34., 34., 34., 35.,
       35., 35., 35., 36., 36., 36., 36., 37., 37., 37., 37., 37., 37.,
       37., 38., 38., 38., 39., 39., 39., 39., 39., 39., 39., 39., 39.,
       39., 40., 40., 40., 40., 40., 41., 41., 41., 41., 41., 41., 41.,
       42., 42., 42., 42., 42., 42., 42., 43., 43., 43., 43., 43., 43.,
       43., 43., 44., 44., 44., 44., 45., 45., 45., 45., 45., 45., 45.,
       46., 46., 46., 46., 46., 46., 46., 47., 47., 47., 47., 47., 48.,
       48., 48., 48., 48., 48., 48., 48., 48., 48., 48., 49., 49., 49.,
       49., 49., 49., 49., 49., 50., 50., 50., 50., 50., 50., 50., 51.,
       51., 51., 51., 51., 51., 51., 52., 52., 52., 52., 52., 52., 52.,
       52., 53., 53., 53., 53., 53., 53., 53., 53., 53., 54., 54., 54.,
       54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54.,
       55., 55., 55., 55., 55., 55., 55., 55., 55., 55., 56., 56., 56.,
       56., 56., 57., 57., 57., 58., 58., 58., 58., 59., 59., 59

## Summarize values
What is the mean, std, min, max in each column?

In [59]:
da.mean(axis=0)

array([47.76791809,  0.72354949,  2.97952218,         nan,         nan,
               nan,         nan,         nan,         nan,  0.58464164,
               nan,         nan,         nan,  0.35836177])

In [60]:
print(np.nanmean(da,axis=0))

[4.77679181e+01 7.23549488e-01 2.97952218e+00 1.32592466e+02
 2.50759259e+02 7.01754386e-02 2.15753425e-01 1.39212329e+02
 3.01369863e-01 5.84641638e-01 1.89320388e+00 0.00000000e+00
 5.64285714e+00 3.58361775e-01]


In [61]:
with np.printoptions(precision=2, suppress=True):
    print(np.nanmean(da,axis=0))

[ 47.77   0.72   2.98 132.59 250.76   0.07   0.22 139.21   0.3    0.58
   1.89   0.     5.64   0.36]


In [62]:
da_min_max = np.vstack((da.min(axis=0), da.max(axis=0)))
print(da_min_max)

[[28.  0.  1. nan nan nan nan nan nan  0. nan nan nan  0.]
 [66.  1.  4. nan nan nan nan nan nan  5. nan nan nan  1.]]


## Find nans
How many nans in each column?

In [63]:
np.isnan(da)

array([[False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True,  True, False],
       ...,
       [False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True, False, False]])

In [64]:
np.isnan(da).sum(axis=0)

array([  0,   0,   0,   1,  23,   8,   1,   1,   1,   0, 190, 290, 265,
         0])

In [65]:
da_no_nan = np.nan_to_num(da)

da_min_max = np.vstack((da_no_nan.min(axis=0), da_no_nan.max(axis=0)))
print(da_min_max)

[[ 28.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 66.   1.   4. 200. 603.   1.   2. 190.   1.   5.   3.   0.   7.   1.]]


## Remove duplicate rows

### Create an array with duplicate rows

In [66]:
a = np.arange(20).reshape(5,4)
a

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [67]:
rand_idx = ranstat.choice(5, 10)
rand_idx

array([3, 3, 0, 0, 2, 3, 2, 1, 4, 4])

In [68]:
dup = a[rand_idx]
dup

array([[12, 13, 14, 15],
       [12, 13, 14, 15],
       [ 0,  1,  2,  3],
       [ 0,  1,  2,  3],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [ 8,  9, 10, 11],
       [ 4,  5,  6,  7],
       [16, 17, 18, 19],
       [16, 17, 18, 19]])

### Using `np.unique()`

In [69]:
np.unique(dup, axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

### Another, pythonic way

In [70]:
np.vstack(set(map(tuple, dup)))

  """Entry point for launching an IPython kernel.


array([[ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [ 4,  5,  6,  7],
       [ 0,  1,  2,  3]])

## Count unique values (a histogram)

In [71]:
# view on ages column
da_ages = da_no_nan[:,0]

In [72]:
ages, age_cnt = np.unique(da_ages, return_counts=True)
print(ages)
print(age_cnt)

[28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38. 39. 40. 41. 42. 43. 44. 45.
 46. 47. 48. 49. 50. 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63.
 65. 66.]
[ 1  2  1  2  4  2  4  5  5  8  7 11  7 11  7 12  7  8 13 10 19 15 12  9
 17 12 25 15 10  5  9  8  2  2  2  1  2  1]


### What age splits the dataset in half?

In [73]:
da_ages[da_ages > 45].shape

(189,)

In [74]:
np.cumsum(age_cnt)

array([  1,   3,   4,   6,  10,  12,  16,  21,  26,  34,  41,  52,  59,
        70,  77,  89,  96, 104, 117, 127, 146, 161, 173, 182, 199, 211,
       236, 251, 261, 266, 275, 283, 285, 287, 289, 290, 292, 293])

In [75]:
idx, = np.where(np.cumsum(age_cnt)>150)
idx[0]

21

In [76]:
np.cumsum(age_cnt)[idx[0]]

161

In [77]:
ages[idx[0]-1]

48.0

### Create a young/old category

In [78]:
da_age_young_old = np.where(da_ages>48, 1, 0) #1-> old, 0-> young
da_age_young_old

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [79]:
da_ages

array([28., 29., 29., 30., 31., 32., 32., 32., 33., 34., 34., 34., 35.,
       35., 35., 35., 36., 36., 36., 36., 37., 37., 37., 37., 37., 37.,
       37., 38., 38., 38., 39., 39., 39., 39., 39., 39., 39., 39., 39.,
       39., 40., 40., 40., 40., 40., 41., 41., 41., 41., 41., 41., 41.,
       42., 42., 42., 42., 42., 42., 42., 43., 43., 43., 43., 43., 43.,
       43., 43., 44., 44., 44., 44., 45., 45., 45., 45., 45., 45., 45.,
       46., 46., 46., 46., 46., 46., 46., 47., 47., 47., 47., 47., 48.,
       48., 48., 48., 48., 48., 48., 48., 48., 48., 48., 49., 49., 49.,
       49., 49., 49., 49., 49., 50., 50., 50., 50., 50., 50., 50., 51.,
       51., 51., 51., 51., 51., 51., 52., 52., 52., 52., 52., 52., 52.,
       52., 53., 53., 53., 53., 53., 53., 53., 53., 53., 54., 54., 54.,
       54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54., 54.,
       55., 55., 55., 55., 55., 55., 55., 55., 55., 55., 56., 56., 56.,
       56., 56., 57., 57., 57., 58., 58., 58., 58., 59., 59., 59

## Compare young and old

In [80]:
#view on blood pressure
da_bp = da_no_nan[:,3]
da_bp

array([130., 120., 140., 170., 100., 105., 110., 125., 120., 130., 150.,
        98., 120., 140., 120., 150., 120., 112., 130., 150., 120., 130.,
       130., 130., 130., 120., 130., 120., 140., 145., 110., 120., 120.,
       120., 130., 190., 120., 160., 110., 130., 130., 140., 130., 130.,
       140., 110., 125., 130., 120., 120., 125., 112., 115., 120., 120.,
       150., 120., 160., 140., 100., 120., 120., 120., 120., 150., 150.,
       142., 120., 120., 130., 150., 130., 180., 132., 140., 135., 120.,
       140., 130., 140., 120., 150., 110., 110., 180., 140., 130., 110.,
       160., 140.,   0., 120., 120., 120., 108., 120., 150., 100., 130.,
       140., 110., 110., 110., 124., 130., 100., 140., 120., 140., 110.,
       120., 120., 140., 170., 140., 150., 160., 110., 130., 150., 125.,
       130., 130., 120., 140., 125., 130., 120., 140., 160., 140., 113.,
       140., 120., 120., 140., 120., 124., 130., 140., 120., 120., 120.,
       130., 140., 150., 160., 120., 110., 120., 12

In [81]:
bp_old = da_bp[da_ages>48].mean()
bp_old

136.01360544217687

In [82]:
bp_young = da_bp[~(da_ages>48)].mean()
bp_young

128.23972602739727

In [83]:
bp_old - bp_young

7.7738794147796

Is this significant?