# Analyzing Patient Data

In [3]:
# Import numpy module
import numpy

In [5]:
# Load data file using numpy
# But the result isn't assigned to a variable, so it is only displayed and then discarded
numpy.loadtxt(fname="data/inflammation-01.csv", delimiter=",")

array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

In [7]:
# We can easily create variables in Python and then refer back to them
weight_kg = 55
print(weight_kg)

55


In [9]:
# Obviously, computers can do arithmetic
print("Weight in pounds:", 2.2 * weight_kg)

Weight in pounds: 121.00000000000001


In [10]:
# You can reassign the value of any variable
weight_kg = 57.5
print("Weight in kilograms is now:", weight_kg)

Weight in kilograms is now: 57.5


In [12]:
# You can assign a value to one variable based on another
weight_lb = 2.2 * weight_kg
print("Weight in kilograms:", weight_kg, "and in pounds:", weight_lb)

Weight in kilograms: 57.5 and in pounds: 126.50000000000001


In [13]:
# But when you change that first variable, the variables that were computed from it do not change
weight_kg = 100
print("Weight in kilograms:", weight_kg, "and in pounds:", weight_lb)

Weight in kilograms: 100 and in pounds: 126.50000000000001


In [14]:
# So, let's assign the numpy data to a variable
data = numpy.loadtxt(fname="data/inflammation-01.csv", delimiter=",")

In [15]:
# Notice that the data wasn't displayed this time. This is because it was assigned to a variable.
# We can manually print the data.
print(data)

[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]]


In [17]:
# Let's take a closer look at the `data` variable
print(type(data))

<class 'numpy.ndarray'>


In [18]:
# "numpy.ndarray" tells us that data contains an N-dimensional array defined by numpy
# In this case, the data contains arthritis patients' inflammation measurements.
# The rows represent patients and the columns represent daily inflammation measurements. 
# We can get the number of rows and columns as follows. 
print(data.shape)

(60, 40)


In [26]:
# Here, `shape` is an attribute of the N-dimensional array stored in data.
# We can extract values from the matrix by providing indices (like positions) in square brackets
print("First data point:", data[0, 0])

First data point: 0.0


In [27]:
# We can similarly access the "middle" data point
print("Middle data point:", data[30, 20])

Middle data point: 13.0


In [28]:
# There are a few things worth noting here. 
# 1) The order of the indices in square brackets is [row, column]. This matches how mathematicians
#    use matrices.
# 2) The first data point was referred to as [0,0]. This is because Python uses "0-based indexing". 
#    This just means that the first position in any sequence of things is index 0. Accordingly, if
#    you want to access the i-th item, you use index i-1 (e.g., the 6th item is accessed using index 5).

In [30]:
# We can also extract multiple values from the matrix
# For this, we use slices: `data[0:4, 0:10]`. This means extract from the 0th row up to,
# but not including, the 4th row. The same goes for the columns. The fact that the end index
# isn't included in the range takes some getting used to. This ensures that the difference 
# between the start and end equals the length of the range. 
print(data[0:4, 0:10])

[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]


In [32]:
# We can also extract a section in the middle of the matrix
print(data[5:10, 2:12])

[[ 1.  2.  2.  4.  2.  1.  6.  4.  7.  6.]
 [ 2.  2.  4.  2.  2.  5.  5.  8.  6.  5.]
 [ 1.  2.  3.  1.  2.  3.  5.  3.  7.  8.]
 [ 0.  3.  1.  5.  6.  5.  5.  8.  2.  4.]
 [ 1.  2.  1.  3.  5.  3.  5.  8.  6.  8.]]


In [42]:
# If we omit the value on either side of the colon (:) in the slice, Python will include all values
# on that side. For example, the following code extracts the first 3 rows (up to but excluding index 3)
# and all columns from the 35th column (index 34) onward.
print(data[:3, 34:])

[[ 3.  4.  2.  3.  0.  0.]
 [ 5.  5.  1.  1.  0.  1.]
 [ 2.  3.  2.  2.  1.  1.]]


In [44]:
# We can also store this subset of the matrix in another variable
small = data[:3, 34:]
print("small is:")
print(small)

small is:
[[ 3.  4.  2.  3.  0.  0.]
 [ 5.  5.  1.  1.  0.  1.]
 [ 2.  3.  2.  2.  1.  1.]]


In [46]:
# Let's try out some math
# Here, we can double all values in the `small` matrix
small_double = small * 2
print("small is:")
print(small)
print("small_double is:")
print(small_double)

small is:
[[ 3.  4.  2.  3.  0.  0.]
 [ 5.  5.  1.  1.  0.  1.]
 [ 2.  3.  2.  2.  1.  1.]]
small_double is:
[[  6.   8.   4.   6.   0.   0.]
 [ 10.  10.   2.   2.   0.   2.]
 [  4.   6.   4.   4.   2.   2.]]


In [47]:
# We can also add these matrices together, giving a tripled matrix
small_triple = small + small_double
print("small_triple is:")
print(small_triple)

small_triple is:
[[  9.  12.   6.   9.   0.   0.]
 [ 15.  15.   3.   3.   0.   3.]
 [  6.   9.   6.   6.   3.   3.]]


In [49]:
# Numpy arrays can do more complex operations. For instance, we can calculate the mean
# of all values in the matrix. Here, `mean()` is a method of `data`. In contrast with the 
# `shape` attribute we used before, we need parentheses for `mean()`. This is because 
# `shape` is a description while `mean()` is an action (compute the mean on all values).
mean = data.mean()
print("Mean is:", mean)

Mean is: 6.14875


In [50]:
# More useful methods!
print("Minimum inflammation:", data.min())
print("Maximum inflammation:", data.max())
print("Standard deviation:", data.std())

Minimum inflammation: 0.0
Maximum inflammation: 20.0
Standard deviation: 4.61383319712


In [52]:
# Computing dataset-wide statistics like this is only so useful, though.
# We would like to calculate similar statistics, but for each patient (row) or for each day (column).
# One way to do this is to extract the subset we want (e.g., patient 0) and perform the calculations on it.
# data[0, :] selects only the first row (index 0 before the comma) and every column
# (a bare colon after the comma, with no start or end, means "take everything").
patient_0 = data[0, :]
print("Patient 0 min:", patient_0.min())
print("Patient 0 mean:", patient_0.mean())
print("Patient 0 max:", patient_0.max())

Patient 0 min: 0.0
Patient 0 mean: 5.45
Patient 0 max: 18.0


In [53]:
# We don't actually need the intermediate variable
print("Patient 0 mean:", data[0, :].mean())

Patient 0 mean: 5.45


In [54]:
# If we want to generalize this approach to all patients or all days, most methods for numpy arrays
# allow us to specify an axis along which to execute a computation.

![Axes in Numpy](fig/python-operations-across-axes.svg)

In [84]:
# Let's take a small subset of data as an example
example = data[0:3, 0:3]
print("example is:")
print(example)

example is:
[[ 0.  0.  1.]
 [ 0.  1.  2.]
 [ 0.  1.  1.]]


In [85]:
# Then, if we calculate the sum without specifying an axis, we sum up all values.
# This is equivalent to collapsing all dimensions.
print(example.sum())

6.0


In [86]:
# We can also specify an axis along which we perform the sum.
# For instance, we can specify axis 0 to sum down the rows (vertical axis), giving one total per column.
print(example.sum(axis=0))

[ 0.  2.  4.]


In [87]:
# Similarly, we can calculate the sum across the columns using axis 1, giving one total per row.
print(example.sum(axis=1))

[ 1.  3.  2.]


In [88]:
# Let's expand this notion to the entire dataset, but using mean.
# We can calculate the mean along the rows (axis 0) of our dataset, meaning the average
# inflammation per day across all patients.
print(data.mean(axis=0))

[  0.           0.45         1.11666667   1.75         2.43333333   3.15
   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
   8.35         7.73333333   8.36666667   9.5          9.58333333
  10.63333333  11.56666667  12.35        13.25        11.96666667
  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
   3.3          3.56666667   2.48333333   1.5          1.13333333
   0.56666667]


In [89]:
# If you recall, the shape of our dataset is (60, 40), meaning 60 rows (patients) and 40 columns (days). 
# Therefore, we expect 40 items in the array of averages above. 
print("shape of array:", data.mean(axis=0).shape)

shape of array: (40,)


In [80]:
# Similarly, we can apply the same calculation along the columns, which would result in an array of 60 elements.
print("shape of array:", data.mean(axis=1).shape)

shape of array: (60,)


In [81]:
# We should probably visualize the data with plots. For this, we can use matplotlib. 
import matplotlib

ImportError: No module named 'matplotlib'

In [83]:
# We also need to enable a special option in Jupyter that allows us to preview the plots within the notebook. 
%matplotlib inline

ImportError: No module named 'matplotlib'