## The Data

In [1]:
# Each record in health_lists:
# [height in cm, weight in kg, age in years, male_yn (1 = male, 0 = female)]
health_lists = [
    [152, 48, 63, 1],
    [157, 53, 41, 1],
    [140, 37, 63, 0],
    [137, 32, 65, 0],
]

# The same records keyed by field name, derived from health_lists so the two
# representations always hold identical values.
health_dicts = [
    dict(zip(('height_cm', 'weight_kg', 'age', 'male_yn'), record))
    for record in health_lists
]

## NumPy

In [2]:
import numpy as np

### The N-Dimensional Arrays

$ BMI = \dfrac{weight}{height^{2}} $

Where:

- $ weight $: weight in kg
- $ height $: height in m

In [3]:
# stack the per-person lists into a 2-D array: one row per person, one column per field
health_a = np.array(health_lists)

In [4]:
# a single integer index selects a row — here row 0, i.e. every field of the first person
health_a[0]

array([152,  48,  63,   1])

In [5]:
# `:` keeps every row, `0` picks column 0 — the heights (in cm) of all persons
health_a[:, 0]

array([152, 157, 140, 137])

In [6]:
# scalar times array: 0.01 is broadcast to every element (cm -> m);
# this is element-wise, not a matrix product
height_m_a = 0.01 * health_a[:, 0]
height_m_a

array([ 1.52,  1.57,  1.4 ,  1.37])

In [7]:
# column 1 holds the weights in kg
weight_kg_a = health_a[:, 1]
weight_kg_a

array([48, 53, 37, 32])

In [8]:
# element-wise BMI = weight / height², one value per person
bmi_a = weight_kg_a / height_m_a**2
bmi_a

array([ 20.77562327,  21.50188649,  18.87755102,  17.04938995])

In [9]:
# mean BMI over all persons
bmi_a.mean()

19.551112681722302

In [10]:
# or, the same mean BMI as a single expression without intermediate variables
(health_a[:, 1] / (health_a[:, 0]*0.01)**2).mean()

19.551112681722302

### The Matrices

$ P = 10m + 6.25h - 5a + s $

Where:

- $ P $: basal metabolic rate (BMR) in kcal / day
- $ m $: weight in kg
- $ h $: height in cm
- $ a $: age in years
- $ s $: +5 for males, -161 for females

$ \equiv $

$ P = 10m + 6.25h - 5a + 5g - 161(1-g) $

Where:

- $ g $: gender as an integer: 0 is female, 1 is male.

$ \equiv $

$
    \begin{bmatrix}
        P
    \end{bmatrix}
    = 
    \begin{bmatrix}
        m & h & a & g & 1-g
    \end{bmatrix}
    \begin{bmatrix}
        10 \\ 6.25 \\ -5 \\ 5 \\ -161
    \end{bmatrix}
$

In [11]:
# Hold the data as an np.matrix so that `*` in the cells below means matrix
# multiplication rather than element-wise multiplication.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
health_m = np.asmatrix(health_lists)

In [12]:
# X: design matrix, n × 5. The first four columns keep the order of
# health_lists (height_cm, weight_kg, age, male_yn); the appended fifth
# column is 1 - male_yn, i.e. the (1 - g) term that carries the female offset.
X = np.hstack((
    health_m,
    1 - health_m[:, -1:],  # last column (male_yn) as an n × 1 column
))
# only the last expression of a cell is displayed, so show just the shape
X.shape

(4, 5)

In [13]:
# beta: coefficient column vector, 5 × 1. The order follows X's columns
# (height, weight, age, g, 1 - g), so 6.25 (height) comes before 10 (weight)
# even though the formula above lists the weight term first.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
beta = np.asmatrix([6.25, 10, -5, 5, -161]).T
beta

matrix([[   6.25],
        [  10.  ],
        [  -5.  ],
        [   5.  ],
        [-161.  ]])

In [14]:
# X and beta are np.matrix objects, so `*` is the matrix product: (n × 5)(5 × 1) -> one BMR per person
X*beta

matrix([[ 1120.  ],
        [ 1311.25],
        [  769.  ],
        [  690.25]])

In [15]:
# mean BMR over all persons
(X*beta).mean()

972.625

In [16]:
# or, the same mean BMR as a single expression
# (np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed)
(np.hstack((health_m, 1 - health_m[:, -1:])) * np.asmatrix([6.25, 10, -5, 5, -161]).T).mean()

972.625

## Pandas

In [17]:
import pandas as pd

In [18]:
# Build the frame from the row lists, naming the columns in the lists' field order.
health_df = pd.DataFrame(
    health_lists,
    columns=['height_cm', 'weight_kg', 'age', 'male_yn'],
)
# Alternative: the dict records already carry the column names
# health_df = pd.DataFrame(data=health_dicts)
health_df

Unnamed: 0,height_cm,weight_kg,age,male_yn
0,152,48,63,1
1,157,53,41,1
2,140,37,63,0
3,137,32,65,0


In [19]:
# select the height_cm column via attribute access
health_df.height_cm

0    152
1    157
2    140
3    137
Name: height_cm, dtype: int64

In [20]:
# a single column comes back as a Series, not a DataFrame
type(health_df.height_cm)

pandas.core.series.Series

In [21]:
# BMI per person: weight in kg over (height in cm * 0.01, i.e. metres) squared.
# Arithmetic between columns is element-wise, like the NumPy version above.
bmi_s = health_df['weight_kg'] / (health_df['height_cm'] * 0.01) ** 2
bmi_s

0    20.775623
1    21.501886
2    18.877551
3    17.049390
dtype: float64

In [22]:
# arithmetic on Series yields another Series
type(bmi_s)

pandas.core.series.Series

In [23]:
# mean BMI — same value as the NumPy computation
bmi_s.mean()

19.551112681722302

In [24]:
# or, the same mean BMI as a single expression
(health_df.weight_kg/(health_df.height_cm*0.01)**2).mean()

19.551112681722302

### Add a Series Into DataFrames

In [25]:
# assigning a Series to a new key adds it as a column; note this modifies health_df in place
health_df['bmi'] = bmi_s
health_df

Unnamed: 0,height_cm,weight_kg,age,male_yn,bmi
0,152,48,63,1,20.775623
1,157,53,41,1,21.501886
2,140,37,63,0,18.877551
3,137,32,65,0,17.04939


### Load CSV

In [26]:
# sep=';' — the file is parsed as semicolon-delimited; the path is relative to the working directory
tmp_df = pd.read_csv('dataset_howell1.csv', sep=';')

In [27]:
# peek at the first rows instead of displaying the whole frame
tmp_df.head()

Unnamed: 0,height,weight,age,male
0,151.765,47.825606,63.0,1
1,139.7,36.485807,63.0,0
2,136.525,31.864838,65.0,0
3,156.845,53.041915,41.0,1
4,145.415,41.276872,51.0,0


In [28]:
# column dtypes as inferred by read_csv
tmp_df.dtypes

height    float64
weight    float64
age       float64
male        int64
dtype: object

## Dig More

* [NumPy User Guide](https://numpy.org/doc/stable/user/index.html)
    * [Quickstart tutorial](https://numpy.org/doc/stable/user/quickstart.html)
    * [Indexing](https://numpy.org/doc/stable/user/basics.indexing.html)
    * [Broadcasting](https://numpy.org/doc/stable/user/basics.broadcasting.html)
* [10 Minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html)