In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

## Loading Data Into Pandas DataFrames

In [None]:
# example: load the synthetic patient records simulated for the state of Massachusetts
# (the parquet file is read directly from the workshop's S3 URL, so network access is needed)
PATIENTS_MA_URL = 'https://dicbworkshops.s3.amazonaws.com/output_ma/parquet/patients.parquet'
patients_ma = pd.read_parquet(PATIENTS_MA_URL)

In [None]:
# preview the first 5 rows (head() returns 5 rows when called without an argument)
patients_ma.head()

In [None]:
# list the column labels available in the DataFrame
patients_ma.columns

In [None]:
# display the datatype of every column (one entry per column)
patients_ma.dtypes

In [None]:
# accessing the data in a particular column by its label; the result is a pandas Series
patients_ma['FIRST']

## Basic Data Manipulation with DataFrames

### Counting

In [None]:
# count how many patients fall under each distinct GENDER value (e.g. male and female)
patients_ma['GENDER'].value_counts()

### Cross tabulations/Contingency Tables

In [None]:
# contingency table of patient counts: one row per gender, one column per race
pd.crosstab(index=patients_ma['GENDER'], columns=patients_ma['RACE'])

In [None]:
# alternative method using df.pivot_table(); unlike pd.crosstab, an aggregate
# function must be provided -- 'size' counts the rows in each GENDER/RACE group
patients_ma.pivot_table(index='GENDER', columns='RACE', aggfunc='size')

In [None]:
# average healthcare expenses for every combination of gender (rows) and race (columns)
patients_ma.pivot_table(
    index='GENDER',
    columns='RACE',
    values='HEALTHCARE_EXPENSES',
    aggfunc='mean',
)

### Descriptive Statistics

In [None]:
# compute the average (arithmetic mean) of the HEALTHCARE_EXPENSES column
patients_ma['HEALTHCARE_EXPENSES'].mean()

In [None]:
# accessing a larger suite of descriptive statistics with a single describe() call
# (for a numeric column: count, mean, std, min, quartiles and max)
patients_ma['HEALTHCARE_EXPENSES'].describe()

### Descriptive Statistics with Numpy

In [None]:
# compute the average healthcare expenses, this time with NumPy's mean function
np.mean(patients_ma['HEALTHCARE_EXPENSES'])

In [None]:
# compute the standard deviation of healthcare expenses with NumPy's std function
np.std(patients_ma['HEALTHCARE_EXPENSES'])

In [None]:
# we can also call methods directly on columns/pandas Series: here, the Series' own std()
# (compare this value with the NumPy result from the previous cell)
patients_ma['HEALTHCARE_EXPENSES'].std()

Quick Question: Why did Numpy's std() give a different (smaller) value for the standard deviation than Pandas?

### Filtering Rows and Columns

In [None]:
# Selecting only the BIRTHDATE, RACE, ETHNICITY, and GENDER columns
# (indexing with a list of labels returns a DataFrame containing just those columns)
subset = patients_ma[['BIRTHDATE', 'RACE', 'ETHNICITY', 'GENDER']]

In [None]:
# display the column subset selected in the previous cell
subset

In [None]:
# Selecting only patients who are female: build a boolean mask, then index with it
# (equivalent to patients_ma.query('GENDER == "F"'))
is_female = patients_ma['GENDER'] == 'F'
females = patients_ma[is_female]

In [None]:
# display the female-only rows selected in the previous cell
females

In [None]:
# get patients born on or after January 1st, 1990 (the >= comparison includes that date)
# NOTE(review): presumably BIRTHDATE holds ISO-formatted (YYYY-MM-DD) strings or datetimes,
# for which this comparison is chronological -- TODO confirm the column's dtype
patients_ma[patients_ma['BIRTHDATE'] >= '1990-01-01']

### Sorting

In [None]:
# get patients born on or after January 1st, 1990, then order them from earliest to latest birthdate
born_1990_onward = patients_ma[patients_ma['BIRTHDATE'] >= '1990-01-01']
born_1990_onward.sort_values(by='BIRTHDATE')

### Assigning New Columns/Variables

In [None]:
# assign a full name column
# Plain `+` concatenation propagates missing values: a patient with no MIDDLE name
# (or a missing FIRST/LAST) would end up with a NaN full name. Each part is therefore
# turned into an empty string when missing, and the separating space is attached to
# the MIDDLE/LAST part itself so that a missing part does not leave a double space.
first_part = patients_ma['FIRST'].fillna('')
middle_part = (' ' + patients_ma['MIDDLE']).fillna('')
last_part = (' ' + patients_ma['LAST']).fillna('')
# strip() removes the leading space left behind when FIRST itself is missing
patients_ma['FULLNAME'] = (first_part + middle_part + last_part).str.strip()

In [None]:
# display the newly assigned FULLNAME column
patients_ma['FULLNAME']

### Visualizing Distributions with Matplotlib and Pandas

In [None]:
# Plotting the distribution of income as a histogram
# create the figure and axes explicitly (at 150 dpi), then hand the axes to pandas to draw on
fig, ax = plt.subplots(dpi=150)
patients_ma['INCOME'].hist(bins=20, ax=ax)  # split the INCOME values into 20 bins

In [None]:
# alternative using matplotlib directly: pass the INCOME column straight to plt.hist
plt.hist(patients_ma['INCOME'], bins=20)
plt.show()  # render the current figure

### Grouped Histograms

In [None]:
# plotting the distributions of healthcare expenses by gender
# the 1x2 grid supplies one axes per group -- presumably GENDER has exactly two values; verify
fig, ax = plt.subplots(1, 2, dpi=150)
patients_ma.hist('HEALTHCARE_EXPENSES', by='GENDER', bins=20, ax=ax)

### Box Plots and Violin Plots with Seaborn

In [None]:
# Alternative approaches to visualizing distributions with seaborn: a box plot of
# healthcare expenses for each gender. The DataFrame is passed with the data= keyword
# so the call does not depend on `data` being seaborn's first positional parameter,
# which is only the case in newer seaborn releases.
sns.boxplot(data=patients_ma, x='GENDER', y='HEALTHCARE_EXPENSES')

In [None]:
# box plot of healthcare expenses per race, with side-by-side boxes for each gender (hue).
# The DataFrame is passed with the data= keyword so the call does not depend on `data`
# being seaborn's first positional parameter (only true in newer seaborn releases).
sns.boxplot(data=patients_ma, x='RACE', y='HEALTHCARE_EXPENSES', hue='GENDER')

In [None]:
# violin plot of healthcare expenses for each gender.
# The DataFrame is passed with the data= keyword so the call does not depend on `data`
# being seaborn's first positional parameter (only true in newer seaborn releases).
sns.violinplot(data=patients_ma, x='GENDER', y='HEALTHCARE_EXPENSES')

## Quick Exercises

### 1. What is the median income of patients born after January 1st, 1995?

In [None]:
# Write and run your solution here

### 2. Count the number of patients in each COUNTY

In [None]:
# Write and run your solution here

### 3. What is the mean age (in years) of all patients?
<details>
    <summary>Hint 1</summary>
    <h4>You can convert BIRTHDATE to the datetime data type using <code>pd.to_datetime()</code></h4>
</details>
<details>
    <summary>Hint 2</summary>
    <h4>You can get today's date with <code>pd.to_datetime("today")</code></h4>
</details>
<details>
    <summary>Hint 3</summary>
    <h4>Subtracting two datetimes gives a timedelta; you can use the <code>.dt.days</code> accessor on a timedelta column/series (or the <code>.days</code> attribute on a single timedelta value) to get the duration expressed in days</h4>
</details>


In [None]:
# Write and run your solution here

### 4. Count the number of patients of each RACE who are over 60
<details>
    <summary>Hint</summary>
    <h4>If you didn't save the AGE that we computed in the last exercise to a column/variable, do that first</h4>
</details>

In [None]:
# Write and run your solution here

### 5. Generate a grouped histogram of income distributions by 10-year AGE cohorts


In [None]:
# create an AGE_BIN column of 10-year intervals
# NOTE: this reads an AGE column on patients_ma that this cell does not create;
# presumably it is saved while solving the earlier age exercises -- run those first
# the bin edges are 0, 10, ..., 110, so there are at most 11 intervals
patients_ma['AGE_BIN'] = pd.cut(patients_ma['AGE'], np.arange(0, 120, 10), include_lowest=True)

# a 3 x 4 grid gives 12 axes, enough for one subplot per possible age bin
fig, ax = plt.subplots(3, 4, dpi=150, figsize=(8, 6))

# Flatten the axes array so it can be sliced as a flat sequence (ax[:num_groups])
ax = ax.flatten()

# Count the number of unique AGE_BINs
num_groups = patients_ma['AGE_BIN'].nunique()

# insert your plotting code here, passing in ax=ax[:num_groups] as a keyword argument to the correct plot function

### 6. Plot INCOME vs AGE for all patients
<details>
    <summary>Hint</summary>
    <p>The pandas documentation for <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.scatter.html">DataFrame.plot.scatter</a> might prove useful</p>
</details>

In [None]:
# Write and run your solution here