<a href="https://colab.research.google.com/github/c-marq/CAP3321C-Data-Wrangling/blob/main/demos/chapter-02/ch02-demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 2: The Pandas essentials for data analysis

In [None]:
import pandas as pd

## Get the data

### Read a CSV file from a website into a DataFrame

In [None]:
# Download the CSV export from data.cdc.gov and parse it into a DataFrame.
# NOTE: requires network access; the request is repeated every time the cell runs.
url = 'https://data.cdc.gov/api/views/v6ab-adf5/rows.csv?accessType=DOWNLOAD'
mortality_data = pd.read_csv(url)

In [None]:
mortality_data

### Use DataFrame constructor to build a DataFrame object

In [None]:
# Column labels and row values for a small hand-built DataFrame.
df_columns = ['Year', 'Age Group', 'Death Rate']
df_data = [
    [1900, '1-4 Years', 1983.8],
    [1901, '1-4 Years', 1695.0],
]

In [None]:
# Build the DataFrame from the row lists, labelling the columns explicitly.
mortality_df = pd.DataFrame(df_data, columns=df_columns)

In [None]:
mortality_df

### Save and restore a DataFrame

In [None]:
# Serialize the DataFrame to 'mortality_data.pkl' (a path relative to the working directory).
mortality_data.to_pickle('mortality_data.pkl')

In [None]:
# Restore the DataFrame from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only read pickle files you trust.
mortality_data = pd.read_pickle('mortality_data.pkl')

In [None]:
mortality_data.head()

## Examine and clean the data

### Display the data

In [None]:
mortality_data

In [None]:
mortality_data.head()

In [None]:
mortality_data.tail(3)

In [None]:
# Temporarily cap the output at 5 rows and lift the column limit;
# the display options revert when the with-block exits.
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display(mortality_data)

### Display the DataFrame attributes

In [None]:
mortality_data.values

In [None]:
# Print the structural attributes of the DataFrame:
# row labels, column labels, total number of cells, and (rows, columns).
print("Index:  ", mortality_data.index)
print("Columns:", mortality_data.columns)
print("Size:   ", mortality_data.size)
print("Shape:  ", mortality_data.shape)

### Use the columns attribute to remove spaces from the column names

In [None]:
# Remove the spaces from every column label so the columns can be used
# as attributes (mortality_data.DeathRate) and unquoted inside query() strings.
mortality_data.columns = mortality_data.columns.str.replace(' ', '')

In [None]:
mortality_data.columns

In [None]:
mortality_data.head()

### Use the info(), nunique(), and describe() methods

In [None]:
mortality_data.info()

In [None]:
# memory_usage='deep' makes pandas inspect object (string) columns to report
# their actual memory consumption instead of a shallow estimate.
mortality_data.info(memory_usage='deep')

In [None]:
mortality_data.nunique()

In [None]:
mortality_data.describe()

In [None]:
# Transpose the summary so each column's statistics read across one row.
mortality_data.describe().T

## Access the data

### Access columns

In [None]:
mortality_data.DeathRate.head(2)

In [None]:
type(mortality_data.DeathRate)

In [None]:
mortality_data['DeathRate'].head(2)

In [None]:
mortality_data[['Year','DeathRate']].head(2)

In [None]:
type(mortality_data[['Year','DeathRate']])

### Access rows

In [None]:
mortality_data.query('Year==1900')

In [None]:
# query() takes the filter as a string expression; string values inside it
# need their own (different) quotes. Conditions are combined with and/or.
mortality_data.query('Year == 2000 and AgeGroup != "1-4 Years"')

In [None]:
mortality_data.query('Year == 1900 or Year == 2000').head()

In [None]:
# use backticks if a column name contains spaces
# mortality_data.query('Year == 2000 and `Age Group` != "1-4 Years"')

### Access a subset of rows and columns

In [None]:
mortality_data.query('Year == 1900').DeathRate.head()

In [None]:
mortality_data.query('Year == 1900')['DeathRate'].head()

In [None]:
# Double brackets pass a list of column names, so the result is a
# one-column DataFrame rather than a Series.
mortality_data.query('Year == 1900')[['DeathRate']].head()

In [None]:
mortality_data.query('Year == 1900')[['AgeGroup','DeathRate']].head()

### Access rows with the loc[] accessor

In [None]:
mortality_data.loc[[0,5,10]]

In [None]:
# loc slices by index label and includes BOTH endpoints: labels 4, 5 and 6.
mortality_data.loc[4:6]

In [None]:
# Label slice with a step: every fifth row from label 0 through label 20 inclusive
# (assumes the default integer index is still in place).
mortality_data.loc[0:20:5]

In [None]:
# Boolean mask: keep only the rows whose Year equals 1917.
mortality_data.loc[mortality_data.Year == 1917]

### Access columns with the loc[] accessor

In [None]:
mortality_data.loc[:, ['Year', 'AgeGroup']]

### Access rows and columns with the loc[] accessor

In [None]:
mortality_data.loc[[0,5,10],['AgeGroup','DeathRate']]

In [None]:
# Both slices are label-based and inclusive: row labels 4 through 6,
# columns from AgeGroup through DeathRate.
mortality_data.loc[4:6,'AgeGroup':'DeathRate']

### Access rows and columns with the iloc[] accessor

In [None]:
mortality_data.iloc[[4,5,6],[1,2]]

In [None]:
# iloc slices by integer position and EXCLUDES the stop value:
# rows at positions 4-6 and columns at positions 1-2.
mortality_data.iloc[4:7,1:3]

In [None]:
# A negative start counts from the end: the last 10 rows.
mortality_data.iloc[-10:]

## Prepare the data

### Sort the data

In [None]:
mortality_data.sort_values('DeathRate', ascending=False).head(3)

In [None]:
mortality_data.sort_values(['Year','DeathRate']).head(3)

In [None]:
# Sort by Year ascending, then by DeathRate descending within each year.
mortality_data.sort_values(
    by=['Year', 'DeathRate'], ascending=[True, False]).head()

### Apply statistical methods

In [None]:
mortality_data.DeathRate.mean()

In [None]:
# max() is computed per column. NOTE(review): for the string AgeGroup column this
# is presumably the greatest label in string sort order, not the oldest age group.
mortality_data[['AgeGroup','DeathRate']].max()

In [None]:
mortality_data.count()

In [None]:
# 10th and 90th percentiles of each selected column (one result row per quantile).
mortality_data[['Year', 'DeathRate']].quantile([.1,.9])

In [None]:
# Running total of DeathRate in the current row order.
mortality_data.DeathRate.cumsum()

### Use Python for column arithmetic

In [None]:
# Add a column holding each DeathRate's deviation from the overall mean.
mortality_data['MeanCentered'] = (
    mortality_data.DeathRate - mortality_data.DeathRate.mean()
)

In [None]:
mortality_data.head(4)

In [None]:
# Overwrite DeathRate with its value divided by 100000.
# NOTE: not idempotent -- re-running this cell divides the column again.
# NOTE(review): presumably converts a per-100,000 rate into a proportion; confirm
# against the dataset documentation.
mortality_data['DeathRate'] = mortality_data.DeathRate.div(100000)

In [None]:
mortality_data.head(4)

### Modify the string data in a column

In [None]:
# Zero-pad the '1-4 Years' and '5-9 Years' labels using parallel lists.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: that form is chained
# assignment, which pandas 2.2 warns about and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
mortality_data['AgeGroup'] = mortality_data.AgeGroup.replace(
    to_replace=['1-4 Years', '5-9 Years'],
    value=['01-04 Years', '05-09 Years'])

In [None]:
# Same replacement expressed as an {old: new} mapping.
# Assign the result back rather than using inplace=True on the attribute-accessed
# Series, which is chained assignment and does not update the DataFrame under
# Copy-on-Write.
mortality_data['AgeGroup'] = mortality_data.AgeGroup.replace(
    {'1-4 Years': '01-04 Years', '5-9 Years': '05-09 Years'})

In [None]:
# mortality_data['AgeGroup'] = mortality_data.AgeGroup.str.replace('1-4 Years','01-04 Years')
# mortality_data['AgeGroup'] = mortality_data.AgeGroup.str.replace('5-9 Years','05-09 Years')

In [None]:
mortality_data.head(4)

## Shape the data

### Set and use an index

In [None]:
# Make Year the row index; the column moves out of the data columns.
# Year values repeat across age groups, so this index is not unique.
mortality_data = mortality_data.set_index('Year')
mortality_data.head(2)

In [None]:
# Move the index back into a regular column and restore the default integer index.
mortality_data = mortality_data.reset_index()

In [None]:
# NOTE: the following line of code causes ValueError: Index has duplicate keys
# mortality_data = mortality_data.set_index('Year', verify_integrity=True)

In [None]:
# Use (Year, AgeGroup) as a two-level index. verify_integrity=True makes pandas
# raise ValueError if any (Year, AgeGroup) pair occurs more than once.
mortality_data = mortality_data.set_index(
    ['Year','AgeGroup'], verify_integrity=True)
mortality_data.head(2)

In [None]:
# Turn both index levels back into regular columns, then preview the result.
mortality_data = mortality_data.reset_index()
mortality_data.head(2)

### Pivot the data

In [None]:
# Reshape long -> wide: one row per Year, one column per AgeGroup,
# with DeathRate as the cell values.
mortality_wide = mortality_data.pivot(
    index='Year', columns='AgeGroup', values='DeathRate')
mortality_wide.head(3)

In [None]:
# With values= omitted, every remaining column is pivoted, so the result has
# two-level column labels: (value column, AgeGroup).
mortality_wide = mortality_data.pivot(
    index='Year', columns='AgeGroup')
mortality_wide.head(3)

### Melt the data

In [None]:
# get starting data
mortality_wide = mortality_data.pivot(
    index='Year', columns='AgeGroup', values='DeathRate')

# save to Excel format to remove indexes
mortality_wide.to_excel('mortality_wide.xlsx')
mortality_wide = pd.read_excel('mortality_wide.xlsx')

mortality_wide.head(4)

In [None]:
# Reshape wide -> long: keep Year as the identifier and unpivot the two listed
# age-group columns into (AgeGroup, DeathRate) pairs, one row per pair.
mortality_long = mortality_wide.melt(
    id_vars = 'Year',
    value_vars=['01-04 Years','05-09 Years'],
    var_name ='AgeGroup',
    value_name='DeathRate')

# Limit the displayed output to 4 rows for this one call.
with pd.option_context('display.max_rows', 4):
    display(mortality_long)

## Analyze the data

### Group the data

In [None]:
# Mean of every remaining column within each AgeGroup.
mortality_data.groupby('AgeGroup').mean()

In [None]:
# Median per Year; numeric_only=True leaves out the string AgeGroup column,
# which has no median.
mortality_data.groupby('Year').median(numeric_only=True).head(4)

In [None]:
# Count of non-null values in each remaining column for every (Year, AgeGroup) pair.
mortality_data.groupby(['Year','AgeGroup']).count().head()

### Aggregate the data

In [None]:
# Apply several aggregations at once to every remaining column; the output
# columns are labelled by (column, statistic).
mortality_data.groupby('AgeGroup').agg(['mean','median'])

In [None]:
# Summary statistics of DeathRate within each age group.
(
    mortality_data
    .groupby('AgeGroup')['DeathRate']
    .agg(['mean', 'median', 'std', 'nunique'])
)

In [None]:
# Summary statistics of DeathRate within each year; show the first three years.
(
    mortality_data
    .groupby('Year')['DeathRate']
    .agg(['mean', 'median', 'std', 'min', 'max', 'var', 'nunique'])
    .head(3)
)

## Visualize the data

In [None]:
# Line plot of DeathRate against Year, one line per age group.
mortality_data.pivot(
    index='Year', columns='AgeGroup', values='DeathRate').plot()

In [None]:
# Horizontal bar chart comparing mean, median and standard deviation of
# DeathRate across age groups.
(
    mortality_data
    .groupby('AgeGroup')['DeathRate']
    .agg(['mean', 'median', 'std'])
    .plot.barh()
)