<a href="https://colab.research.google.com/github/c-marq/CAP3321C-Data-Wrangling/blob/main/demos/ch08_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 8: How to analyze the data

In [None]:
import pandas as pd
import seaborn as sns

## How to melt columns to create long data

In [None]:
# Load the cars data set from a pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
cars = pd.read_pickle(filepath_or_buffer='cars.pkl')

In [None]:
# Preview the first five rows (the default row count for head()).
cars.head(n=5)

In [None]:
# Reshape from wide to long: price stays as the identifier column, while the
# enginesize and curbweight columns are stacked into feature/featureValue pairs.
cars_melted = cars.melt(
    id_vars='price',
    value_vars=['enginesize', 'curbweight'],
    var_name='feature',
    value_name='featureValue',
)
cars_melted

## How to plot melted columns

In [None]:
# Single scatter plot of price against the melted values, coloured by feature.
sns.relplot(
    data=cars_melted,
    x='featureValue',
    y='price',
    hue='feature',
    kind='scatter',
)

In [None]:
# One facet (column) per feature; sharex=False lets each facet use its own
# x-axis range.
sns.relplot(
    data=cars_melted,
    x='featureValue',
    y='price',
    col='feature',
    kind='scatter',
    facet_kws=dict(sharex=False),
)

## How to group and apply a single aggregate method

In [None]:
# Load the prepared fires data and replace its index with a fresh default
# integer index, discarding the old index values (drop=True).
# NOTE: only unpickle files that come from a trusted source.
fires = pd.read_pickle(filepath_or_buffer='fires_prepared.pkl').reset_index(drop=True)

In [None]:
# Preview the first three rows of the fires data.
fires.head(n=3)

In [None]:
# Mean of every numeric column for each state; show the first three states.
(
    fires
    .groupby(by='state')
    .mean(numeric_only=True)
    .head(n=3)
)

In [None]:
# Maximum of every numeric column for each state/year/month combination;
# show the first three groups.
(
    fires
    .groupby(by=['state', 'fire_year', 'fire_month'])
    .max(numeric_only=True)
    .head(n=3)
)

## How to work with a DataFrameGroupBy object

In [None]:
# Reminder of the columns available before grouping.
fires.head(n=3)

In [None]:
# Group by year with the default as_index=True, so fire_year becomes the index
# of the aggregated result.
yearly_group = fires.groupby(by='fire_year')

# Total acres burned and days burning for each year.
yearly_sums = yearly_group[['acres_burned', 'days_burning']].sum()

yearly_sums.head(n=3)

In [None]:
# Same grouping, but as_index=False keeps fire_year as a regular column in the
# aggregated result instead of moving it into the index.
yearly_group = fires.groupby(by='fire_year', as_index=False)

# Total acres burned and days burning for each year.
yearly_sums = yearly_group[['acres_burned', 'days_burning']].sum()

yearly_sums.head(n=3)

## How to apply multiple aggregate methods

In [None]:
# Reusable grouping by state, year and month for the aggregation examples
# in this section.
monthly_group = fires.groupby(by=['state', 'fire_year', 'fire_month'])

In [None]:
# Apply three aggregates to both columns; the result has a two-level column
# index of (column, aggregate).
(
    monthly_group[['acres_burned', 'days_burning']]
    .aggregate(['sum', 'count', 'mean'])
    .head(n=3)
)

In [None]:
# Apply the same three aggregates to the days_burning column only.
(
    monthly_group['days_burning']
    .aggregate(['sum', 'count', 'mean'])
    .head(n=3)
)

In [None]:
# Use a dict to choose different aggregates per column: several for the two
# numeric columns, and a simple count of fire names.
df = monthly_group.aggregate({
    'acres_burned': ['sum', 'max', 'min'],
    'days_burning': ['sum', 'mean'],
    'fire_name': 'count',
})
df.head(n=3)

## How to use the pivot() method

In [None]:
# States to keep for the pivot examples.
states = ['AK', 'CA', 'ID', 'TX']

# Yearly totals per state (state and fire_year stay as columns), restricted
# to the selected states.
top_states = (
    fires
    .groupby(by=['state', 'fire_year'], as_index=False)[['acres_burned', 'days_burning']]
    .sum()
    .query('state in @states')
)
top_states.head(n=2)

In [None]:
# Reshape to one row per year and one column per state. pivot() does no
# aggregation, so each (fire_year, state) pair must appear only once in
# top_states.
top_states.pivot(
    index='fire_year',
    columns='state',
    values='acres_burned',
).head(n=2)

In [None]:
# Plot the pivoted frame as a line chart: one line per state, years on the x-axis.
top_states.pivot(
    index='fire_year',
    columns='state',
    values='acres_burned',
).plot(kind='line')

## How to use the pivot_table() method

In [None]:
# States to keep for the pivot table.
states = ['AK', 'CA', 'ID', 'TX']

# pivot_table() aggregates while reshaping, so the raw fire rows can be passed
# in directly: acres burned are summed for each (fire_year, state) cell.
fires_top_4 = (
    fires
    .query('state in @states')
    .pivot_table(index='fire_year', columns='state',
                 values='acres_burned', aggfunc='sum')
)
fires_top_4.head(n=2)

In [None]:
# Line chart of yearly acres burned, one line per state column.
fires_top_4.plot(kind='line')

## How to create bins of equal width

In [None]:
# Keep only 2010 fires with days_burning > 0 and drop rows that contain any
# missing value. The explicit copy() makes fires_filtered an independent
# DataFrame, so the later cell that adds a fire_size column to it does not
# trigger SettingWithCopyWarning.
fires_filtered = (
    fires
    .query('fire_year == 2010 and days_burning > 0')
    .dropna()
    .copy()
)

In [None]:
# Split the range of acres_burned into four equal-width intervals.
pd.cut(x=fires_filtered['acres_burned'], bins=4)

In [None]:
# Use explicit bin edges instead of a bin count. Values that fall outside the
# edges are returned as NaN.
pd.cut(
    x=fires_filtered['acres_burned'],
    bins=[0, 100_000, 200_000, 300_000, 400_000],
)

In [None]:
# Same explicit edges, with a readable label for each of the four intervals.
pd.cut(
    x=fires_filtered['acres_burned'],
    bins=[0, 100_000, 200_000, 300_000, 400_000],
    labels=['small', 'medium', 'large', 'very large'],
)

In [None]:
# Count how many fires land in each labelled interval.
pd.cut(
    x=fires_filtered['acres_burned'],
    bins=[0, 100_000, 200_000, 300_000, 400_000],
    labels=['small', 'medium', 'large', 'very large'],
).value_counts()

## How to create bins with equal numbers of items

In [None]:
# Quantile-based bins: q=4 splits acres_burned at its quartiles, and each
# quartile gets a label.
pd.qcut(
    x=fires_filtered['acres_burned'],
    q=4,
    labels=['small', 'medium', 'large', 'very large'],
)

In [None]:
# Count the fires in each quartile bin.
pd.qcut(
    x=fires_filtered['acres_burned'],
    q=4,
    labels=['small', 'medium', 'large', 'very large'],
).value_counts()

In [None]:
# Store the quartile label of acres_burned as a new fire_size column.
fires_filtered['fire_size'] = pd.qcut(
    x=fires_filtered['acres_burned'],
    q=4,
    labels=['small', 'medium', 'large', 'very large'],
)

In [None]:
# duplicates='drop' removes repeated quantile edges instead of raising an error.
# NOTE(review): only three labels are supplied for q=4, so this presumably
# relies on exactly one duplicate edge being dropped for this data; confirm if
# the input data changes.
pd.qcut(
    x=fires_filtered['days_burning'],
    q=4,
    labels=['short', 'medium', 'long'],
    duplicates='drop',
).value_counts()

## How to plot the binned data

In [None]:
# Preview the first five rows of the filtered fires data.
fires_filtered.head(n=5)

In [None]:
# Count plot: number of fires per month, with one bar per fire_size category.
sns.catplot(
    data=fires_filtered,
    x='fire_month',
    hue='fire_size',
    kind='count',
)

## How to get the top n rows

In [None]:
# The six rows with the largest engine size.
cars.nlargest(6, 'enginesize')

In [None]:
# The six rows with the largest engine size, ordering by price as the second column.
cars.nlargest(6, ['enginesize', 'price'])

## How to calculate percent change

In [None]:
# Total acres burned for each state and year; the result is indexed by
# (state, fire_year).
df = (
    fires[['state', 'fire_year', 'acres_burned']]
    .groupby(by=['state', 'fire_year'])
    .sum()
)
df.head(n=5)

In [None]:
# Percent change in acres burned from the previous recorded year, computed
# separately for each state. A plain df.pct_change() walks straight down the
# (state, fire_year) index, so the first year of every state after the first
# would be compared with the last year of the preceding state. Grouping on the
# state index level avoids that and leaves NaN for each state's first year.
df.groupby(level='state').pct_change()

## How to rank rows

In [None]:
# Total acres burned and days burning for each state (state becomes the index).
df = fires.groupby(by='state')[['acres_burned', 'days_burning']].sum()
df.head(n=3)

In [None]:
# Rank states by acres burned; ascending=False gives rank 1 to the largest total.
df['acres_rank'] = df['acres_burned'].rank(ascending=False)

# Show the three top-ranked states.
df.sort_values(by='acres_rank').head(n=3)

In [None]:
# Rank states by days burning in ascending order. method='max' gives every
# member of a tied group the highest rank in that group.
df['days_rank'] = df['days_burning'].rank(method='max')

# Show the four states with the fewest days burning.
df.sort_values(by='days_burning').head(n=4)