# Altair workbook

Reproducing the seaborn workbook, but with the visualizations built in Altair.

In [None]:
import numpy as np
import pandas as pd
import altair as alt

## Data

We will use the same mortgage data we practiced with in the pandas tutorials.

In [None]:
# Read the mortgage-loan subset into a DataFrame; every chart below uses it.
data = pd.read_csv(filepath_or_buffer='../data/loan_data_subset.csv')
# Preview the first rows to check the columns that were loaded.
data.head(n=5)

## How many types of loans are there in this dataset? Which one is most requested?

(barchart)

In [None]:
# Horizontal bar chart: one bar per loan purpose, bar length = number of rows
# with that purpose. Hovering over a bar shows the count.
alt.Chart(data).mark_bar().encode(
    y=alt.Y('loan_purpose_name'),
    x=alt.X('count(loan_purpose_name)'),
    tooltip=[alt.Tooltip('count(loan_purpose_name)')],
).properties(height=300)

### Applying sorting to the bar chart

In [None]:
# Same count-per-loan-purpose bar chart, with the bars ordered by descending
# x value (the count), so the most frequent purpose comes first.
alt.Chart(data).mark_bar().encode(
    y=alt.Y('loan_purpose_name', sort='-x'),
    x=alt.X('count(loan_purpose_name)'),
    tooltip=[alt.Tooltip('count(loan_purpose_name)')],
).properties(height=300)

## There are multiple loan agencies (`agency_abbr`). Do any of them get noticeably more requests from low- or high-income applicants, or for a specific type of loan?

Let's first make a bar chart of agency and median income of applicants:

In [None]:
# Bar chart of the median `applicant_income_000s` per agency, with agencies
# ordered by descending y value (the median). The tooltip shows the median
# and the count of `applicant_income_000s` for each bar.
alt.Chart(data, width=500).mark_bar().encode(
    x=alt.X('agency_abbr', sort='-y'),
    y='median(applicant_income_000s)',
    # The color channel is built with alt.Color (the original used alt.X, a
    # positional-channel class, here). The same sort keeps the color/legend
    # order aligned with the bar order.
    color=alt.Color('agency_abbr', sort='-y'),
    tooltip=['median(applicant_income_000s)', 'count(applicant_income_000s)'],
)

#### Now let's make a bar chart categorizing with both agency and loan type

In [None]:
# Small multiples: one column (facet) per loan purpose; within each facet,
# one bar per agency showing the number of rows, colored by agency.
alt.Chart(data).mark_bar().encode(
    x=alt.X('agency_abbr', sort='y'),
    y=alt.Y('count(loan_purpose_name)'),
    color=alt.Color('agency_abbr'),
    column=alt.Column('loan_purpose_name:O'),
    tooltip=[alt.Tooltip('count(loan_purpose_name)')],
).properties(width=200)

## Distribution of loan amount as a boxplot, categorized by agency and loan type

In [None]:
# Boxplots of `loan_amount_000s` per agency, faceted into one column per
# loan purpose and colored by agency. The y axis uses the default scale.
alt.Chart(data).mark_boxplot().encode(
    x=alt.X('agency_abbr', sort='y'),
    y=alt.Y('loan_amount_000s'),
    color=alt.Color('agency_abbr'),
    column=alt.Column('loan_purpose_name:O'),
).properties(width=200)

The outliers are strong with this one!

In [None]:
# Same faceted boxplots, but with a logarithmic y scale so that the extreme
# loan amounts no longer squash the boxes.
log_y_scale = alt.Scale(type="log")
alt.Chart(data).mark_boxplot().encode(
    x=alt.X('agency_abbr', sort='y'),
    y=alt.Y('loan_amount_000s', scale=log_y_scale),
    color=alt.Color('agency_abbr'),
    column=alt.Column('loan_purpose_name:O'),
).properties(width=200)

## How are the loan amounts distributed per type of loan?

Let's first look at the median.

In [None]:
# Bar chart of the median `loan_amount_000s` for each loan purpose.
alt.Chart(data).mark_bar().encode(
    y=alt.Y('median(loan_amount_000s)'),
    x=alt.X('loan_purpose_name'),
).properties(width=500)

How are they distributed?

In [None]:
# Violin-style plot of `loan_amount_000s` per loan purpose.
# A density estimate is computed separately for each loan purpose, then drawn
# as a horizontal area that is stacked around the center so it is mirrored
# left/right. Facets are laid out side by side with no spacing or view border.
alt.Chart(data, width=100).transform_density(
    density='loan_amount_000s',
    groupby=['loan_purpose_name'],
    as_=['loan_amount_000s', 'density'],
).mark_area(orient='horizontal').encode(
    x=alt.X(
        'density:Q',
        # Center the stacked area to get the symmetric violin outline.
        stack='center',
        impute=None,
        title=None,
        # Show only a single tick at 0; hide labels and grid lines.
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
    y=alt.Y('loan_amount_000s', scale=alt.Scale(type="linear")),
    color=alt.Color('loan_purpose_name'),
    column=alt.Column(
        'loan_purpose_name:N',
        # Put the facet title and labels underneath the violins.
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ),
    ),
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

Let's repeat this on a log scale by taking the base-10 log of the loan amounts.

In [None]:
# Store the base-10 logarithm of the loan amount as a new column so the
# following plots can work on the log-transformed values.
loan_amount_values = data['loan_amount_000s']
data['log_loan_amount_000s'] = np.log10(loan_amount_values)

In [None]:
# Violin-style plot of the log10 loan amount per loan purpose.
# The density is evaluated over the fixed range [0, 4] of the log10 values,
# separately for each loan purpose, and drawn as a centered horizontal area.
alt.Chart(data, width=100).transform_density(
    density='log_loan_amount_000s',
    groupby=['loan_purpose_name'],
    extent=[0,4],
    as_=['log_loan_amount_000s', 'density'],
).mark_area(orient='horizontal').encode(
    x=alt.X(
        'density:Q',
        # Center the stacked area to get the symmetric violin outline.
        stack='center',
        impute=None,
        title=None,
        # Show only a single tick at 0; hide labels and grid lines.
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
    y=alt.Y('log_loan_amount_000s', scale=alt.Scale(type="linear")),
    color=alt.Color('loan_purpose_name'),
    column=alt.Column(
        'loan_purpose_name:N',
        # Put the facet title and labels underneath the violins.
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ),
    ),
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

### Let's visualize that with a jittered strip plot (similar to a swarm plot)
*a.k.a. remember `geom_jitter` in `ggplot`? (lecture 2 by Suman)*

In [None]:
# Jittered strip plot of the log10 loan amount, one facet per loan purpose.
# Each row gets a random horizontal offset (`jitter`) so that points with
# similar y values do not all sit on top of each other.
stripplot = alt.Chart(data, width=200).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).mark_circle(size=8).encode(
    y=alt.Y('log_loan_amount_000s:Q'),
    x=alt.X(
        'jitter:Q',
        title=None,
        # The jitter values carry no meaning: show one tick at 0 and nothing else.
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(),
    ),
    # The facet header already names the loan purpose, so the legend is hidden.
    color=alt.Color('loan_purpose_name:N', legend=None),
    column=alt.Column(
        'loan_purpose_name:N',
        header=alt.Header(
            labelAngle=-90,
            titleOrient='top',
            labelOrient='bottom',
            labelAlign='right',
            labelPadding=3,
        ),
    ),
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

# Display the chart as the cell output.
stripplot

In [None]:

# Scatter plot of loan amount against applicant income, both on log scales,
# colored by loan purpose. The tooltip shows `agency_name`, and
# `.interactive()` makes the chart pannable and zoomable.
alt.Chart(data).mark_circle(size=60).encode(
    y=alt.Y('applicant_income_000s', scale=alt.Scale(type='log')),
    x=alt.X('loan_amount_000s', scale=alt.Scale(type='log')),
    color=alt.Color('loan_purpose_name'),
    tooltip=[alt.Tooltip('agency_name')],
).interactive()

