# 8. `College`

In [None]:
import altair as alt
import polars as pl

import isl

# Load the 'College' data set through the project's `isl` helper.
df = isl.get_data('College')

# Human-readable descriptions keyed by column name; looked up for chart titles
# further down in this section.
columns = {
    # admissions
    'Private': 'Public/private indicator',
    'Apps': 'Number of applications received',
    'Accept': 'Number of applicants accepted',
    'Enroll': 'Number of new students enrolled',
    'Top10perc': 'New students from top 10 % of high school class',
    'Top25perc': 'New students from top 25 % of high school class',
    # enrolment
    'F.Undergrad': 'Number of full-time undergraduates',
    'P.Undergrad': 'Number of part-time undergraduates',
    # costs
    'Outstate': 'Out-of-state tuition',
    'Room.Board': 'Room and board costs',
    'Books': 'Estimated book costs',
    'Personal': 'Estimated personal spending',
    # faculty
    'PhD': 'Percent of faculty with Ph.D.s',
    'Terminal': 'Percent of faculty with terminal degree',
    'S.F.Ratio': 'Student/faculty ratio',
    # outcomes
    'perc.alumni': 'Percent of alumni who donate',
    'Expend': 'Instructional expenditure per student',
    'Grad.Rate': 'Graduation rate',
}

# Summary statistics as the cell output.
df.describe()

In [None]:
# Label a random subset of the 50 colleges with the highest `Top10perc`.
# The sample is seeded so that Restart & Run All always draws the same ten
# colleges; without a seed the chart changes on every execution.
(
    df.top_k(50, by='Top10perc')
    .sample(10, seed=42)
    .sort('Top10perc', descending=True)
    # 'Rank' is the row position after sorting, i.e. 0 is the highest of the sample.
    .with_row_index('Rank')
    .plot.text(
        # Reversed x scale: the highest-ranked college is drawn on the right.
        x=alt.X('Rank').scale(reverse=True),
        y='Top10perc',
        text='Name',
    )
)

In [None]:
# Scatter-plot matrix of a few College features, coloured by the Private flag.
features = ['Top10perc', 'Apps', 'Enroll']

# Tooltip shows the college name and Private flag followed by every plotted feature.
college_tooltip = ['Name:N', 'Private:N', *features]

# One 100x100 panel; the x/y fields are filled in by `repeat` below.
college_panel = alt.Chart(df).mark_point().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color='Private:N',
    tooltip=college_tooltip,
).properties(width=100, height=100)

# Rows follow `features`; columns use the reversed order.
college_panel.repeat(row=features, column=list(reversed(features))).interactive()

In [None]:
# Box plot of out-of-state tuition, one box per value of `Private`.
df.plot.boxplot(
    y='Private',
    x='Outstate:Q',
    color='Private',
    tooltip=['Name', 'Outstate:Q'],
)

## 8f using `pl.Expr`

In [None]:
# Flag a college as 'Elite' when more than 50 % of its new students come from
# the top 10 % of their high school class, then compare tuition by that flag.
is_elite = (pl.col('Top10perc') > 50).alias('Elite')
df.with_columns(is_elite).plot.boxplot(x='Outstate', y='Elite', color='Elite')

## 8f using `Expr.cut()` (unnecessary)

In [None]:
# Same box plot, but the 'Elite' flag is produced by binning `Top10perc` at the
# single breakpoint 50 with `cut` and casting the labelled bins to bool.
# NOTE(review): how the '1'/'0' labels end up as True/False after `cast(bool)`
# is not evident from this cell - confirm it agrees with the `> 50` flag used
# in the previous cell.
elite_from_cut = (
    pl.col('Top10perc')
    .cut([50], labels=['1', '0'])
    .cast(bool)
    .alias('Elite')
)
df.with_columns(elite_from_cut).plot.boxplot(x='Outstate', y='Elite', color='Elite')

In [None]:
def kde(column):
    """Return a 50x50 KDE chart of `df[column]`, titled with `columns[column]`.

    Reads the notebook-level `df` and `columns` at call time.
    """
    density = df[column].plot.kde()
    return density.properties(title=columns[column], width=50, height=50)


# Three densities side by side (`|` concatenates the charts horizontally).
phd_density, terminal_density, personal_density = (
    kde(name) for name in ('PhD', 'Terminal', 'Personal')
)
phd_density | terminal_density | personal_density

---
# 9. `Auto`

In [None]:
import altair as alt
import polars as pl

import isl

# Load the 'Auto' data set through the project's `isl` helper.
# NOTE: this rebinds the notebook-wide `df`; from here on `df` no longer
# refers to the College data loaded in section 8.
df = isl.get_data('Auto')
# Summary statistics as the cell output.
df.describe()

In [None]:
# Scatter-plot matrix of the quantitative Auto features, coloured by origin.
quantitative_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
(
    alt.Chart(df)
    .mark_point()
    .encode(
        # zero=False lets each axis fit the data range instead of starting at 0.
        x=alt.X(alt.repeat("column"), type='quantitative').scale(zero=False),
        y=alt.Y(alt.repeat("row"), type='quantitative').scale(zero=False),
        color='origin:N',
        # The tooltip previously listed 'Name' and 'Private', which are College
        # fields; the Auto cells in this section use 'name' and 'origin'.
        tooltip=['name:N', 'origin:N'] + quantitative_features,
    )
    .properties(
        width=100,
        height=100,
    )
    .repeat(
        # Rows follow the feature list; columns use the reversed order.
        row=quantitative_features,
        column=quantitative_features[::-1],
    )
    .interactive()
)

In [None]:
from polars_ds.eda import diagnosis, plots
import polars_ds as pds

# polars_ds linear-regression plot for the 'weight' and 'mpg' columns.
# NOTE(review): add_bias=True presumably adds an intercept term - confirm
# against the polars_ds documentation.
weight_mpg_fit = plots.plot_lin_reg(df, 'weight', 'mpg', add_bias=True)
weight_mpg_fit.interactive()

In [None]:
# Run polars_ds's simple linear regression on 'weight' and 'mpg' and append the
# fields of the resulting struct as separate columns.
# NOTE(review): the field names produced by return_pred=True are not visible
# here - check the output.
fit_columns = pds.simple_lin_reg(
    'weight',
    'mpg',
    add_bias=True,
    return_pred=True,
).struct.unnest()
df.with_columns(fit_columns)

In [None]:
# One box plot per quantitative feature, split and coloured by origin.
quantitative_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
for feature in quantitative_features:
    # zero=False lets the x axis fit the feature's range instead of starting at 0.
    feature_axis = alt.X(feature).scale(zero=False)
    origin_boxes = df.plot.boxplot(x=feature_axis, y='origin', color='origin')
    # Charts built inside a loop are not auto-displayed, so show each explicitly.
    origin_boxes.properties(width=100).show()


## 9c calculate standard deviation 
- sum of squared deviations from the mean, divided by `N - 1` for the sample estimate, then square-rooted

In [None]:
# Acceleration values of the rows whose origin is 'USA'.
x = df.filter(pl.col('origin') == 'USA')['acceleration']

# Hand-rolled sample standard deviation: squared deviations from the mean,
# summed, divided by N - 1, then square-rooted.
squared_deviations = (x - x.mean()).pow(2)
sample_variance = squared_deviations.sum() / (x.len() - 1)

# Show the manual value next to the built-in result for comparison.
sample_variance ** 0.5, x.std(ddof=1)

In [None]:
# mpg against each quantitative feature, one 100x100 panel per feature.
# Relies on `quantitative_features` defined in an earlier cell of this section.
mpg_tooltip = ['name:N', 'origin:N', *quantitative_features]

mpg_panel = alt.Chart(df).mark_point().encode(
    # The x field is supplied by `repeat`; zero=False fits the axis to the data.
    x=alt.X(alt.repeat("column"), type='quantitative').scale(zero=False),
    y='mpg:Q',
    color='origin:N',
    tooltip=mpg_tooltip,
).properties(width=100, height=100)

mpg_panel.repeat(column=quantitative_features)

---
# 10. `Boston`

In [None]:
import altair as alt
import polars as pl

import isl

# Load the 'Boston' data set through the project's `isl` helper.
# NOTE: this rebinds the notebook-wide `df` (previously the Auto data).
df = isl.get_data('Boston')
# Human-readable descriptions keyed by column name. This also rebinds
# `columns` (previously the College descriptions); the keys are iterated by
# the scatter cell below to choose which fields to plot against 'crim'.
columns = {
    'crim': 'per capita crime rate by town',
    'zn': 'proportion of residential land zoned for lots over 25,000 sq.ft.',
    'indus': 'proportion of non-retail business acres per town',
    'chas': 'Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)',
    'nox': 'nitrogen oxides concentration (parts per 10 million)',
    'rm': 'average number of rooms per dwelling',
    'age': 'proportion of owner-occupied units built prior to 1940',
    'dis': 'weighted mean of distances to five Boston employment centres',
    'rad': 'index of accessibility to radial highways',
    'tax': 'full-value property-tax rate per $10,000',
    'ptratio': 'pupil-teacher ratio by town',
    'lstat': 'lower status of the population (percent)',
    'medv': 'median value of owner-occupied homes in $1000s',
}
# Summary statistics as the cell output.
df.describe()

In [None]:
# Every described column except the response 'crim' gets its own panel.
predictors = [name for name in columns if name != 'crim']

# Restrict to the rows whose crime rate exceeds 5.
high_crime = df.filter(pl.col('crim') > 5)

# Crime rate against each predictor, coloured by the Charles River dummy.
(
    high_crime.plot.scatter(
        # The x field is supplied by `repeat`; zero=False fits the axis to the data.
        x=alt.X(alt.repeat("column"), type='quantitative').scale(zero=False),
        y='crim:Q',
        color='chas:N',
    )
    .properties(width=100, height=100)
    .repeat(column=predictors)
    .interactive()
)