<a href="https://colab.research.google.com/github/brook-miller/2023mbai417/blob/main/3-class/california_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Standard imports - we'll use in most EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

from datetime import datetime, timedelta
from dateutil.parser import parse
from google.colab import data_table
# Switch on Colab's data_table formatter for DataFrame output.
# NOTE(review): `google.colab` is presumably only importable inside Colab —
# confirm this notebook is never expected to run in plain Jupyter.
data_table.enable_dataframe_formatter()

In [None]:
#@title pulling in the California housing dataset
#@markdown description here: https://developers.google.com/machine-learning/crash-course/california-housing-data-description
from sklearn.datasets import fetch_california_housing

# as_frame=True is what makes the fetched object expose a `.frame`
# attribute; that frame is the `df` every later cell works on.
df = (
    fetch_california_housing(as_frame=True)
    .frame
)
# Bare last expression so the notebook renders the frame as the cell output.
df

In [None]:
#@title use .info() to confirm variables are as expected
# Quick structural check of the loaded frame before plotting anything:
# .info() prints its summary straight to the cell output.
df.info()

In [None]:
#@title histogram shows the distribution of a continuous variable
# Bin the MedInc column of df and draw the counts with plotly express.
fig = px.histogram(
    data_frame=df,
    x="MedInc",
)
fig.show()

In [None]:
#@title Data is from the 1990 census (fairly deep recession in 87); average new block construction peaked in 1954
# Same histogram recipe as for MedInc, this time over the HouseAge column.
fig = px.histogram(
    data_frame=df,
    x="HouseAge",
)
fig.show()

In [None]:
#@title Scatter matrix lets us analyze for relationships that might not be obvious
# Pairwise scatter plots of the columns of df; the explicit height keeps
# the grid of panels tall enough to read.
fig = px.scatter_matrix(
    data_frame=df,
    height=1200,
)
fig.show()

In [None]:
#@title The correlation matrix shows AveRooms, MedInc, MedHouseVal to be positively correlated
# Restrict the correlation to numeric columns. Later cells add a categorical
# `age_bins` column to df, so re-running this cell after them would otherwise
# feed non-numeric data into corr() and break the cell.
corr = df.corr(numeric_only=True)

# In a Heatmap, z[i][j] is drawn at (x[j], y[i]): the matrix's columns label
# the x axis and its index labels the y axis.
fig = go.Figure(
    data=go.Heatmap(
        z=corr.values,
        x=corr.columns.values,
        y=corr.index.values,
    )
)
fig.show()

In [None]:
#@title Using the cut function to create our own bins
# HouseAge in this dataset runs past 50 (it tops out at 52), so a final bin
# edge of 50 leaves the oldest blocks unbinned (NaN) and silently drops them
# from both the counts and the box plot. An open-ended last bin catches them.
bins = [0, 10, 20, 30, 40, 50, np.inf]
labels = ['<10', '10-20', '20-30', '30-40', '40-50', '50+']

# pd.cut bins are right-closed by default, e.g. (10, 20]; include_lowest=True
# additionally closes the first bin on the left so an age of exactly 0 is kept.
df['age_bins'] = pd.cut(df['HouseAge'], bins=bins, labels=labels, include_lowest=True)

# Sanity check: the only rows without a bin should be rows without an age.
assert df['age_bins'].isna().sum() == df['HouseAge'].isna().sum(), \
    "some HouseAge values fall outside the bins"

print(df.age_bins.value_counts())

# category_orders pins the x axis to bin order instead of leaving the
# ordering of the categories up to plotly.
fig = px.box(
    df,
    x="age_bins",
    y="MedHouseVal",
    points="all",
    category_orders={"age_bins": labels},
)
fig.show()

In [None]:
#@title Average incomes are lognormally distributed, but why are they peaked?
# Store the base-10 log of MedInc as its own column, then histogram it.
df["log_income"] = df["MedInc"].pipe(np.log10)
fig = px.histogram(
    data_frame=df,
    x="log_income",
)
fig.show()

In [None]:
#@title This should look like California
# One point per row of df, longitude across and latitude up; the square
# 600x600 canvas keeps the two axes on a comparable footing.
fig = px.scatter(
    data_frame=df,
    x="Longitude",
    y="Latitude",
    width=600,
    height=600,
)
fig.show()

In [None]:
#@title Using color to see the relationship between 3 variables
# Two variables go on the axes; the third (MedInc) is encoded as point color.
fig = px.scatter(
    data_frame=df,
    x="Population",
    y="MedHouseVal",
    color="MedInc",
    width=600,
    height=600,
)
fig.show()