In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
import os
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [None]:
# Location of the housing CSV, relative to the working directory.
# os.path.join is used so the separator matches the host platform.
LOCAL_DIR = os.path.join(
    "datasets",
    "cali_housing",
)
LOCAL_FILE = os.path.join(LOCAL_DIR, "housing.csv")

In [None]:
# Load the housing CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=LOCAL_FILE)

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Print a concise summary of the frame (per-column dtypes and non-null counts).
df.info()

In [None]:
# Number of rows per ocean_proximity category, most frequent first.
df["ocean_proximity"].value_counts(ascending=False)

In [None]:
# Drop the rows whose ocean_proximity is 'ISLAND'.
# The explicit .copy() makes `df` an independent frame instead of a slice of
# the frame it was filtered from, so the later column assignment
# (df['median_income_cats'] = ...) does not raise SettingWithCopyWarning.
df = df[df["ocean_proximity"] != 'ISLAND'].copy()

In [None]:
# Re-count the categories to confirm 'ISLAND' no longer appears.
df["ocean_proximity"].value_counts(ascending=False)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# One 50-bin histogram per numeric column, drawn on a large 20x15 figure.
df.hist(figsize=(20, 15), bins=50)

## Three Ways to Deal with Missing Values
### 1. Drop the rows containing missing values
### 2. Drop the whole column that has missing values
### 3. For numerical columns, impute (estimate) the missing values with the mean, median, or some other statistic of the column

In [None]:
# Option 1: discard the rows where total_bedrooms is missing.
# The result is only displayed; df itself is not modified.
df.dropna(axis="index", subset=["total_bedrooms"])

In [None]:
# Option 2: discard the total_bedrooms column altogether.
# The result is only displayed; df itself is not modified.
df.drop(columns="total_bedrooms")

In [None]:
# Row count of df; the two cells above returned new frames and left df untouched.
df.shape[0]

In [None]:
# Option 3: replace missing total_bedrooms values with the column median.
# The filled Series is only displayed; df itself is not modified.
median = df["total_bedrooms"].median()
df["total_bedrooms"].fillna(value=median)

In [None]:
# Pairwise correlation between the numeric columns only.
# ocean_proximity holds strings, and pandas >= 2.0 raises on df.corr() when
# non-numeric columns are present rather than silently skipping them, so the
# frame is restricted to numeric dtypes first (works on older pandas as well).
corr_matrix = df.select_dtypes(include="number").corr()

In [None]:
# Display the full correlation matrix.
corr_matrix

In [None]:
# Correlation of every column with median_house_value, strongest positive first.
(
    corr_matrix["median_house_value"]
    .sort_values(ascending=False)
)

In [None]:
# Distribution of median_income (10 bins, the pandas default).
df["median_income"].hist(bins=10)

In [None]:
# Six evenly spaced edges over [0, 16] give five equal-width income intervals.
buckets = np.linspace(start=0, stop=16, num=6)

# Label each row with the interval its median_income falls into.
# NOTE(review): values outside (0, 16] would become NaN - confirm the data range.
df["median_income_cats"] = pd.cut(x=df["median_income"], bins=buckets)

In [None]:
# Number of rows in each income interval, largest group first.
df["median_income_cats"].value_counts(sort=True)

In [None]:
# Share of all rows (len(df), including any uncategorised ones) in each interval.
df["median_income_cats"].value_counts().div(len(df))