<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Description" data-toc-modified-id="Data-Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Description</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Useful-Scripts" data-toc-modified-id="Useful-Scripts-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Useful Scripts</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Univariate-Analysis" data-toc-modified-id="Univariate-Analysis-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Univariate Analysis</a></span><ul class="toc-item"><li><span><a href="#Discrete-variables-and-Categorical-variables" data-toc-modified-id="Discrete-variables-and-Categorical-variables-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Discrete variables and Categorical variables</a></span></li><li><span><a href="#Continuous-variables" data-toc-modified-id="Continuous-variables-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Continuous variables</a></span><ul class="toc-item"><li><span><a href="#histograms" data-toc-modified-id="histograms-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>histograms</a></span></li></ul></li></ul></li><li><span><a href="#Bi-variate-Analysis" data-toc-modified-id="Bi-variate-Analysis-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Bi-variate Analysis</a></span><ul class="toc-item"><li><span><a href="#Scatter-plots" data-toc-modified-id="Scatter-plots-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Scatter plots</a></span></li></ul></li><li><span><a href="#Multi-variate-Analysis" data-toc-modified-id="Multi-variate-Analysis-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Multi-variate Analysis</a></span><ul class="toc-item"><li><span><a href="#countplots" data-toc-modified-id="countplots-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>countplots</a></span></li></ul></li><li><span><a href="#Map-visualization" data-toc-modified-id="Map-visualization-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Map visualization</a></span></li></ul></div>

# Data Description

This dataset contains house sale prices for King County, which includes Seattle.
It includes homes sold between May 2014 and May 2015.

![](../data/raw/data_description.png)

# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# random state
random_state=100
np.random.seed(random_state) # we need this in each cell
np.random.set_state=random_state

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 20) # None for all the rows
pd.set_option('display.max_colwidth', 50)

import IPython
from IPython.display import display

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])

[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [4]:
import bokeh
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show, reset_output
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, column, gridplot
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral6
from bokeh.models import ColumnDataSource,FactorRange

# Output the visualization directly in the notebook
output_notebook()

[(x.__name__,x.__version__) for x in [bokeh]]

[('bokeh', '1.3.4')]

In [5]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

# Useful Scripts

In [6]:
def show_method_attributes(method, ncols=7):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [i for i in dir(method) if i[0]!='_']
    x = [i for i in x if i not in 'os np pd sys time psycopg2'.split()]

    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')

# Load the data

In [7]:
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
print(df.shape)
df.head().T

(21613, 92)


Unnamed: 0,0,1,2,3,4
id,7129300520,6414100192,5631500400,2487200875,1954400510
date,2014-10-13,2014-12-09,2015-02-25,2014-12-09,2015-02-18
price,2.2e+05,5.4e+05,1.8e+05,6e+05,5.1e+05
bedrooms,3,3,2,4,3
bathrooms,1,2.2,1,3,2
sqft_living,1180,2570,770,1960,1680
sqft_lot,5650,7242,10000,5000,8080
floors,1,2,1,1,1
waterfront,0,0,0,0,0
view,0,0,0,0,0


# Univariate Analysis

## Discrete variables and Categorical variables

In [8]:
df['bedrooms'].value_counts()

3     9824
4     6882
2     2760
5     1601
6      272
1      199
7       38
8       13
0       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [9]:
from bhishan.util_bokeh import countplot_bokeh

In [10]:
ofile = '../reports/bokeh_outputs/bedrooms_countplot.html'
countplot_bokeh(df, 'bedrooms',height=400,ofile=None)

In [11]:
countplot_bokeh(df,'view',height=300)

## Continuous variables

### histograms

In [12]:
from bhishan.util_bokeh import histogram_bokeh

histogram_bokeh(df,'sqft_living',n_bins=20)

# Bi-variate Analysis

## Scatter plots

In [13]:
from bhishan.util_bokeh import scatterplot_bokeh

In [14]:
ofile = '../reports/bokeh_outputs/sqftLiving_vs_price.html'
scatterplot_bokeh(df,'sqft_living','price',ofile=ofile)

# Multi-variate Analysis

## countplots

In [15]:
from bhishan.util_bokeh import stacked_countplot_bokeh

stacked_countplot_bokeh(df,'bedrooms','yr_sales','price')

# Map visualization

In [16]:
df[['lat','long']].head()

Unnamed: 0,lat,long
0,48,-120.0
1,48,-120.0
2,48,-120.0
3,48,-120.0
4,48,-120.0


In [17]:
from bhishan.util_bokeh import map_plot_bokeh

In [18]:
ofile = '../reports/bokeh_outputs/map.html'
map_plot_bokeh(df, 'lat', 'long',ofile)