In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Exploring
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve
%matplotlib inline

# Render floats in DataFrame output with thousands separators and two decimals,
# right-aligned in a 20-character field.
pd.set_option('display.float_format', '{:20,.2f}'.format)

import wrangle_zillow

In [2]:
# Acquire the dataset through the project's wrangle_zillow module.
# NOTE(review): presumably returns a cleaned pandas DataFrame — confirm in wrangle_zillow.py.
zillow = wrangle_zillow.wrangle_zillow()

In [3]:
# Split the data into two partitions using the project helper.
# NOTE(review): 0.8 is presumably the train fraction, and it is not visible here
# whether the helper fixes a random seed — confirm in wrangle_zillow.split_my_data.
train, test = wrangle_zillow.split_my_data(zillow, 0.8)

In [4]:
# Preview the first rows of the training partition as a sanity check on the split.
train.head()

Unnamed: 0_level_0,parcelid,airconditioningtypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid,decktypeid,finishedsquarefeet12,fips,fireplacecnt,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate,county
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1776129,17273608,0,0,2.0,3,6,0,1441,6111,1,...,494000.0,2016,321000.0,5297.86,0,0,61110060003017.0,-0.03,2017-03-13,Ventura
1129326,13977124,0,0,6.5,6,6,0,6084,6059,0,...,6396075.0,2016,4919675.0,69945.88,0,0,60590994151036.0,0.07,2017-06-09,Orange
1410513,12684160,0,0,2.0,3,6,0,1214,6037,0,...,629340.0,2016,503473.0,7196.91,0,0,60376514012008.0,-0.01,2017-03-28,Los_Angeles
1218442,13872114,0,0,2.0,3,6,0,1059,6059,0,...,151716.0,2016,101134.0,1989.8,Y,15,60590110004014.0,0.27,2017-04-14,Orange
912132,11007780,1,0,2.0,3,6,0,1292,6037,0,...,400000.0,2016,300000.0,5042.45,0,0,60371061112002.0,-0.04,2017-04-18,Los_Angeles


### Plot all continuous or numeric/ordered variables against each other in scatterplots, or through a pairplot.

In [5]:
# Names of every numeric-dtype column in the training partition.
num_vars = train.select_dtypes(include='number').columns.tolist()
num_vars

['parcelid',
 'airconditioningtypeid',
 'basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'buildingqualitytypeid',
 'decktypeid',
 'finishedsquarefeet12',
 'fips',
 'fireplacecnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'heatingorsystemtypeid',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'regionidcounty',
 'roomcnt',
 'unitcnt',
 'yearbuilt',
 'numberofstories',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'assessmentyear',
 'landtaxvaluedollarcnt',
 'taxamount',
 'taxdelinquencyyear',
 'logerror']

In [None]:
# Numeric columns treated as continuous in the plots below.
cont_vars = [
    'basementsqft',
    'finishedsquarefeet12',
    'garagetotalsqft',
    'lotsizesquarefeet',
    'poolsizesum',
    'yearbuilt',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'landtaxvaluedollarcnt',
    'taxamount',
    'logerror',
]
# Numeric columns treated as discrete (counts, type ids, flags, years) in the plots below.
disc_vars = [
    'airconditioningtypeid',
    'bathroomcnt',
    'bedroomcnt',
    'buildingqualitytypeid',
    'decktypeid',
    'fireplacecnt',
    'garagecarcnt',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'poolcnt',
    'roomcnt',
    'unitcnt',
    'numberofstories',
    'assessmentyear',
    'taxdelinquencyyear',
]

In [None]:
# Pairwise scatterplots of the continuous variables, coloured by county.
# The full frame is passed (with `vars=` restricting the plotted columns) because
# `hue='county'` must be a column of the data handed to pairplot; subsetting to
# train[cont_vars] drops 'county' and makes the hue lookup fail.
sns.pairplot(train, vars=cont_vars, hue='county');

In [None]:
# Pairwise scatterplots of the discrete variables, coloured by county.
# The full frame is passed (with `vars=` restricting the plotted columns) because
# `hue='county'` must be a column of the data handed to pairplot; subsetting to
# train[disc_vars] drops 'county' and makes the hue lookup fail.
sns.pairplot(train, vars=disc_vars, hue='county');

### Plot a correlation matrix heatmap.

In [None]:
# Annotated correlation heatmap for the continuous variables.
ax = sns.heatmap(
    data=train[cont_vars].corr(),
    annot=True,
    cmap='plasma',
)
# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for top/bottom rows being clipped by some
# matplotlib versions — confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Annotated correlation heatmap for the discrete variables.
ax = sns.heatmap(
    data=train[disc_vars].corr(),
    annot=True,
    cmap='plasma',
)
# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for top/bottom rows being clipped by some
# matplotlib versions — confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

### Plot latitude against longitude.

### Create a useable plot with 4 dimensions, e.g. x, y, color, size.

In [None]:
# Four-dimensional view: x = finished square feet, y = log error,
# colour = county, marker size = bathroom count.
# `data=train` is required so the string column names can be resolved, and y must
# name a real column (the original passed an empty string and no data).
# NOTE(review): 'logerror' was chosen as y because it is the quantity under study
# in this notebook — swap for another column if a different view was intended.
sns.scatterplot(
    data=train,
    x='finishedsquarefeet12',
    y='logerror',
    hue='county',
    size='bathroomcnt',
);

### Compute the mean(logerror) by zip code and the overall mean(logerror). Write a loop that will run a t-test between the overall mean and the mean for each zip code. We want to identify the zip codes where the error is significantly higher or lower than the expected error.

In [None]:
# Mean logerror across the whole training partition, then per county.
overall_mean = train['logerror'].mean()
LA = train.loc[train['county'] == 'Los_Angeles', 'logerror'].mean()
OC = train.loc[train['county'] == 'Orange', 'logerror'].mean()
VC = train.loc[train['county'] == 'Ventura', 'logerror'].mean()

### Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?