Current Goal:
Ensure that data is being prepped in the most appropriate way possible

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
import math

import acquire

# Statistical Tests
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

In [2]:
# Unpack the three frames returned by acquire.prepare_zillow() into the train /
# validate / test splits. The function lives in the local `acquire` module, so how it
# cleans and splits the data is not visible in this notebook.
train, validate, test = acquire.prepare_zillow()

In [3]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,fullbathcnt,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
48285,2.0,2.0,4.0,1170.0,Los Angeles,2.0,7.0,33876919.0,-118405108.0,2258.0,...,1916.0,38403.0,294146.0,255743.0,3376.55,60376200000000.0,0.163588,2017-06-27,Floor/Wall,Single Family Residential
4918,3.0,2.0,7.0,1827.0,Los Angeles,3.0,2.0,33765300.0,-118172000.0,15018.0,...,1970.0,360413.0,604073.0,243660.0,7207.01,60375800000000.0,0.049248,2017-01-25,Central,Condominium
14238,3.0,3.0,8.0,1949.0,Los Angeles,3.0,2.0,34111771.0,-117748974.0,3317.0,...,1980.0,106726.0,177748.0,71022.0,2210.76,60374000000000.0,0.041176,2017-03-07,Central,Planned Unit Development
33467,3.0,4.0,6.639845,1835.0,Ventura,3.0,2.0,34181577.0,-118953714.0,6098.0,...,1974.0,160407.0,267340.0,106933.0,2801.2,61110100000000.0,0.089397,2017-05-11,Central,Condominium
22026,3.0,3.0,4.0,1287.0,Los Angeles,3.0,2.0,33802256.0,-118187611.0,6255.0,...,1922.0,159987.0,338075.0,178088.0,4171.91,60375700000000.0,0.035123,2017-04-04,Central,Single Family Residential


In [4]:
# (rows, columns) of each split, one per line, in train -> validate -> test order.
print(*(frame.shape for frame in (train, validate, test)), sep="\n")

(52518, 25)
(13130, 25)
(7295, 25)


In [5]:
# Dtypes and non-null counts for every column of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52518 entries, 48285 to 12816
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   52518 non-null  float64
 1   bedroomcnt                    52518 non-null  float64
 2   buildingqualitytypeid         52518 non-null  float64
 3   calculatedfinishedsquarefeet  52518 non-null  float64
 4   fips                          52518 non-null  object 
 5   fullbathcnt                   52518 non-null  float64
 6   heatingorsystemtypeid         52518 non-null  float64
 7   latitude                      52518 non-null  float64
 8   longitude                     52518 non-null  float64
 9   lotsizesquarefeet             52518 non-null  float64
 10  propertycountylandusecode     52518 non-null  object 
 11  propertylandusetypeid         52518 non-null  object 
 12  rawcensustractandblock        52518 non-null  object 
 1

In [6]:
# Load the full Zillow frame through the local `acquire` module. Judging by the
# function name this reads from a local cache -- confirm in acquire.py.
df = acquire.acquire_cache_zillow()

In [7]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [8]:
# Pass the frame through the acquire module's missing-value handler and rebind `df`
# to whatever it returns. Because `df` is overwritten, re-running this cell on its own
# feeds the already-processed frame back in rather than the freshly loaded one.
df = acquire.handle_missing_values(df)
df.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
0,1727539,14297519,3.5,4.0,,3.5,3100.0,3100.0,6059.0,3.0,...,485713.0,1023282.0,2016.0,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,,Single Family Residential
1,1387261,17052889,1.0,2.0,,1.0,1465.0,1465.0,6111.0,1.0,...,88000.0,464000.0,2016.0,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,,Single Family Residential
2,11677,14186244,2.0,3.0,,2.0,1243.0,1243.0,6059.0,2.0,...,85289.0,564778.0,2016.0,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,,Single Family Residential
3,2288172,12177905,3.0,4.0,8.0,3.0,2376.0,2376.0,6037.0,3.0,...,108918.0,145143.0,2016.0,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,Central,Single Family Residential
4,1970746,10887214,3.0,3.0,8.0,3.0,1312.0,1312.0,6037.0,3.0,...,73681.0,119407.0,2016.0,45726.0,1533.89,60371240000000.0,0.00694,2017-01-01,Central,Condominium


In [9]:
# Column names of `df` after the missing-value step.
df.columns

Index(['id', 'parcelid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid',
       'calculatedbathnbr', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'fips', 'fullbathcnt', 'heatingorsystemtypeid',
       'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidzip', 'roomcnt', 'unitcnt', 'yearbuilt',
       'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear',
       'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock', 'logerror',
       'transactiondate', 'heatingorsystemdesc', 'propertylandusedesc'],
      dtype='object')

In [10]:
# Distinct values of the zoning description column.
df.propertyzoningdesc.unique()

array([nan, 'LCR110000*', 'LAR3', ..., 'HAR4-R2*', 'LCR1VV', 'BFA15000*'],
      dtype=object)

In [11]:
# Distinct values of regionidzip.
# NOTE(review): this prints the whole array; .nunique() or .value_counts().head()
# would give the same insight with far less output.
df.regionidzip.unique()

array([ 96978.,  97099.,  97078.,  96330.,  96451.,  97091.,  96293.,
        96325.,  96173.,  96047.,  96374.,  96966.,  97051.,  97008.,
        97107.,  96962.,  96275.,  96003.,  96220.,  96971.,  96954.,
        96018.,  96120.,  96237.,  96957.,  96352.,  96008.,  97005.,
        96122.,  97067.,  96116.,  96271.,  96946.,  96446.,  96349.,
        96987.,  96450.,  96447.,  96000.,  96291.,  96292.,  96016.,
        96133.,  96486.,  96361.,  96983.,  96998.,  96414.,  96464.,
        97106.,  96517.,  96982.,  96241.,  96121.,  96488.,  97047.,
        96522.,  96961.,  96370.,  95998.,  96366.,  96387.,  96507.,
        96993.,  96506.,  96494.,  96212.,  96337.,  96401.,  96049.,
        97018.,  96229.,  97079.,  96020.,  97118.,  96437.,  95983.,
        96185.,  96086.,  96218.,  96025.,  96236.,  96058.,  96974.,
        97063.,  96373.,  96452.,  96505.,  96294.,  96280.,  97020.,
        97068.,  95984.,  97003.,  96389.,  96208.,  97024.,  97318.,
        96101.,  970

In [12]:
# Length of the regionidzip column, i.e. the number of rows in `df`.
df.regionidzip.shape

(73528,)

In [17]:
# Show the head of the training split again (same call as the first preview cell).
train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,fullbathcnt,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
48285,2.0,2.0,4.0,1170.0,Los Angeles,2.0,7.0,33876919.0,-118405108.0,2258.0,...,1916.0,38403.0,294146.0,255743.0,3376.55,60376200000000.0,0.163588,2017-06-27,Floor/Wall,Single Family Residential
4918,3.0,2.0,7.0,1827.0,Los Angeles,3.0,2.0,33765300.0,-118172000.0,15018.0,...,1970.0,360413.0,604073.0,243660.0,7207.01,60375800000000.0,0.049248,2017-01-25,Central,Condominium
14238,3.0,3.0,8.0,1949.0,Los Angeles,3.0,2.0,34111771.0,-117748974.0,3317.0,...,1980.0,106726.0,177748.0,71022.0,2210.76,60374000000000.0,0.041176,2017-03-07,Central,Planned Unit Development
33467,3.0,4.0,6.639845,1835.0,Ventura,3.0,2.0,34181577.0,-118953714.0,6098.0,...,1974.0,160407.0,267340.0,106933.0,2801.2,61110100000000.0,0.089397,2017-05-11,Central,Condominium
22026,3.0,3.0,4.0,1287.0,Los Angeles,3.0,2.0,33802256.0,-118187611.0,6255.0,...,1922.0,159987.0,338075.0,178088.0,4171.91,60375700000000.0,0.035123,2017-04-04,Central,Single Family Residential


In [22]:
# 25th and 75th percentiles of finished square footage in the training split.
# q1 / q3 are notebook-level names and are rebound by later cells.
q1, q3 = train.calculatedfinishedsquarefeet.quantile([.25, .75])
q1, q3

(1174.0, 2074.0)

In [23]:
# Column names of the training split as a plain Python list.
train.columns.to_list()

['bathroomcnt',
 'bedroomcnt',
 'buildingqualitytypeid',
 'calculatedfinishedsquarefeet',
 'fips',
 'fullbathcnt',
 'heatingorsystemtypeid',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'propertycountylandusecode',
 'propertylandusetypeid',
 'rawcensustractandblock',
 'regionidcounty',
 'regionidzip',
 'yearbuilt',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'landtaxvaluedollarcnt',
 'taxamount',
 'censustractandblock',
 'logerror',
 'transactiondate',
 'heatingorsystemdesc',
 'propertylandusedesc']

In [34]:
def compress_outliers(df, multiplier=6):
    '''
    Caps extreme values in every numeric column of a dataframe using IQR-based thresholds.

    For each numeric column the interquartile range (IQR = 75th percentile - 25th percentile)
    is computed and two thresholds are derived:
        upper threshold = 75th percentile + multiplier * IQR
        lower threshold = 25th percentile - multiplier * IQR
    Any values above the upper threshold are set to be equal to the upper threshold.
    Any values below the lower threshold are set to be equal to the lower threshold.
    Missing values are left as they are, and non-numeric columns (strings, booleans,
    datetimes, categoricals) are passed through untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to compress. It is not modified; the work is done on a copy.
    multiplier : int or float, default 6
        How many IQRs beyond the quartiles a value may lie before it is capped.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same index and columns as df, numeric columns capped.
    '''
    # Work on a copy so the caller's frame (and anything already displayed from it)
    # keeps its original values.
    compressed = df.copy()
    # select_dtypes('number') keeps numeric columns only, so booleans, datetimes and
    # categoricals never reach quantile().
    for column in compressed.select_dtypes(include='number').columns:
        q1, q3 = compressed[column].quantile([.25, .75])
        iqr = q3 - q1
        # clip() caps both tails in one vectorised pass and leaves NaN untouched.
        compressed[column] = compressed[column].clip(
            lower=q1 - multiplier * iqr,
            upper=q3 + multiplier * iqr,
        )
    return compressed

In [36]:
# Cap extreme values in the training split with compress_outliers, then check the
# square-footage summary: the max is expected to be no larger than the upper cap
# (75th percentile + 6 * IQR).
df_compressed = compress_outliers(train)
df_compressed.calculatedfinishedsquarefeet.describe()

count    52518.000000
mean      1758.142713
std        903.074967
min        152.000000
25%       1174.000000
50%       1524.000000
75%       2074.000000
max       7474.000000
Name: calculatedfinishedsquarefeet, dtype: float64

In [30]:
# Quartiles of bathroomcnt on `df`; rebinds the notebook-level q1 / q3.
q1, q3 = df['bathroomcnt'].quantile([.25, .75])

In [37]:
# Quartiles of fips on `df`; rebinds the notebook-level q1 / q3.
# NOTE(review): fips is stored as float64 in `df`, so this runs, but it is a county
# code (the prepared splits show county names in this column), so its quartiles are
# not meaningful.
q1, q3 = df['fips'].quantile([.25, .75])

In [38]:
# fips as it appears in `df`: numeric codes.
df['fips']

0        6059.0
1        6111.0
2        6059.0
3        6037.0
4        6037.0
          ...  
73690    6037.0
73691    6037.0
73692    6111.0
73693    6037.0
73694    6037.0
Name: fips, Length: 73528, dtype: float64

In [39]:
# fips as it appears in the prepared training split: county-name strings (dtype object).
train.fips

48285    Los Angeles
4918     Los Angeles
14238    Los Angeles
33467        Ventura
22026    Los Angeles
            ...     
21993    Los Angeles
26824    Los Angeles
24732    Los Angeles
52546    Los Angeles
12816    Los Angeles
Name: fips, Length: 52518, dtype: object