In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import env


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import wrangle2
import wrangle3

In [2]:
# Unpack the seven objects returned by wrangle3.wrangle().
# NOTE(review): presumably the full training frame plus X/y splits for
# train, validate and test -- confirm against wrangle3.
(
    train,
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
) = wrangle3.wrangle()

In [3]:
# Number of training rows per county.
X_train['county'].value_counts()

Los_Angeles    23398
Orange          9865
Ventura         3121
Name: county, dtype: int64

In [4]:
# Preview a random selection of training rows. The seed is fixed so the same
# rows are displayed on every Restart & Run All.
X_train.sample(50, random_state=123)

Unnamed: 0.1,Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,...,acres,acres_bin,sqft_bin,structure_dollar_per_sqft,structure_dollar_sqft_bin,land_dollar_per_sqft,lot_dollar_sqft_bin,bath_bed_ratio,cola,baseline
824,824,12827854,2.0,3.0,1359.0,6037.0,33935113.0,-117989872.0,6353.0,14634.0,...,0.145845,0.1,0.3,111.263429,0.4,43.310877,0.3,0.666667,0,0.017913
21046,21046,12685252,3.0,3.0,1880.0,6037.0,33803851.0,-118368199.0,8864.0,33311.0,...,0.203489,0.2,0.4,45.813830,0.1,8.665727,0.2,1.000000,0,0.017913
4015,4015,13935501,2.0,3.0,1908.0,6059.0,33796151.0,-117935222.0,9360.0,16764.0,...,0.214876,0.2,0.4,61.507862,0.2,48.863568,0.3,0.666667,0,0.017913
9754,9754,11468424,1.0,2.0,806.0,6037.0,33957656.0,-118388956.0,5701.0,12447.0,...,0.130877,0.1,0.1,148.633995,0.4,106.849675,0.5,0.500000,1,0.017913
18072,18072,12878640,2.0,3.0,1145.0,6037.0,34028922.0,-117736285.0,6039.0,20008.0,...,0.138636,0.1,0.2,76.220087,0.3,14.548932,0.2,0.666667,0,0.017913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12299,12299,12624656,3.0,4.0,2430.0,6037.0,33782123.0,-118303788.0,5991.0,12447.0,...,0.137534,0.1,0.5,22.575309,0.0,8.292105,0.2,0.750000,1,0.017913
667,667,11197626,2.0,3.0,1104.0,6037.0,34563500.0,-118121152.0,8499.0,40227.0,...,0.195110,0.2,0.2,47.194746,0.1,1.535592,0.1,0.666667,0,0.017913
29834,29834,14413575,2.5,3.0,2056.0,6059.0,33512001.0,-117704994.0,5418.0,25459.0,...,0.124380,0.1,0.5,124.928016,0.4,113.743632,0.5,0.833333,0,0.017913
9303,9303,17136516,2.0,4.0,1694.0,6111.0,34235856.0,-119031039.0,6700.0,51239.0,...,0.153811,0.2,0.4,106.827627,0.4,18.006119,0.2,0.500000,0,0.017913


## Feature engineering

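- Derive new features from the existing property columns: property age, tax rate, lot size in acres, structure and land dollars per square foot, bathroom-to-bedroom ratio, a City of Los Angeles flag, and binned versions of the continuous features.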

In [None]:
# Work on an independent copy: a plain `df = X_train` only creates a second
# name for the same object, so column assignments made on `df` would also
# modify X_train.
df = X_train.copy()

In [None]:
def _bin_to_float(values, bins, labels, include_lowest=False):
    # Cut `values` into `bins` and return the bin labels as float64
    # (values outside every bin become NaN).
    binned = pd.cut(values, bins=bins, labels=labels, include_lowest=include_lowest)
    return binned.astype('float64')


def create_features(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Return a copy of df with engineered feature columns added (or overwritten).

    Columns read: yearbuilt, taxamount, taxvaluedollarcnt, lotsizesquarefeet,
    calculatedfinishedsquarefeet, structuretaxvaluedollarcnt,
    landtaxvaluedollarcnt, bathroomcnt, bedroomcnt, regionidcity.

    Columns written: age, age_bin, taxrate, acres, acres_bin, sqft_bin,
    structure_dollar_per_sqft, structure_dollar_sqft_bin,
    land_dollar_per_sqft, lot_dollar_sqft_bin, bath_bed_ratio, cola.

    All *_bin columns are float64; a value that falls outside every bin
    becomes NaN. The frame passed in is never modified.
    '''
    # Copy first so the caller's frame is left untouched. Assigning columns
    # directly on the argument would leave it half-modified.
    df = df.copy()

    # Labels shared by every ten-bin feature below.
    tenths = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]

    # age of the property relative to 2017
    df['age'] = 2017 - df.yearbuilt
    # include_lowest=True keeps age 0 (built in 2017) in the first bin
    # instead of turning it into NaN.
    df['age_bin'] = _bin_to_float(
        df.age,
        bins=[0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140],
        labels=[0, .066, .133, .20, .266, .333, .40, .466, .533,
                .60, .666, .733, .8, .866, .933],
        include_lowest=True,
    )

    # create taxrate variable (tax amount as a percentage of assessed value)
    df['taxrate'] = df.taxamount / df.taxvaluedollarcnt * 100

    # create acres variable (43,560 square feet per acre)
    df['acres'] = df.lotsizesquarefeet / 43560

    # bin acres
    df['acres_bin'] = _bin_to_float(
        df.acres,
        bins=[0, .10, .15, .25, .5, 1, 5, 10, 20, 50, 200],
        labels=tenths,
    )

    # square feet bin
    df['sqft_bin'] = _bin_to_float(
        df.calculatedfinishedsquarefeet,
        bins=[0, 800, 1000, 1250, 1500, 2000, 2500, 3000, 4000, 7000, 12000],
        labels=tenths,
    )

    # dollar per square foot-structure
    df['structure_dollar_per_sqft'] = (
        df.structuretaxvaluedollarcnt / df.calculatedfinishedsquarefeet
    )
    df['structure_dollar_sqft_bin'] = _bin_to_float(
        df.structure_dollar_per_sqft,
        bins=[0, 25, 50, 75, 100, 150, 200, 300, 500, 1000, 1500],
        labels=tenths,
    )

    # dollar per square foot-land
    df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt / df.lotsizesquarefeet
    df['lot_dollar_sqft_bin'] = _bin_to_float(
        df.land_dollar_per_sqft,
        bins=[0, 1, 5, 20, 50, 100, 250, 500, 1000, 1500, 2000],
        labels=tenths,
    )

    # ratio of bathrooms to bedrooms
    df['bath_bed_ratio'] = df.bathroomcnt / df.bedroomcnt

    # 12447 is the ID for city of LA.
    # I confirmed through sampling and plotting, as well as looking up a few addresses.
    # Vectorised comparison; a missing regionidcity compares unequal and yields 0.
    df['cola'] = (df['regionidcity'] == 12447.0).astype('int64')

    return df

In [None]:
# Add the engineered columns, then preview the first rows transposed so
# every column is visible.
df = df.pipe(create_features)
df.head().transpose()

### Remove outliers

In [None]:
def remove_outliers(df):
    '''
    Keep only the rows that pass every outlier check:
    bathrooms and bedrooms in (0, 7], zip below 100000, under 20 acres,
    under 10000 finished square feet, and a tax rate below 10.
    '''
    baths_ok = (df.bathroomcnt > 0) & (df.bathroomcnt <= 7)
    beds_ok = (df.bedroomcnt > 0) & (df.bedroomcnt <= 7)
    zip_ok = df.regionidzip < 100000
    lot_ok = df.acres < 20
    size_ok = df.calculatedfinishedsquarefeet < 10000
    tax_ok = df.taxrate < 10

    keep = baths_ok & beds_ok & zip_ok & lot_ok & size_ok & tax_ok
    return df[keep]

In [None]:
# Drop the rows that fail the outlier checks.
df = df.pipe(remove_outliers)


## Clustering