In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local helper modules used below for loading and preparing the data.
import acquire
import prepare

# Let pandas display up to 100 rows / 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# NOTE(review): this silences every Python warning from this point on,
# including any emitted while the sklearn / scipy imports below are loaded.
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans

import scipy.stats as stats

# Seed read as a notebook-level global by the KMeans fits in add_clusters.
random_state = 42

In [2]:
# Load the Zillow data through the project's acquire module; on the recorded
# run it reported reading from a local CSV.
df = acquire.zillow_data()

Reading from local CSV...


In [3]:
# Prepare the raw frame and split it into train / validate / test via the
# project's prepare module (the split call prints the size of each sample).
# NOTE(review): `df` is rebound to its prepared version here, so re-running
# this cell without re-running the load cell prepares already-prepared data.
df = prepare.prep_zillow(df)
train, validate, test = prepare.train_validate_test_split(df)
# Name of the target column; passed to the scaling / encoding helpers and
# excluded from outlier removal in the following cells.
target = 'logerror'

train	 n = 29001
test	 n = 10358
validate n = 12429


In [4]:
# Columns screened for outliers: every non-object column of the prepared frame,
# except the target and the raw coordinates.
outlier_columns = [
    name
    for name in df.columns
    if df[name].dtype != 'object' and name not in (target, 'latitude', 'longitude')
]
# The literal 3 is the cutoff handed to prepare.remove_outliers; its exact
# meaning is defined in the prepare module (not visible here) -- TODO confirm.
train, validate, test = prepare.remove_outliers(
    train, validate, test, 3, outlier_columns
)

train	 n = 15874
test	 n = 5731
validate n = 6750


In [5]:
# Scale, then encode, the three splits with the project's prepare helpers.
# Presumably scale_zillow is what adds the `scaled_*` columns that the
# clustering code relies on -- confirm in the prepare module.
train, validate, test = prepare.scale_zillow(train, validate, test, target)
train, validate, test = prepare.encode_zillow(train, validate, test, target)

#### Adding cluster features

In [6]:
def add_clusters(train, validate, test):
    """Add KMeans cluster-membership columns to the train, validate and test splits.

    Every clustering is fit on ``train`` only and then used to predict cluster
    membership for all three splits, so validate and test never influence the
    centroids. The new columns are written directly onto the DataFrames that
    were passed in (they are modified in place), and the same three objects
    are returned for convenient re-assignment.

    Columns added
    -------------
    cluster_BedBath
        3 clusters on scaled bedroom and bathroom counts, labelled
        'low' / 'mid' / 'high'.
    cluster_BedBathSqft
        3 clusters on scaled bedroom count, bathroom count and square footage,
        labelled 'low' / 'mid' / 'high'.
    cluster_LatLong
        4 clusters on scaled latitude and longitude, labelled
        'east' / 'central' / 'west' / 'north'.
    cluster_BedBathTaxvaluepersqft
        3 clusters on scaled bedroom count, bathroom count and tax value per
        square foot, kept as the raw integer cluster ids.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Splits that already contain every ``scaled_*`` column named in the
        spec table below.

    Returns
    -------
    tuple
        ``(train, validate, test)`` -- the same objects, with the cluster
        columns added.

    Notes
    -----
    Reads the notebook-level ``random_state`` seed and the ``KMeans`` import.
    """
    # NOTE: KMeans numbers its clusters arbitrarily, so the id -> label
    # dictionaries below are only meaningful for the specific fit they were
    # written against. Re-check them against the fitted centroids whenever the
    # data, the seed or the scikit-learn version changes.
    size_labels = {1: 'low', 0: 'mid', 2: 'high'}
    region_labels = {0: 'east', 1: 'central', 2: 'west', 3: 'north'}

    # (output column, feature columns, number of clusters, id -> label map or None)
    cluster_specs = [
        ('cluster_BedBath',
         ['scaled_bedroomcnt', 'scaled_bathroomcnt'],
         3, size_labels),
        ('cluster_BedBathSqft',
         ['scaled_bedroomcnt', 'scaled_bathroomcnt', 'scaled_sqft'],
         3, size_labels),
        ('cluster_LatLong',
         ['scaled_latitude', 'scaled_longitude'],
         4, region_labels),
        # Previously a misspelled assignment (`featyres`) left `features`
        # pointing at the latitude/longitude columns, so this column was
        # silently a second geographic clustering. Driving every clustering
        # from this table makes that kind of slip impossible.
        ('cluster_BedBathTaxvaluepersqft',
         ['scaled_bedroomcnt', 'scaled_bathroomcnt', 'scaled_taxvalue_per_sqft'],
         3, None),
    ]

    for column, features, n_clusters, labels in cluster_specs:
        # Fit on train only to avoid leaking information from validate/test.
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(train[features])

        for sample in [train, validate, test]:
            sample[column] = kmeans.predict(sample[features])
            if labels is not None:
                # Replace the integer cluster ids with readable labels.
                sample[column] = sample[column].map(labels)

    return train, validate, test

In [7]:
# NOTE(review): this calls add_clusters from the prepare module, not the
# add_clusters defined in the notebook cell above. The prepare implementation
# is not visible here -- confirm the two definitions agree, or keep only one.
train, validate, test = prepare.add_clusters(train, validate, test)

In [8]:
# Preview the first rows of every cluster column in the train split.
train.loc[:, [name for name in train.columns if 'cluster_' in name]].head()

Unnamed: 0,cluster_BedBath,cluster_BedBathSqft,cluster_LatLong,cluster_BedBathTaxvaluepersqft
23280,mid,mid,north,0
55542,mid,mid,west,2
7420,low,low,central,2
68942,high,high,west,2
67157,low,low,central,1


In [9]:
# Preview the same cluster columns (names taken from train) in the validate split.
validate.loc[:, [name for name in train.columns if 'cluster_' in name]].head()

Unnamed: 0,cluster_BedBath,cluster_BedBathSqft,cluster_LatLong,cluster_BedBathTaxvaluepersqft
71548,high,high,east,1
59584,low,low,central,1
41011,high,high,east,1
19700,high,high,north,0
33495,mid,mid,east,1


In [10]:
# Preview the same cluster columns (names taken from train) in the test split.
test.loc[:, [name for name in train.columns if 'cluster_' in name]].head()

Unnamed: 0,cluster_BedBath,cluster_BedBathSqft,cluster_LatLong,cluster_BedBathTaxvaluepersqft
46416,mid,mid,central,1
47163,mid,mid,west,2
39040,mid,mid,west,2
50404,low,low,central,1
15572,mid,mid,east,1
