In [1]:
import warnings
warnings.filterwarnings("ignore")

import env
import Liam_wrangle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import viz_kmeans
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats

import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# load train_zillow.csv from the working directory
# NOTE(review): the frame previews further down show an 'Unnamed: 0' column, which
# usually means the CSV was written with its index; if so, index_col=0 would keep
# it from being carried along as a feature -- confirm against the file.
df = pd.read_csv('train_zillow.csv')

In [3]:
def split_zillow(df, test_size=.2, validate_size=.3, random_state=1234):
    '''Split a dataframe into train, validate and test sets.

    The data is split twice with train_test_split: first `test_size` of the
    rows is held out as test, then `validate_size` of the remaining rows is
    held out as validate. With the defaults this gives roughly
    56% train / 24% validate / 20% test.

    Parameters
    ----------
    df : the dataframe to split
    test_size : fraction of `df` held out as test (default .2)
    validate_size : fraction of the non-test rows held out as validate (default .3)
    random_state : seed passed to both splits so the result is repeatable (default 1234)

    Returns
    -------
    train, validate, test : three pandas dataframes
    '''
    # hold out the test set first
    train_validate, test = train_test_split(df, test_size=test_size, random_state=random_state)
    # then carve validate out of what is left
    train, validate = train_test_split(train_validate, test_size=validate_size, random_state=random_state)
    return train, validate, test

In [4]:
# 20% of the rows go to test, then 30% of the remainder to validate
# (roughly 56% train / 24% validate / 20% test)
train, validate, test = split_zillow(df)

In [5]:
def scale_my_data(train, validate, test, scale_columns=None):
    '''Min-max scale selected columns of train, validate and test.

    The scaler is fit on train only and then applied to all three splits, so
    validate and test have no influence on the scaling.

    Parameters
    ----------
    train, validate, test : dataframes containing every column in `scale_columns`
    scale_columns : optional list of column names to scale; defaults to the
        feature list below

    Returns
    -------
    train_scaled, validate_scaled, test_scaled : dataframes holding only the
        scaled columns, labelled with the original column names and the row
        index of the dataframe they were computed from
    '''
    if scale_columns is None:
        scale_columns = ['bathrooms', 'bedrooms', 'quality',
                         'square_feet', 'full_bathrooms',
                         'latitude', 'longitude', 'lot_square_feet',
                         'land_type', 'city', 'county', 'zip_code',
                         'room_count', 'unitcnt', 'structure_tax_value',
                         'tax_value', 'land_tax_value', 'taxamount',
                         'house_age', 'tax_rate', 'acres']
    scale_columns = list(scale_columns)

    scaler = MinMaxScaler()
    scaler.fit(train[scale_columns])

    def _scale(frame):
        # keep the column names and the row index: a bare pd.DataFrame(array)
        # relabels both as 0..n-1, and the scaled rows can then no longer be
        # matched back to the rows they came from
        return pd.DataFrame(scaler.transform(frame[scale_columns]),
                            columns=scale_columns, index=frame.index)

    return _scale(train), _scale(validate), _scale(test)

In [6]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [7]:
# min-max scaled versions of the three splits (scaler fit on train only)
# NOTE(review): none of the cells shown below read these scaled frames; the
# KMeans models are fit on the unscaled train/validate/test -- confirm that is intended.
train_scaled, validate_scaled, test_scaled = scale_my_data(train, validate, test)

In [8]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [9]:
# first model: 5 clusters, fixed random_state so reruns give the same result
kmeans = KMeans(n_clusters=5, random_state=123)

In [10]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [11]:
# columns the first KMeans model is fit on
# (predict_cluster reads this global when it is called, and later cells reassign it)
cluster_cols = ['longitude', 'latitude', 'house_age']

In [12]:
# clustering on train only, getting the centroids
# NOTE(review): the columns are passed in unscaled; KMeans is distance based, so
# the column with the largest numeric range will dominate the clusters --
# confirm the ranges of these three columns are comparable.
kmeans = kmeans.fit(train[cluster_cols])

In [13]:
# identifying clusters in train
# the raw id column is only used by the value_counts cell below;
# prep_longitude_latitude_houseage_clusters recomputes it and then drops it
train['longitude_latitude_houseage_cluster'] = kmeans.predict(train[cluster_cols])

In [14]:
# identifying clusters in validate, test
# uses the centroids learned on train; the model is not refit here
validate['longitude_latitude_houseage_cluster'] = kmeans.predict(validate[cluster_cols])
test['longitude_latitude_houseage_cluster'] = kmeans.predict(test[cluster_cols])

In [15]:
def predict_cluster(some_dataframe):
    '''Add the 'longitude_latitude_houseage_cluster' id column to a dataframe.

    Reads the module-level `kmeans` model and `cluster_cols` list at the time
    of the call, so it uses whatever those names currently hold. The column is
    written onto `some_dataframe` itself, and that same object is returned.
    '''
    features = some_dataframe[cluster_cols]
    labels = kmeans.predict(features)
    some_dataframe['longitude_latitude_houseage_cluster'] = labels
    return some_dataframe

In [16]:
# number of train rows assigned to each longitude/latitude/house_age cluster
train.longitude_latitude_houseage_cluster.value_counts()

1    6563
2    5075
4    4568
3    1404
0    1120
Name: longitude_latitude_houseage_cluster, dtype: int64

In [17]:
# number of validate rows assigned to each longitude/latitude/house_age cluster
validate.longitude_latitude_houseage_cluster.value_counts()

1    2843
2    2131
4    1951
3     593
0     510
Name: longitude_latitude_houseage_cluster, dtype: int64

In [18]:
# number of test rows assigned to each longitude/latitude/house_age cluster
test.longitude_latitude_houseage_cluster.value_counts()

1    2305
2    1796
4    1654
3     514
0     421
Name: longitude_latitude_houseage_cluster, dtype: int64

In [19]:
def get_dummy_longitude_latitude_houseage_cluster(some_dataframe):
    '''Replace the 'longitude_latitude_houseage_cluster' id with indicator columns.

    One indicator column is built for each cluster id 0-4 and renamed with the
    label listed below. Only 'Ventura' and 'North LA' are kept; the other
    three indicators and the id column itself are dropped.

    Returns a new dataframe; the one passed in is not modified.
    '''
    # label given to each cluster id, in id order (0, 1, 2, 3, 4)
    cluster_names = ['Ventura', 'Orange County',
                     'North downtown LA', 'East downtown LA',
                     'North LA']
    # Declare every cluster id as a category so get_dummies always produces
    # one column per id, in id order. Without this, a dataframe that has no
    # rows for some cluster produces fewer columns and the renaming below
    # fails with a length mismatch.
    all_ids = pd.CategoricalDtype(categories=list(range(len(cluster_names))))
    dummy_df = pd.get_dummies(
        some_dataframe['longitude_latitude_houseage_cluster'].astype(all_ids))
    dummy_df.columns = cluster_names
    some_dataframe = pd.concat([some_dataframe, dummy_df], axis=1)
    some_dataframe = some_dataframe.drop(['Orange County', 'East downtown LA',
                    'North downtown LA', 'longitude_latitude_houseage_cluster'], axis=1)
    return some_dataframe

In [20]:
def prep_longitude_latitude_houseage_clusters(some_dataframe):
    '''Assign cluster ids to a dataframe, then swap them for dummy columns.

    Runs predict_cluster followed by
    get_dummy_longitude_latitude_houseage_cluster and returns the result.
    predict_cluster is looked up by name when this function runs, and this
    notebook defines it more than once, so call this before a later cell
    redefines predict_cluster.
    '''
    return get_dummy_longitude_latitude_houseage_cluster(
        predict_cluster(some_dataframe)
    )

In [21]:
# swap the cluster id on train for its 'Ventura' / 'North LA' dummy columns
train = prep_longitude_latitude_houseage_clusters(train)

In [22]:
# same transformation for validate
validate = prep_longitude_latitude_houseage_clusters(validate)

In [23]:
# same transformation for test
test = prep_longitude_latitude_houseage_clusters(test)

In [24]:
# check that 'Ventura' and 'North LA' now appear as the last columns of train
train.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,in_ventura,house_age,tax_rate,acres,level_of_log_error,square_feet_bins,lot_sqft_bins,acre_bins,Ventura,North LA
18796,12086611,266.0,1,2.0,3.0,8.0,1543.0,6037.0,0.0,2.0,...,0,37.0,81.643225,0.51056,Accurate,1.5l-2k,20k-30k,0-15,0,0
3518,17086463,261.0,0,3.0,4.0,6.0,1719.0,6111.0,1.0,3.0,...,1,64.0,93.685084,0.189991,Accurate,1.5l-2k,0-10k,0-15,0,0
15170,10764495,266.0,1,3.0,2.0,8.0,1170.0,6037.0,0.0,3.0,...,0,42.0,80.3492,3.3486,Accurate,1k-1.5k,70+,0-15,0,1
13451,11080027,261.0,1,2.0,3.0,8.0,2049.0,6037.0,0.0,2.0,...,0,47.0,79.989777,0.268939,Accurate,2k-2.5k,10k-20k,0-15,0,1
28664,14225001,261.0,0,2.0,4.0,6.0,1660.0,6059.0,1.0,2.0,...,0,59.0,89.914255,0.195133,Accurate,1.5l-2k,0-10k,0-15,0,0


In [25]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [26]:
# second model: 6 clusters; this rebinds the global `kmeans` used by predict_cluster
kmeans = KMeans(n_clusters=6, random_state=123)

In [27]:
# columns the second KMeans model is fit on (rebinds the global `cluster_cols`)
cluster_cols = ['structure_tax_value', 'land_tax_value']

In [28]:
# clustering on train only, getting the centroids
kmeans = kmeans.fit(train[cluster_cols])

In [29]:
# identifying clusters in train
# the raw id column is only used by the value_counts cell below;
# prep_taxes_clusters recomputes it and then drops it
train['taxes_cluster'] = kmeans.predict(train[cluster_cols])

In [30]:
# identifying clusters in validate, test
# uses the centroids learned on train; the model is not refit here
validate['taxes_cluster'] = kmeans.predict(validate[cluster_cols])
test['taxes_cluster'] = kmeans.predict(test[cluster_cols])

In [31]:
def predict_cluster(some_dataframe):
    '''Add the 'taxes_cluster' id column to a dataframe.

    Reads the module-level `kmeans` model and `cluster_cols` list at the time
    of the call, so it uses whatever those names currently hold. The column is
    written onto `some_dataframe` itself, and that same object is returned.
    '''
    features = some_dataframe[cluster_cols]
    labels = kmeans.predict(features)
    some_dataframe['taxes_cluster'] = labels
    return some_dataframe

In [32]:
# number of train rows assigned to each structure/land tax value cluster
train.taxes_cluster.value_counts()

0    6493
2    4049
1    3029
4    2825
3    1303
5    1031
Name: taxes_cluster, dtype: int64

In [33]:
# number of validate rows assigned to each structure/land tax value cluster
validate.taxes_cluster.value_counts()

0    2786
2    1731
1    1266
4    1183
3     610
5     452
Name: taxes_cluster, dtype: int64

In [34]:
# number of test rows assigned to each structure/land tax value cluster
test.taxes_cluster.value_counts()

0    2350
2    1434
1    1096
4     972
3     448
5     390
Name: taxes_cluster, dtype: int64

In [35]:
def get_dummy_taxes_cluster(some_dataframe):
    '''Replace the 'taxes_cluster' id with indicator columns.

    One indicator column is built for each cluster id 0-5 and renamed with the
    label listed below. Only 'low_structure_and_land_tax' and
    'medium_structure_low_land_tax' are kept; the four 'drop*' indicators and
    the id column itself are dropped.

    Returns a new dataframe; the one passed in is not modified.
    '''
    # label given to each cluster id, in id order (0, 1, 2, 3, 4, 5)
    cluster_names = ['low_structure_and_land_tax', 'drop1',
                     'drop2', 'medium_structure_low_land_tax',
                     'drop4', 'drop5']
    # Declare every cluster id as a category so get_dummies always produces
    # one column per id, in id order. Without this, a dataframe that has no
    # rows for some cluster produces fewer columns and the renaming below
    # fails with a length mismatch.
    all_ids = pd.CategoricalDtype(categories=list(range(len(cluster_names))))
    dummy_df = pd.get_dummies(some_dataframe['taxes_cluster'].astype(all_ids))
    dummy_df.columns = cluster_names
    some_dataframe = pd.concat([some_dataframe, dummy_df], axis=1)
    some_dataframe = some_dataframe.drop(['drop1', 'drop2', 'drop4', 'drop5', 'taxes_cluster'], axis=1)
    return some_dataframe

In [36]:
def prep_taxes_clusters(some_dataframe):
    '''Assign tax cluster ids to a dataframe, then swap them for dummy columns.

    Runs predict_cluster followed by get_dummy_taxes_cluster and returns the
    result. predict_cluster is looked up by name when this function runs, and
    this notebook defines it more than once, so call this while the
    'taxes_cluster' version is the current definition.
    '''
    return get_dummy_taxes_cluster(predict_cluster(some_dataframe))

In [37]:
# swap the tax cluster id on train for its two kept dummy columns
train = prep_taxes_clusters(train)

In [38]:
# same transformation for validate
validate = prep_taxes_clusters(validate)

In [39]:
# same transformation for test
test = prep_taxes_clusters(test)

In [40]:
# check that the two tax dummy columns now appear at the end of train
train.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,tax_rate,acres,level_of_log_error,square_feet_bins,lot_sqft_bins,acre_bins,Ventura,North LA,low_structure_and_land_tax,medium_structure_low_land_tax
18796,12086611,266.0,1,2.0,3.0,8.0,1543.0,6037.0,0.0,2.0,...,81.643225,0.51056,Accurate,1.5l-2k,20k-30k,0-15,0,0,1,0
3518,17086463,261.0,0,3.0,4.0,6.0,1719.0,6111.0,1.0,3.0,...,93.685084,0.189991,Accurate,1.5l-2k,0-10k,0-15,0,0,0,0
15170,10764495,266.0,1,3.0,2.0,8.0,1170.0,6037.0,0.0,3.0,...,80.3492,3.3486,Accurate,1k-1.5k,70+,0-15,0,1,1,0
13451,11080027,261.0,1,2.0,3.0,8.0,2049.0,6037.0,0.0,2.0,...,79.989777,0.268939,Accurate,2k-2.5k,10k-20k,0-15,0,1,0,0
28664,14225001,261.0,0,2.0,4.0,6.0,1660.0,6059.0,1.0,2.0,...,89.914255,0.195133,Accurate,1.5l-2k,0-10k,0-15,0,0,0,0


In [41]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [42]:
# third model: 5 clusters; this rebinds the global `kmeans` used by predict_cluster
kmeans = KMeans(n_clusters=5, random_state=123)

In [43]:
# columns the third KMeans model is fit on (rebinds the global `cluster_cols`)
cluster_cols = ['quality', 'house_age', 'room_count']

In [44]:
# clustering on train only, getting the centroids
# NOTE(review): the columns are passed in unscaled; KMeans is distance based, so
# the column with the largest numeric range will dominate the clusters --
# confirm the ranges of these three columns are comparable.
kmeans = kmeans.fit(train[cluster_cols])

In [45]:
# identifying clusters in train
# the raw id column is only used by the value_counts cell below;
# prep_quality_houseage_roomcount_clusters recomputes it and then drops it
train['quality_houseage_roomcount_cluster'] = kmeans.predict(train[cluster_cols])

In [46]:
# identifying clusters in validate, test
# uses the centroids learned on train; the model is not refit here
validate['quality_houseage_roomcount_cluster'] = kmeans.predict(validate[cluster_cols])
test['quality_houseage_roomcount_cluster'] = kmeans.predict(test[cluster_cols])

In [47]:
def predict_cluster(some_dataframe):
    '''Add the 'quality_houseage_roomcount_cluster' id column to a dataframe.

    Reads the module-level `kmeans` model and `cluster_cols` list at the time
    of the call, so it uses whatever those names currently hold. The column is
    written onto `some_dataframe` itself, and that same object is returned.
    '''
    features = some_dataframe[cluster_cols]
    labels = kmeans.predict(features)
    some_dataframe['quality_houseage_roomcount_cluster'] = labels
    return some_dataframe

In [48]:
# number of train rows assigned to each quality/house_age/room_count cluster
train.quality_houseage_roomcount_cluster.value_counts()

4    5690
2    4609
0    4563
3    2131
1    1737
Name: quality_houseage_roomcount_cluster, dtype: int64

In [49]:
# number of validate rows assigned to each quality/house_age/room_count cluster
validate.quality_houseage_roomcount_cluster.value_counts()

4    2374
0    2004
2    1949
3     968
1     733
Name: quality_houseage_roomcount_cluster, dtype: int64

In [50]:
# number of test rows assigned to each quality/house_age/room_count cluster
test.quality_houseage_roomcount_cluster.value_counts()

4    2062
0    1665
2    1550
3     782
1     631
Name: quality_houseage_roomcount_cluster, dtype: int64

In [51]:
def get_dummy_quality_houseage_roomcount_cluster(some_dataframe):
    '''Replace the 'quality_houseage_roomcount_cluster' id with indicator columns.

    One indicator column is built for each cluster id 0-4 and renamed with the
    label listed below. 'house quality = 0', 'Newer Homes High Quality' and
    'Older Homes High Quality' are kept; the other two indicators and the id
    column itself are dropped.

    Returns a new dataframe; the one passed in is not modified.
    '''
    # label given to each cluster id, in id order (0, 1, 2, 3, 4)
    cluster_names = ['house quality = 0',
                     'Older homes low quality',
                     'Younger homes avg. quality',
                     'Newer Homes High Quality',
                     'Older Homes High Quality']
    # Declare every cluster id as a category so get_dummies always produces
    # one column per id, in id order. Without this, a dataframe that has no
    # rows for some cluster produces fewer columns and the renaming below
    # fails with a length mismatch.
    all_ids = pd.CategoricalDtype(categories=list(range(len(cluster_names))))
    dummy_df = pd.get_dummies(
        some_dataframe['quality_houseage_roomcount_cluster'].astype(all_ids))
    dummy_df.columns = cluster_names
    some_dataframe = pd.concat([some_dataframe, dummy_df], axis=1)
    some_dataframe = some_dataframe.drop(['Older homes low quality',
                    'Younger homes avg. quality',
                    'quality_houseage_roomcount_cluster'], axis=1)
    return some_dataframe

In [52]:
def prep_quality_houseage_roomcount_clusters(some_dataframe):
    '''Assign quality/age/room cluster ids, then swap them for dummy columns.

    Runs predict_cluster followed by
    get_dummy_quality_houseage_roomcount_cluster and returns the result.
    predict_cluster is looked up by name when this function runs, and this
    notebook defines it more than once, so call this while the
    'quality_houseage_roomcount_cluster' version is the current definition.
    '''
    return get_dummy_quality_houseage_roomcount_cluster(
        predict_cluster(some_dataframe)
    )

In [53]:
# swap the quality/age/room cluster id on train for its three kept dummy columns
train = prep_quality_houseage_roomcount_clusters(train)

In [54]:
# same transformation for validate
validate = prep_quality_houseage_roomcount_clusters(validate)

In [55]:
# same transformation for test
test = prep_quality_houseage_roomcount_clusters(test)

In [56]:
# final check: train should now end with all seven cluster dummy columns
train.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,square_feet_bins,lot_sqft_bins,acre_bins,Ventura,North LA,low_structure_and_land_tax,medium_structure_low_land_tax,house quality = 0,Newer Homes High Quality,Older Homes High Quality
18796,12086611,266.0,1,2.0,3.0,8.0,1543.0,6037.0,0.0,2.0,...,1.5l-2k,20k-30k,0-15,0,0,1,0,1,0,0
3518,17086463,261.0,0,3.0,4.0,6.0,1719.0,6111.0,1.0,3.0,...,1.5l-2k,0-10k,0-15,0,0,0,0,0,0,1
15170,10764495,266.0,1,3.0,2.0,8.0,1170.0,6037.0,0.0,3.0,...,1k-1.5k,70+,0-15,0,1,1,0,1,0,0
13451,11080027,261.0,1,2.0,3.0,8.0,2049.0,6037.0,0.0,2.0,...,2k-2.5k,10k-20k,0-15,0,1,0,0,0,0,0
28664,14225001,261.0,0,2.0,4.0,6.0,1660.0,6059.0,1.0,2.0,...,1.5l-2k,0-10k,0-15,0,0,0,0,0,0,0


In [57]:
# validate should carry the same cluster dummy columns as train
validate.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,square_feet_bins,lot_sqft_bins,acre_bins,Ventura,North LA,low_structure_and_land_tax,medium_structure_low_land_tax,house quality = 0,Newer Homes High Quality,Older Homes High Quality
23193,13059280,261.0,1,3.0,4.0,8.0,2155.0,6037.0,0.0,3.0,...,2k-2.5k,20k-30k,0-15,0,0,0,1,1,0,0
3699,12282400,261.0,0,2.0,3.0,4.0,1487.0,6037.0,0.0,2.0,...,1k-1.5k,0-10k,0-15,0,0,1,0,0,0,0
11406,11872875,261.0,1,2.0,3.0,6.0,1279.0,6037.0,0.0,2.0,...,1k-1.5k,0-10k,0-15,0,0,0,0,0,0,1
16924,14239107,261.0,0,1.5,2.0,6.0,1167.0,6059.0,0.0,1.0,...,1k-1.5k,0-10k,0-15,0,0,0,0,1,0,0
31134,14122618,261.0,0,1.5,4.0,6.0,1314.0,6059.0,0.0,1.0,...,1k-1.5k,0-10k,0-15,0,0,1,0,0,0,1


In [58]:
# test should carry the same cluster dummy columns as train
test.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,square_feet_bins,lot_sqft_bins,acre_bins,Ventura,North LA,low_structure_and_land_tax,medium_structure_low_land_tax,house quality = 0,Newer Homes High Quality,Older Homes High Quality
15693,10926937,261.0,1,4.0,5.0,6.0,2336.0,6037.0,0.0,4.0,...,2k-2.5k,0-10k,0-15,0,1,0,0,0,0,0
8989,10830570,261.0,1,2.0,4.0,6.0,1674.0,6037.0,0.0,2.0,...,1.5l-2k,10k-20k,0-15,0,1,0,0,0,0,0
29206,11850232,261.0,1,2.0,4.0,4.0,1974.0,6037.0,0.0,2.0,...,1.5l-2k,0-10k,0-15,0,0,0,0,0,0,1
22314,11300905,263.0,0,2.0,3.0,8.0,1926.0,6037.0,0.0,2.0,...,1.5l-2k,0-10k,0-15,1,0,1,0,0,1,0
10255,12136656,261.0,1,2.0,3.0,6.0,1531.0,6037.0,0.0,2.0,...,1.5l-2k,0-10k,0-15,0,0,1,0,0,0,1
