In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [2]:
import wrangle

In [3]:
# Read train_zillow.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('train_zillow.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,in_los_angeles,in_orange_county,in_ventura,house_age,tax_rate,acres,level_of_log_error,square_feet_bins,lot_sqft_bins,acre_bins
0,12397645,261.0,1,2.0,3.0,6.0,1139.0,6037.0,0.0,2.0,...,1,0,0,70.0,77.088243,0.115886,Accurate,1k-1.5k,0-10k,0-15
1,13899643,261.0,0,1.0,2.0,6.0,882.0,6059.0,0.0,1.0,...,0,1,0,65.0,99.593289,0.094582,Accurate,500-1k,0-10k,0-15
2,10936804,261.0,1,1.0,2.0,4.0,943.0,6037.0,0.0,1.0,...,1,0,0,81.0,91.300853,0.163522,Accurate,500-1k,0-10k,0-15
3,12877473,261.0,1,3.0,5.0,6.0,1449.0,6037.0,0.0,3.0,...,1,0,0,51.0,77.842486,0.141919,Over,1k-1.5k,0-10k,0-15
4,11190920,261.0,1,2.0,4.0,8.0,1546.0,6037.0,0.0,2.0,...,1,0,0,29.0,51.992135,0.195294,Accurate,1.5l-2k,0-10k,0-15


In [5]:
# Split the loaded frame into train / validate / test.
# The split logic lives in the project-local `wrangle` module, which is not shown in this notebook.
train, validate, test = wrangle.split_zillow(df)

In [6]:
# Produce scaled counterparts of the three splits via the project-local helper.
# NOTE(review): the clustering cells further down fit on the unscaled `train[cluster_cols]`,
# not on these scaled frames -- confirm that is intended.
train_scaled, validate_scaled, test_scaled = wrangle.scale_my_data(train, validate, test)

In [7]:
# Preview the first rows of the scaled training split.
train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.285714,0.333333,0.636364,0.23009,0.166667,0.54576,0.644968,0.003157,1.0,0.022785,...,0.000132,0.0,0.0,0.140431,0.209431,0.10225,0.119913,0.226277,0.05637,0.003157
1,0.428571,0.444444,0.454545,0.261169,0.333333,0.639556,0.119151,0.001153,0.166667,0.079,...,0.003625,0.5,0.0,0.287354,0.467722,0.25198,0.234834,0.423358,0.064811,0.001153
2,0.428571,0.222222,0.636364,0.164224,0.333333,0.585095,0.472813,0.020891,1.0,0.022785,...,0.001258,0.0,0.0,0.150284,0.215873,0.100546,0.125676,0.262774,0.055463,0.020891
3,0.285714,0.333333,0.636364,0.319442,0.166667,0.626845,0.472787,0.001647,0.166667,0.022785,...,0.001232,0.0,0.0,0.243873,0.330633,0.142199,0.194214,0.29927,0.055211,0.001647
4,0.285714,0.444444,0.454545,0.25075,0.166667,0.323683,0.846541,0.001186,0.166667,0.075715,...,0.00356,0.666667,0.0,0.116168,0.380583,0.308323,0.198861,0.386861,0.062167,0.001186


In [8]:
# K-means estimator with 5 clusters; random_state is fixed so repeated runs give the same clusters.
kmeans = KMeans(n_clusters=5, random_state=123)

In [9]:
# Feature columns to cluster on. This module-level list is also read by predict_cluster().
cluster_cols = ['quality', 'house_age', 'room_count']

In [10]:
# Fit on train only to learn the centroids; validate and test are only assigned to them afterwards.
kmeans = kmeans.fit(train[cluster_cols])

In [11]:
# Store each train row's predicted cluster id in a new column.
train['quality_houseage_roomcount_cluster'] = kmeans.predict(train[cluster_cols])

In [12]:
# Label validate and test using the centroids that were learned from train.
for _split in (validate, test):
    _split['quality_houseage_roomcount_cluster'] = kmeans.predict(_split[cluster_cols])

In [13]:
def predict_cluster(some_dataframe):
    """
    Add the predicted cluster id to a frame.

    Reads the module-level ``kmeans`` estimator and ``cluster_cols`` list,
    predicts a cluster id for every row, and writes the ids into the
    ``quality_houseage_roomcount_cluster`` column of the frame that was
    passed in (the input is modified in place and also returned).
    """
    features = some_dataframe[cluster_cols]
    cluster_ids = kmeans.predict(features)
    some_dataframe['quality_houseage_roomcount_cluster'] = cluster_ids
    return some_dataframe

In [14]:
def get_dummy_quality_houseage_roomcount_cluster(some_dataframe):
    """
    One-hot encode ``quality_houseage_roomcount_cluster`` and keep three of the
    five indicator columns.

    Parameters
    ----------
    some_dataframe : pandas.DataFrame
        Must contain a ``quality_houseage_roomcount_cluster`` column of
        integer cluster ids 0-4. Not every id has to be present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the cluster-id column
        and with the indicator columns 'house quality = 0',
        'Newer Homes High Quality' and 'Older Homes High Quality' appended.

    Raises
    ------
    KeyError
        If ``quality_houseage_roomcount_cluster`` is missing.
    """
    cluster_col = 'quality_houseage_roomcount_cluster'
    # Position i names cluster id i.
    # NOTE(review): the names describe one particular fit; re-check them if the
    # estimator, its seed, or the training data changes.
    cluster_labels = ['house quality = 0',
                      'Older homes low quality',
                      'Younger homes avg. quality',
                      'Newer Homes High Quality',
                      'Older Homes High Quality']
    dropped_labels = ['Older homes low quality',
                      'Younger homes avg. quality']

    # Pin the category set to every possible id so get_dummies always yields
    # one column per label, even when a frame happens to contain no rows for
    # some cluster. Without this the positional rename fails with a length
    # mismatch.
    all_ids = pd.CategoricalDtype(categories=list(range(len(cluster_labels))))
    dummy_df = pd.get_dummies(some_dataframe[cluster_col].astype(all_ids))
    dummy_df.columns = cluster_labels

    # Discard indicator columns left over from an earlier call so that
    # re-running the cell does not create duplicate column names.
    some_dataframe = some_dataframe.drop(columns=cluster_labels, errors='ignore')
    some_dataframe = pd.concat([some_dataframe, dummy_df], axis=1)
    some_dataframe = some_dataframe.drop(dropped_labels + [cluster_col], axis=1)
    return some_dataframe

In [15]:
def prep_quality_houseage_roomcount_clusters(some_dataframe):
    """
    Run the two cluster-feature steps in order and return the result:
    first predict_cluster() writes the cluster id column, then
    get_dummy_quality_houseage_roomcount_cluster() turns it into indicator
    columns.
    """
    return get_dummy_quality_houseage_roomcount_cluster(
        predict_cluster(some_dataframe)
    )

In [16]:
# Rebuild train with the cluster indicator columns: the helper re-predicts the
# cluster ids and then one-hot encodes them.
train = prep_quality_houseage_roomcount_clusters(train)

In [17]:
# Preview train to check the newly appended indicator columns.
train.head()

Unnamed: 0,parcelid,land_type,has_heating_system,bathrooms,bedrooms,quality,square_feet,fips,has_fireplace,full_bathrooms,...,house_age,tax_rate,acres,level_of_log_error,square_feet_bins,lot_sqft_bins,acre_bins,house quality = 0,Newer Homes High Quality,Older Homes High Quality
18796,12086611,266.0,1,2.0,3.0,8.0,1543.0,6037.0,0.0,2.0,...,37.0,81.643225,0.51056,Accurate,1.5l-2k,20k-30k,0-15,1,0,0
3518,17086463,261.0,0,3.0,4.0,6.0,1719.0,6111.0,1.0,3.0,...,64.0,93.685084,0.189991,Accurate,1.5l-2k,0-10k,0-15,0,0,1
15170,10764495,266.0,1,3.0,2.0,8.0,1170.0,6037.0,0.0,3.0,...,42.0,80.3492,3.3486,Accurate,1k-1.5k,70+,0-15,1,0,0
13451,11080027,261.0,1,2.0,3.0,8.0,2049.0,6037.0,0.0,2.0,...,47.0,79.989777,0.268939,Accurate,2k-2.5k,10k-20k,0-15,0,0,0
28664,14225001,261.0,0,2.0,4.0,6.0,1660.0,6059.0,1.0,2.0,...,59.0,89.914255,0.195133,Accurate,1.5l-2k,0-10k,0-15,0,0,0


In [19]:
# Feature columns to cluster on; the fit and every predict below read this list.
cluster_cols = ['quality', 'house_age', 'room_count']

# Build the 5-cluster estimator (fixed seed) and learn the centroids from train only.
kmeans = KMeans(n_clusters=5, random_state=123).fit(train[cluster_cols])

# Label every split with the cluster ids from the train-fitted centroids.
for _split in (train, validate, test):
    _split['quality_houseage_roomcount_cluster'] = kmeans.predict(_split[cluster_cols])

def predict_cluster(some_dataframe):
    """
    Add the predicted cluster id to a frame.

    Reads the module-level ``kmeans`` estimator and ``cluster_cols`` list,
    predicts a cluster id for every row, and writes the ids into the
    ``quality_houseage_roomcount_cluster`` column of the frame that was
    passed in (the input is modified in place and also returned).
    """
    features = some_dataframe[cluster_cols]
    cluster_ids = kmeans.predict(features)
    some_dataframe['quality_houseage_roomcount_cluster'] = cluster_ids
    return some_dataframe

def get_dummy_quality_houseage_roomcount_cluster(some_dataframe):
    """
    One-hot encode ``quality_houseage_roomcount_cluster`` and keep three of the
    five indicator columns.

    Parameters
    ----------
    some_dataframe : pandas.DataFrame
        Must contain a ``quality_houseage_roomcount_cluster`` column of
        integer cluster ids 0-4. Not every id has to be present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the cluster-id column
        and with the indicator columns 'house quality = 0',
        'Newer Homes High Quality' and 'Older Homes High Quality' appended.

    Raises
    ------
    KeyError
        If ``quality_houseage_roomcount_cluster`` is missing.
    """
    cluster_col = 'quality_houseage_roomcount_cluster'
    # Position i names cluster id i.
    # NOTE(review): the names describe one particular fit; re-check them if the
    # estimator, its seed, or the training data changes.
    cluster_labels = ['house quality = 0',
                      'Older homes low quality',
                      'Younger homes avg. quality',
                      'Newer Homes High Quality',
                      'Older Homes High Quality']
    dropped_labels = ['Older homes low quality',
                      'Younger homes avg. quality']

    # Pin the category set to every possible id so get_dummies always yields
    # one column per label, even when a frame happens to contain no rows for
    # some cluster. Without this the positional rename fails with a length
    # mismatch.
    all_ids = pd.CategoricalDtype(categories=list(range(len(cluster_labels))))
    dummy_df = pd.get_dummies(some_dataframe[cluster_col].astype(all_ids))
    dummy_df.columns = cluster_labels

    # Discard indicator columns left over from an earlier call so that
    # re-running the cell does not create duplicate column names.
    some_dataframe = some_dataframe.drop(columns=cluster_labels, errors='ignore')
    some_dataframe = pd.concat([some_dataframe, dummy_df], axis=1)
    some_dataframe = some_dataframe.drop(dropped_labels + [cluster_col], axis=1)
    return some_dataframe

def prep_quality_houseage_roomcount_clusters(some_dataframe):
    """
    Run the two cluster-feature steps in order and return the result:
    first predict_cluster() writes the cluster id column, then
    get_dummy_quality_houseage_roomcount_cluster() turns it into indicator
    columns.
    """
    return get_dummy_quality_houseage_roomcount_cluster(
        predict_cluster(some_dataframe)
    )

In [20]:
# NOTE(review): `train` was already passed through this helper in an earlier cell, so it
# still carries the indicator columns from that run. Applying the helper again concatenates
# a second set of same-named columns unless the dummy helper removes the old ones first.
train = prep_quality_houseage_roomcount_clusters(train)