In [33]:
# Standard library
import os

# Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans
from tslearn.metrics import dtw, cdist_dtw
from tslearn.preprocessing import TimeSeriesResampler
from tslearn.utils import to_time_series, to_time_series_dataset

# Methods/functions we created
import City_Preprocessing as zillowPro
import Gentrification_Control as gentPro
import Time_Series_KMeans_Utils as tsUtils

# Raw Zillow city time series data with all 81 features.
# The file location defaults to the original author's local path; set the
# ZILLOW_CITY_TS_CSV environment variable to run the notebook on another machine.
CITY_TS_CSV = os.environ.get(
    'ZILLOW_CITY_TS_CSV',
    '/Users/briankalinowski/PycharmProjects/CIS600/zecon/City_time_series.csv',
)
city_raw = pd.read_csv(CITY_TS_CSV)

In [None]:
# Housing measurement columns taken from the raw Zillow data
housing_values = [
    'Sale_Counts_Seas_Adj',
    'InventorySeasonallyAdjusted_AllHomes',
    'PctOfHomesDecreasingInValues_AllHomes',
    'PctOfHomesIncreasingInValues_AllHomes',
]

# Date and region-name columns followed by the same measurement columns
housing_features = ['Date', 'RegionName'] + housing_values

# Key columns handed to the preprocessing helper
lookup_ix = ['State', 'City']
full_ix = lookup_ix + ['Date']

city_housing_data = zillowPro.process_city_data(
    city_raw,
    housing_features,
    housing_values,
    lookup_ix,
    full_ix,
)

# Quick size check: frame shape and number of distinct State/City pairs
print('Housing Data Shape:', city_housing_data.shape)
n_city_groups = city_housing_data.groupby(['State', 'City']).ngroups
print('Unique State/City Groups:', n_city_groups)
city_housing_data.head(21)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [4]:
# `DataFrame.values` is a plain NumPy array, which has no `.isna()` method, so
# the null check must be run on the DataFrame itself.
# Boolean mask: True for every row that has at least one missing value.
rows_with_na = city_housing_data.isna().any(axis=1)
print('Any missing values:', rows_with_na.any())

# Show the rows that contain missing values
city_housing_data[rows_with_na]

Unnamed: 0,State,City,Date,AVG_Sale_Counts_Seas_Adj,AVG_InventorySeasonallyAdjusted_AllHomes,AVG_PctOfHomesDecreasingInValues_AllHomes,AVG_PctOfHomesIncreasingInValues_AllHomes
4163,NY,Springville,2007,,,,
5114,NY,Town of Hamburg,2007,,,,
5427,NY,Lancaster,2007,,,,
5428,NY,Lancaster,2008,,,,
5429,NY,Lancaster,2009,,,,
6097,NY,Evans,2007,,,,
8015,NY,West Seneca,2007,,,,
8016,NY,West Seneca,2008,,,,
8017,NY,West Seneca,2009,,,,
8176,KY,Shepherdsville,2000,,,,


In [10]:
# Back-fill each missing value from the next valid value below it.
# `bfill()` replaces `fillna(method='bfill')`, whose `method` argument is
# deprecated in recent pandas releases; the result is rebound rather than
# mutated in place so the cell does not write into a possible slice/copy.
# NOTE(review): the fill runs over the whole frame, so a gap at the end of one
# city's rows would be filled from the following city - confirm this is intended.
city_housing_data = city_housing_data.bfill()

# Should now be empty: rows that still contain a missing value
city_housing_data[city_housing_data.isna().any(axis=1)]

Unnamed: 0,State,City,Date,AVG_Sale_Counts_Seas_Adj,AVG_InventorySeasonallyAdjusted_AllHomes,AVG_PctOfHomesDecreasingInValues_AllHomes,AVG_PctOfHomesIncreasingInValues_AllHomes


In [11]:
# The four AVG_* measurement columns that get min-max scaled
features_to_scale = [
    'AVG_Sale_Counts_Seas_Adj',
    'AVG_InventorySeasonallyAdjusted_AllHomes',
    'AVG_PctOfHomesDecreasingInValues_AllHomes',
    'AVG_PctOfHomesIncreasingInValues_AllHomes',
]

# Scale via the project helper and rebind the frame to the scaled result
city_housing_data = zillowPro.min_max_scale(
    city_housing_data,
    features_to_scale,
)
city_housing_data.head()

Unnamed: 0,State,City,Date,AVG_Sale_Counts_Seas_Adj,AVG_InventorySeasonallyAdjusted_AllHomes,AVG_PctOfHomesDecreasingInValues_AllHomes,AVG_PctOfHomesIncreasingInValues_AllHomes
0,CA,Folsom,1997,0.014904,0.013691,0.347986,0.498477
1,CA,Folsom,1998,0.014904,0.013691,0.095524,0.836393
2,CA,Folsom,1999,0.014904,0.013691,0.072787,0.8885
3,CA,Folsom,2000,0.014904,0.013691,0.039178,0.948618
4,CA,Folsom,2001,0.014904,0.013691,0.013899,0.982036


In [12]:
# Build the per-city DTW comparison table from the scaled housing data.
# NOTE(review): judging by the displayed output, the result has one row per
# State/City with dtw_score / dtw_value / dtw_label columns plus a GENT_CONTROL
# reference row - confirm against Gentrification_Control.set_geo_dtw_data.
city_dtw_housing = gentPro.set_geo_dtw_data(city_housing_data)

In [13]:
# Cities MOST similar to our gentrification control (smallest dtw_score first)
(
    city_dtw_housing
    .sort_values('dtw_score')
    .reset_index(drop=True)
    .head(21)
)

Unnamed: 0,State,City,dtw_score,dtw_value,dtw_label
0,GENT_CONTROL,GENT_CONTROL,0.0,0.0,HIGH_GENT
1,CO,Colorado Springs,0.589576,0.0,HIGH_GENT
2,OR,Portland,0.6019,0.0,HIGH_GENT
3,CA,San Diego,0.61917,0.0,HIGH_GENT
4,WA,Seattle,0.718773,0.0,HIGH_GENT
5,AZ,Tucson,0.728763,0.0,HIGH_GENT
6,AZ,Mesa,0.733445,0.0,HIGH_GENT
7,TN,Memphis,0.769357,0.0,HIGH_GENT
8,CO,Aurora,0.78157,0.0,HIGH_GENT
9,WI,Milwaukee,0.799147,0.0,HIGH_GENT


In [14]:
# Cities LEAST similar to our gentrification control (largest dtw_score first)
(
    city_dtw_housing
    .sort_values('dtw_score', ascending=False)
    .reset_index(drop=True)
    .head(21)
)

Unnamed: 0,State,City,dtw_score,dtw_value,dtw_label
0,NY,New York,4.398851,4.0,NO_GENT
1,IL,Chicago,3.629912,4.0,NO_GENT
2,NY,Elma,2.17438,3.0,LOW_GENT
3,OH,Johnstown,2.060606,3.0,LOW_GENT
4,OH,Newark,2.043865,3.0,LOW_GENT
5,AZ,Phoenix,2.015426,3.0,LOW_GENT
6,IL,Carlinville,1.993913,3.0,LOW_GENT
7,PA,New Kensington,1.988533,3.0,LOW_GENT
8,NY,Wheatfield,1.982321,3.0,LOW_GENT
9,MI,Belmont,1.979379,3.0,LOW_GENT


In [16]:
# Zillow portal ZPIV time series, loaded from CSV and cleaned by the project helper.
# The file location defaults to the original author's local path; set the
# ZILLOW_ZPIV_CSV environment variable to run the notebook on another machine.
ZPIV_TS_CSV = os.environ.get(
    'ZILLOW_ZPIV_CSV',
    '/Users/briankalinowski/Desktop/CIS600_DataMining/Zillow_Data_Clean/zillow_portal_zpiv.csv',
)
zillow_zpiv_ts = pd.read_csv(ZPIV_TS_CSV)
zillow_zpiv_ts = tsUtils.clean_zillow_ts_data(zillow_zpiv_ts, lookup_ix)
zillow_zpiv_ts.head()

Unnamed: 0,State,City,1997-02,1997-03,1997-04,1997-05,1997-06,1997-07,1997-08,1997-09,...,2018-06,2018-08,2018-09,2018-10,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04
0,AL,Adamsville,0.7912,0.7358,0.5905,0.1004,0.3964,0.4181,0.5512,0.8473,...,0.884092,0.883041,0.6829,0.669909,0.677098,0.653495,0.57787,0.508083,0.485367,0.466795
1,AL,Addison,0.7849,0.7849,0.7849,0.7849,0.7849,0.7849,0.7849,0.7849,...,0.497226,0.616757,0.555377,0.463351,0.409036,0.374787,0.39163,0.329944,0.350684,0.336312
2,AL,Alabaster,0.3711,0.4108,0.5925,0.8298,0.9504,0.852,0.509,0.6752,...,0.761324,0.901996,0.921076,0.927003,0.932381,0.932705,0.927146,0.904728,0.907544,0.901604
3,AL,Alexander City,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,...,0.554625,0.285743,0.117682,0.118119,0.218253,0.267175,0.300942,0.332283,0.332051,0.303818
4,AL,Ardmore,0.8207,0.8207,0.8207,0.8207,0.8207,0.8207,0.8207,0.8207,...,0.766267,0.892922,0.839442,0.831345,0.825216,0.830107,0.80302,0.768582,0.670278,0.630585


In [21]:
# Keep only the rows that have no missing value in any column
zillow_zpiv_ts = zillow_zpiv_ts.dropna()

In [22]:
# Number of rows in the ZPIV frame
zillow_zpiv_ts.shape[0]

10341

In [23]:
# Time-Series-KMeans: cluster the cleaned ZPIV series with the project helper.
# NOTE(review): the third argument (23) is an unexplained magic number. The
# cluster labels shown in the outputs only span 0-4, so it is presumably not
# the cluster count (possibly a random seed?) - confirm in
# Time_Series_KMeans_Utils and give it a named constant.
zpiv_kmeans_clustering = tsUtils.run_time_series_kmeans(zillow_zpiv_ts, lookup_ix, 23)
zpiv_kmeans_clustering.head()

0.521 --> 0.440 --> 0.408 --> 0.400 --> 0.396 --> 


Unnamed: 0,State,City,dtw_cluster_prediction
0,AL,Adamsville,2.0
1,AL,Addison,3.0
2,AL,Alabaster,0.0
3,AL,Alexander City,4.0
4,AL,Ardmore,2.0


In [31]:
# Cluster assigned to Portland, OR: select with one combined boolean mask
portland_in_clusters = (
    (zpiv_kmeans_clustering['State'] == 'OR')
    & (zpiv_kmeans_clustering['City'] == 'Portland')
)
zpiv_kmeans_clustering.loc[portland_in_clusters]

Unnamed: 0,State,City,dtw_cluster_prediction
7862,OR,Portland,4.0


In [30]:
# DTW row for Portland, OR: select with one combined boolean mask
portland_in_dtw = (
    (city_dtw_housing['State'] == 'OR')
    & (city_dtw_housing['City'] == 'Portland')
)
city_dtw_housing.loc[portland_in_dtw]

Unnamed: 0,State,City,dtw_score,dtw_value,dtw_label
992,OR,Portland,0.6019,0.0,HIGH_GENT


In [32]:
# Number of cities assigned to each cluster label
zpiv_kmeans_clustering['dtw_cluster_prediction'].value_counts()

1.0    2535
4.0    2298
2.0    2170
3.0    1834
0.0    1504
Name: dtw_cluster_prediction, dtype: int64