# Aim
Evaluate the difference in performance between the following:
1. Postal Codes 
2. Clustered (KMeans)
3. Clustered (TS Analysis)
***
#### Important Note: 
- First computed with three-room house type
- Number of clusters (KMeans): optimal according to SSE (sum of squared errors), SI (silhouette index) and DB (Davies-Bouldin index)
- Error Metrics : (RMSE, MAE)
- Generalisation error calculated over the entire training set.
***

In [34]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tslearn.metrics import dtw
from sklearn.metrics import pairwise_distances, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
import re

In [35]:
# Load the price table; the first CSV column is used as the row index.
df = pd.read_csv("../Ken Folder/df_geo_imp.csv", index_col=0)

# Replace the file's header with the short column names used throughout the notebook.
short_column_names = ["PC", "BT", "Quarter", "EUR/m2", "latitude", "longitude"]
df.columns = short_column_names

# Display the frame as the cell output.
df

Unnamed: 0,PC,BT,Quarter,EUR/m2,latitude,longitude
0,100,one-room,2010-01-01,5458,60.1714,24.9316
1,100,two-room,2010-01-01,5164,60.1714,24.9316
2,100,three or more room,2010-01-01,4944,60.1714,24.9316
3,100,terrace house,2010-01-01,1783,60.1714,24.9316
4,100,one-room,2010-04-01,5347,60.1714,24.9316
...,...,...,...,...,...,...
308931,99980,terrace house,2021-01-01,2219,69.8467,26.8907
308932,99980,one-room,2021-04-01,2290,69.8467,26.8907
308933,99980,two-room,2021-04-01,2290,69.8467,26.8907
308934,99980,three or more room,2021-04-01,2290,69.8467,26.8907


In [47]:
# Split the table into one frame per room type, selected by the value in the "BT" column.
bt_values = df["BT"]
one_room = df.loc[bt_values == "one-room"]
two_room = df.loc[bt_values == "two-room"]
three_room = df.loc[bt_values == "three or more room"]

In [58]:
# Use 3-room apartments, create time series
# One column of EUR/m2 values per postal code, in the row order of `three_room`
# (assumes rows are already ordered by Quarter within each postal code - TODO confirm).
grouping = three_room.groupby('PC')

# Iterate in the order postal codes appear in `df`, but only over codes that actually
# have three-room rows: get_group() raises KeyError for a code missing from the groupby.
# Building the frame from a dict in one go also avoids inserting columns one at a time.
prices_by_pc = {
    str(pc): grouping.get_group(pc)["EUR/m2"].values
    for pc in df["PC"].unique()
    if pc in grouping.groups
}
# Raises ValueError if the postal codes do not all have the same number of observations.
ts_wide = pd.DataFrame(prices_by_pc)

# Preprocessing time series
# MinMaxScaler works column-wise, so each postal code's series is rescaled to [0, 1]
# independently before transposing to one row per postal code.
scaler = MinMaxScaler(copy=False)
ts_df = pd.DataFrame(data=scaler.fit_transform(ts_wide),
                     columns=ts_wide.columns).T
ts_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
100,0.034668,0.169755,0.000000,0.169157,0.222056,0.263299,0.185894,0.054393,0.261805,0.190675,...,0.614166,0.603108,0.762403,0.567842,0.710400,0.552899,0.702331,0.770173,0.963837,1.000000
120,0.283467,0.122147,0.174892,0.107650,0.115361,0.078347,0.060148,0.000000,0.166872,0.148982,...,0.747378,0.655768,0.597162,0.792721,0.706354,0.846083,0.907156,0.872610,1.000000,0.947563
130,0.000000,0.105438,0.129310,0.167109,0.232759,0.315650,0.336207,0.318302,0.383952,0.352122,...,0.580239,0.655172,0.590186,0.531830,0.718833,0.773873,0.786472,0.840849,0.905172,1.000000
140,0.086344,0.125399,0.119022,0.000000,0.202444,0.186238,0.151966,0.221838,0.296759,0.052604,...,0.667641,0.673751,0.552072,0.759830,0.600425,0.881775,0.695537,0.804729,0.976089,1.000000
150,0.098129,0.081158,0.024369,0.067885,0.164056,0.084856,0.152306,0.000000,0.107050,0.018930,...,0.598346,0.628590,0.468451,0.461923,0.726501,0.698651,0.469104,0.620975,1.000000,0.834204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99830,0.000000,0.080868,0.106509,0.076923,0.169625,0.226824,0.218935,0.155819,0.230769,0.289941,...,0.593688,0.658777,0.627219,0.564103,0.694280,0.727811,0.769231,0.767258,0.859961,1.000000
99870,0.000000,0.080868,0.106509,0.076923,0.169625,0.226824,0.218935,0.155819,0.230769,0.289941,...,0.593688,0.658777,0.627219,0.564103,0.694280,0.727811,0.769231,0.767258,0.859961,1.000000
99930,0.000000,0.080868,0.106509,0.076923,0.169625,0.226824,0.218935,0.155819,0.230769,0.289941,...,0.593688,0.658777,0.627219,0.564103,0.694280,0.727811,0.769231,0.767258,0.859961,1.000000
99950,0.000000,0.080868,0.106509,0.076923,0.169625,0.226824,0.218935,0.155819,0.230769,0.289941,...,0.593688,0.658777,0.627219,0.564103,0.694280,0.727811,0.769231,0.767258,0.859961,1.000000


In [59]:
# Clustering
# Fit KMeans for every candidate number of clusters and record three quality scores
# per K: SSE (km.inertia_), silhouette index and Davies-Bouldin index.
K = range(2,10)
sse_scores = []
si_scores = []
db_scores = []

# Plain 2-D array (one row per postal code) shared by the estimator and the metrics.
ts_values = ts_df.to_numpy()

for k in K :
    km = KMeans(
        n_clusters=k,
        n_init = 50,
        random_state=42
    )
    labels = km.fit_predict(ts_values)
    sse_scores.append(km.inertia_)
    # `silhouette_score` is imported from both sklearn.metrics and tslearn.clustering;
    # the later tslearn import wins and defaults to DTW distances. KMeans clusters with
    # Euclidean distance, so the metric is pinned explicitly to score the same geometry
    # (both implementations accept metric="euclidean").
    si_scores.append(silhouette_score(ts_values, labels, metric="euclidean"))
    db_scores.append(davies_bouldin_score(ts_values, labels))


In [44]:
# Silhouette index for each candidate number of clusters.
plt.figure(figsize=(6, 4))
# Plot against K itself: plt.plot(si_scores) alone would place the points at
# x = 0..len-1, so the K tick labels below would not line up with the data.
plt.plot(list(K), si_scores)
plt.xlabel("K")
plt.ylabel("Silhouette Index")
plt.xticks(list(K));

[0;31mInit signature:[0m
[0mKMeans[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_clusters[0m[0;34m=[0m[0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minit[0m[0;34m=[0m[0;34m'k-means++'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_init[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m300[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprecompute_distances[0m[0;34m=[0m[0;34m'deprecated'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy_x[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;34m'deprecated'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malgorithm[0m[0;34m=[0m[

In [None]:
# SSE (KMeans inertia) for each candidate number of clusters.
plt.figure(figsize=(6, 4))
# Plot against K itself: plt.plot(sse_scores) alone would place the points at
# x = 0..len-1, so the K tick labels below would not line up with the data.
plt.plot(list(K), sse_scores)
plt.xlabel("K")
plt.ylabel("SSE")
plt.xticks(list(K));


In [None]:
# Davies-Bouldin index for each candidate number of clusters.
plt.figure(figsize=(6, 4))
# Plot against K itself: plt.plot(db_scores) alone would place the points at
# x = 0..len-1, so the K tick labels below would not line up with the data.
plt.plot(list(K), db_scores)
plt.xlabel("K")
plt.ylabel("DB")
plt.xticks(list(K));
