In [141]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import pandas as pd
import numpy as np
import pygeohash as pgh
from sklearn import metrics, cluster

%matplotlib inline

In [142]:
data = 'data/unit_data.csv'
df = pd.read_csv(data)
df.head()

Unnamed: 0,id,avg_price,avg_cpm,zip_code,zip_code_id,lat,lon,size,price,rate_card_price,floor_price,cpm,impressions,supplier_id,supplier_face_id
0,468328,850.0,12.58,4042,7217,41.43777,-75.6549,"10' 6"" x 22' 9""",850.0,1020.0,,12.58,67568,39,290
1,107547,400.0,2.05,4012,15252,29.13522,-82.04484,10' x 40',400.0,480.0,,2.05,195304,39,3849
2,316324,250.0,1.54,4006,39634,34.062725,-118.0529,6' x 12',250.0,300.0,,1.54,162504,39,4484
3,2065349,460.0,0.28,4022,31588,30.219819,-93.358694,12' x 24',460.0,460.0,,0.28,1626044,799,0308A S/F
4,430511,1500.0,3.75,4045,12375,34.007264,-81.015278,10' x 40',1500.0,1800.0,,3.75,399972,39,71292


In [143]:
import re

heights = [] 
widths = []
for size_val in df['size']:
    height_match = re.search("^(.*?)'", size_val)
    if height_match:
        heights.append(height_match.group(1))
    else:
        heights.append(0)
    width_match = re.search("(?<=x ).*?(?=')", size_val)
    if width_match:
        widths.append(width_match.group(0))
    else:
        widths.append(0)
        
rounded_heights = []
for h in heights:
    if h == '3 units each 70"H x 48"W or 1 large unit 68':
        h = 70
    elif h == '2 (40':
        h = 40
    elif h == '123"h x 291"w 10':
        h = 123
    rounded_heights.append(5 * round(float(h) / 5))
    
rounded_widths = []
for w in widths:
    if w == '48"W or 1 large unit 68':
        w = 48
    elif w == '291"w 10':
        w = 291
    rounded_widths.append(5 * round(float(w) / 5))

df['rounded_height'] = rounded_heights
df['rounded_width'] = rounded_widths

In [148]:
df['sqft'] = [l[0] * l[1] for l in list(zip(df.rounded_height, df.rounded_width))]
df['geo_hash'] = [pgh.encode(l[0], l[1], precision=2) for l in list(zip(df.lat, df.lon))]
df.geo_hash.nunique

AttributeError: 'DataFrame' object has no attribute 'rounded_height'

In [145]:
df = df[['id', 'price', 'supplier_id', 'lat', 'lon', 'sqft', 'geo_hash', 'price', 'cpm', 'impressions']]
df.head()

Unnamed: 0,id,price,supplier_id,lat,lon,sqft,geo_hash,price.1,cpm,impressions
0,468328,850.0,39,41.43777,-75.6549,200,dr6,850.0,12.58,67568
1,107547,400.0,39,29.13522,-82.04484,400,djj,400.0,2.05,195304
2,316324,250.0,39,34.062725,-118.0529,50,9qh,250.0,1.54,162504
3,2065349,460.0,799,30.219819,-93.358694,250,9vm,460.0,0.28,1626044
4,430511,1500.0,39,34.007264,-81.015278,400,dnn,1500.0,3.75,399972


In [146]:
# supplier_df = pd.get_dummies(df, columns=['supplier_id'])
# supplier_df.head()
supplier_df = pd.get_dummies(df, columns=['geo_hash'])
supplier_df.head()

Unnamed: 0,id,price,supplier_id,lat,lon,sqft,price.1,cpm,impressions,geo_hash_9mg,...,geo_hash_drs,geo_hash_drt,geo_hash_f00,geo_hash_f01,geo_hash_f02,geo_hash_f03,geo_hash_f04,geo_hash_f05,geo_hash_f0h,geo_hash_f0k
0,468328,850.0,39,41.43777,-75.6549,200,850.0,12.58,67568,0,...,0,0,0,0,0,0,0,0,0,0
1,107547,400.0,39,29.13522,-82.04484,400,400.0,2.05,195304,0,...,0,0,0,0,0,0,0,0,0,0
2,316324,250.0,39,34.062725,-118.0529,50,250.0,1.54,162504,0,...,0,0,0,0,0,0,0,0,0,0
3,2065349,460.0,799,30.219819,-93.358694,250,460.0,0.28,1626044,0,...,0,0,0,0,0,0,0,0,0,0
4,430511,1500.0,39,34.007264,-81.015278,400,1500.0,3.75,399972,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# feature_cols = ['sqft', 'impressions', 'cpm', 'price']
feature_cols = ['impressions', 'cpm', 'price']

# X = scaler.fit_transform(
#     df[feature_cols].to_numpy()
# )

X = scaler.fit_transform(df[feature_cols])
pd.DataFrame(X, columns=feature_cols).describe()

ValueError: Shape of passed values is (21145, 4), indices imply (21145, 3)

In [None]:
from sklearn.cluster import KMeans

In [None]:
wcss = []
for i in range(1, 16):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 16), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = cluster.KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
metrics.silhouette_score(X, labels, metric='euclidean')

In [None]:
X['label'] = labels
X.head()

In [None]:
cols = X.columns[:-2]
sns.pairplot(X.head(200), x_vars=cols, y_vars= cols, hue='label')