In [148]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pygeohash as pgh
from sklearn import metrics

%matplotlib inline

In [149]:
data = 'data/unit_data.csv'
df = pd.read_csv(data)
df.head()

Unnamed: 0,id,avg_price,avg_cpm,zip_code,zip_code_id,lat,lon,size,price,rate_card_price,floor_price,cpm,impressions,supplier_id,supplier_face_id
0,468328,850.0,12.58,4042,7217,41.43777,-75.6549,"10' 6"" x 22' 9""",850.0,1020.0,,12.58,67568,39,290
1,107547,400.0,2.05,4012,15252,29.13522,-82.04484,10' x 40',400.0,480.0,,2.05,195304,39,3849
2,316324,250.0,1.54,4006,39634,34.062725,-118.0529,6' x 12',250.0,300.0,,1.54,162504,39,4484
3,2065349,460.0,0.28,4022,31588,30.219819,-93.358694,12' x 24',460.0,460.0,,0.28,1626044,799,0308A S/F
4,430511,1500.0,3.75,4045,12375,34.007264,-81.015278,10' x 40',1500.0,1800.0,,3.75,399972,39,71292


In [150]:
import re

heights = [] 
widths = []
for size_val in df['size']:
    height_match = re.search("^(.*?)'", size_val)
    if height_match:
        heights.append(height_match.group(1))
    else:
        heights.append(0)
    width_match = re.search("(?<=x ).*?(?=')", size_val)
    if width_match:
        widths.append(width_match.group(0))
    else:
        widths.append(0)
        
rounded_heights = []
for h in heights:
    if h == '3 units each 70"H x 48"W or 1 large unit 68':
        h = 70
    elif h == '2 (40':
        h = 40
    elif h == '123"h x 291"w 10':
        h = 123
    rounded_heights.append(5 * round(float(h) / 5))
    
rounded_widths = []
for w in widths:
    if w == '48"W or 1 large unit 68':
        w = 48
    elif w == '291"w 10':
        w = 291
    rounded_widths.append(5 * round(float(w) / 5))

df['rounded_height'] = rounded_heights
df['rounded_width'] = rounded_widths

In [151]:
df['sqft'] = [l[0] * l[1] for l in list(zip(df.rounded_height, df.rounded_width))]
df['geo_hash'] = [pgh.encode(l[0], l[1], precision=4) for l in list(zip(df.lat, df.lon))]
df.head()

Unnamed: 0,id,avg_price,avg_cpm,zip_code,zip_code_id,lat,lon,size,price,rate_card_price,floor_price,cpm,impressions,supplier_id,supplier_face_id,rounded_height,rounded_width,sqft,geo_hash
0,468328,850.0,12.58,4042,7217,41.43777,-75.6549,"10' 6"" x 22' 9""",850.0,1020.0,,12.58,67568,39,290,10,20,200,dr65
1,107547,400.0,2.05,4012,15252,29.13522,-82.04484,10' x 40',400.0,480.0,,2.05,195304,39,3849,10,40,400,djjt
2,316324,250.0,1.54,4006,39634,34.062725,-118.0529,6' x 12',250.0,300.0,,1.54,162504,39,4484,5,10,50,9qh1
3,2065349,460.0,0.28,4022,31588,30.219819,-93.358694,12' x 24',460.0,460.0,,0.28,1626044,799,0308A S/F,10,25,250,9vme
4,430511,1500.0,3.75,4045,12375,34.007264,-81.015278,10' x 40',1500.0,1800.0,,3.75,399972,39,71292,10,40,400,dnn3


In [152]:
df = df[['id', 'price', 'supplier_id', 'lat', 'lon', 'sqft', 'geo_hash', 'price', 'cpm', 'impressions']]
df.head()

Unnamed: 0,id,price,supplier_id,lat,lon,sqft,geo_hash,price.1,cpm,impressions
0,468328,850.0,39,41.43777,-75.6549,200,dr65,850.0,12.58,67568
1,107547,400.0,39,29.13522,-82.04484,400,djjt,400.0,2.05,195304
2,316324,250.0,39,34.062725,-118.0529,50,9qh1,250.0,1.54,162504
3,2065349,460.0,799,30.219819,-93.358694,250,9vme,460.0,0.28,1626044
4,430511,1500.0,39,34.007264,-81.015278,400,dnn3,1500.0,3.75,399972


In [153]:
def geo_dist(lat1, lon1, lat2, lon2):
    x = np.absolute((float(lat1)**2) - (float(lat2)**2))
    y = np.absolute((float(lon1)**2) - (float(lon2)**2))
    coord_sum = x + y
    sqrt = np.sqrt(coord_sum)
    return sqrt

In [154]:
def dist(var1, var2):
    sq_sub = (var1 - var2)**2
    return np.sqrt(sq_sub)

In [155]:
output_array = []
for index1, row1 in df[:200].iterrows():
    for index2, row2 in df[:200].iterrows():
        loc_dist = geo_dist(row1['lat'], row1['lon'], row2['lat'], row2['lon'])
        sqft_dist = dist(row1['sqft'], row2['sqft'])
        cpm_dist = dist(row1['cpm'], row2['cpm'])
        output_array.append(
            [
                row1['id'], 
                row2['id'], 
                loc_dist, 
                sqft_dist, 
                cpm_dist, 
                row1['cpm'], 
                row2['cpm'], 
                row1['impressions'], 
                row2['impressions']
            ]
        )

In [156]:
matrix_df = pd.DataFrame(
    output_array,
    columns=[
        'id1', 
        'id2', 
        'location_distance', 
        'sqft_distance', 
        'cpm_distance', 
        'cpm_id1', 
        'cpm_id2',
        'impressions_id1', 
        'impressions_id2'
    ]
)