In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
# Relative path to the CSV of unit records that the rest of the notebook works on.
data = 'data/unit_data.csv'
# Read the whole file into the working DataFrame `df`.
df = pd.read_csv(data)
# Show the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,id,avg_price,avg_cpm,zip_code,zip_code_id,lat,lon,size,price,rate_card_price,floor_price,cpm,impressions,supplier_id,supplier_face_id
0,468328,850.0,12.58,4042,7217,41.43777,-75.6549,"10' 6"" x 22' 9""",850.0,1020.0,,12.58,67568,39,290
1,107547,400.0,2.05,4012,15252,29.13522,-82.04484,10' x 40',400.0,480.0,,2.05,195304,39,3849
2,316324,250.0,1.54,4006,39634,34.062725,-118.0529,6' x 12',250.0,300.0,,1.54,162504,39,4484
3,2065349,460.0,0.28,4022,31588,30.219819,-93.358694,12' x 24',460.0,460.0,,0.28,1626044,799,0308A S/F
4,430511,1500.0,3.75,4045,12375,34.007264,-81.015278,10' x 40',1500.0,1800.0,,3.75,399972,39,71292


In [3]:
import re

# Derive numeric height/width features from the free-text `size` column
# (e.g. "10' 6\" x 22' 9\""). Only the text in front of the apostrophe is kept
# for each dimension, so any inches that follow it are ignored.


def _capture_or_zero(pattern, text, group):
    """Return regex group `group` of the first match in `text`, or 0 if none."""
    found = re.search(pattern, text)
    return found.group(group) if found else 0


def _nearest_multiple_of_five(raw):
    """Round a number (or numeric string) to the nearest multiple of 5."""
    return 5 * round(float(raw) / 5)


# Height: everything before the first apostrophe.
heights = [_capture_or_zero("^(.*?)'", entry, 1) for entry in df['size']]
# Width: the text after "x " up to the next apostrophe.
widths = [_capture_or_zero("(?<=x ).*?(?=')", entry, 0) for entry in df['size']]

# Hand-written corrections for size strings the regexes cannot reduce to a
# plain number. Any other non-numeric capture still raises in float().
# NOTE(review): several of these source strings are written in inches
# (70"H, 123"h, 291"w) but the numbers are used next to foot values - confirm
# whether they should be converted.
_HEIGHT_OVERRIDES = {
    '3 units each 70"H x 48"W or 1 large unit 68': 70,
    '2 (40': 40,
    '123"h x 291"w 10': 123,
}
_WIDTH_OVERRIDES = {
    '48"W or 1 large unit 68': 48,
    '291"w 10': 291,
}

rounded_heights = [
    _nearest_multiple_of_five(_HEIGHT_OVERRIDES.get(token, token))
    for token in heights
]
rounded_widths = [
    _nearest_multiple_of_five(_WIDTH_OVERRIDES.get(token, token))
    for token in widths
]

# Raw captures (strings, or the int 0 fallback) and their rounded versions.
df['height'] = heights
df['width'] = widths
df['rounded_height'] = rounded_heights
df['rounded_width'] = rounded_widths
# "<height>x<width>" labels for the raw and the rounded dimensions.
df['normal_size'] = [
    str(tall) + 'x' + str(wide) for tall, wide in zip(df.height, df.width)
]
df['rounded_size'] = [
    str(tall) + 'x' + str(wide)
    for tall, wide in zip(df.rounded_height, df.rounded_width)
]
# Area is computed from the rounded dimensions, not the raw ones.
df['sqft'] = [
    tall * wide for tall, wide in zip(df.rounded_height, df.rounded_width)
]
df.head()

Unnamed: 0,id,avg_price,avg_cpm,zip_code,zip_code_id,lat,lon,size,price,rate_card_price,...,impressions,supplier_id,supplier_face_id,height,width,rounded_height,rounded_width,normal_size,rounded_size,sqft
0,468328,850.0,12.58,4042,7217,41.43777,-75.6549,"10' 6"" x 22' 9""",850.0,1020.0,...,67568,39,290,10,22,10,20,10x22,10x20,200
1,107547,400.0,2.05,4012,15252,29.13522,-82.04484,10' x 40',400.0,480.0,...,195304,39,3849,10,40,10,40,10x40,10x40,400
2,316324,250.0,1.54,4006,39634,34.062725,-118.0529,6' x 12',250.0,300.0,...,162504,39,4484,6,12,5,10,6x12,5x10,50
3,2065349,460.0,0.28,4022,31588,30.219819,-93.358694,12' x 24',460.0,460.0,...,1626044,799,0308A S/F,12,24,10,25,12x24,10x25,250
4,430511,1500.0,3.75,4045,12375,34.007264,-81.015278,10' x 40',1500.0,1800.0,...,399972,39,71292,10,40,10,40,10x40,10x40,400


In [4]:
# (row count, column count) of the working frame.
df.shape

(21145, 22)

In [5]:
# Per-column dtypes. `height` and `width` hold the raw regex captures
# (strings, or the int 0 fallback), so they are not numeric columns.
df.dtypes

id                    int64
avg_price           float64
avg_cpm             float64
zip_code              int64
zip_code_id           int64
lat                 float64
lon                 float64
size                 object
price               float64
rate_card_price     float64
floor_price         float64
cpm                 float64
impressions           int64
supplier_id           int64
supplier_face_id     object
height               object
width                object
rounded_height        int64
rounded_width         int64
normal_size          object
rounded_size         object
sqft                  int64
dtype: object

In [6]:
# Number of missing values in each column.
df.isnull().sum()

id                      0
avg_price               0
avg_cpm                 0
zip_code                0
zip_code_id             0
lat                     0
lon                     0
size                    0
price                   0
rate_card_price         7
floor_price         21134
cpm                     0
impressions             0
supplier_id             0
supplier_face_id        2
height                  0
width                   0
rounded_height          0
rounded_width           0
normal_size             0
rounded_size            0
sqft                    0
dtype: int64

In [7]:
# How many rows fall into each rounded "<height>x<width>" bucket, largest first.
df['rounded_size'].value_counts().sort_values(ascending=False)

10x20    9365
15x50    4116
10x35    1786
5x10     1319
10x25    1154
         ... 
25x45       1
20x75       1
10x55       1
15x20       1
30x95       1
Name: rounded_size, Length: 113, dtype: int64

## Ignore rows with rare sizes (keep only sizes that occur more than 3 times)

In [8]:
# Drop rows whose rounded size bucket is rare: a bucket is kept only when it
# occurs more than 3 times. Rows where no size could be parsed land in the
# "0x0" bucket and are kept if that bucket is frequent enough.
counts = df['rounded_size'].value_counts()
frequent_sizes = counts[counts > 3].index
in_frequent_bucket = df['rounded_size'].isin(frequent_sizes)
df = df[in_frequent_bucket]
# Bucket frequencies after filtering, largest first.
df['rounded_size'].value_counts().sort_values(ascending=False)

10x20     9365
15x50     4116
10x35     1786
5x10      1319
10x25     1154
10x30     1014
10x40      995
10x50      232
15x40      153
20x60      147
10x15      139
10x10       81
20x50       71
20x80       44
25x25       38
15x30       34
10x45       34
0x0         32
15x45       30
5x15        26
15x25       25
15x35       24
20x30       22
5x20        18
20x25       16
10x5        14
15x60       11
20x40       10
25x35       10
15x15        9
25x40        9
15x10        9
20x20        6
105x35       6
5x5          5
5x25         5
25x60        5
25x30        5
15x55        4
30x25        4
20x70        4
30x20        4
Name: rounded_size, dtype: int64

In [9]:
# Per-unit feature table keyed by unit id: location, parsed dimensions, area,
# price and CPM figures, and impressions.
# `set_index` is chained rather than applied with inplace=True, so the frame
# selected out of `df` is never mutated in place and the cell can be re-run.
unit_df = df[
    ['id', 'lat', 'lon', 'height', 'width', 'sqft',
     'avg_price', 'price', 'avg_cpm', 'cpm', 'impressions']
].set_index('id')
unit_df.head()

Unnamed: 0_level_0,lat,lon,height,width,sqft,avg_price,price,avg_cpm,cpm,impressions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
468328,41.43777,-75.6549,10,22,200,850.0,850.0,12.58,12.58,67568
107547,29.13522,-82.04484,10,40,400,400.0,400.0,2.05,2.05,195304
316324,34.062725,-118.0529,6,12,50,250.0,250.0,1.54,1.54,162504
2065349,30.219819,-93.358694,12,24,250,460.0,460.0,0.28,0.28,1626044
430511,34.007264,-81.015278,10,40,400,1500.0,1500.0,3.75,3.75,399972


In [10]:
import numpy as np

def haversine_np(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Each argument may be a scalar or an array-like; array arguments must be
    broadcastable against each other (e.g. all the same length), in which
    case the distances are computed element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    Distance in kilometres (using an earth radius of 6367 km): a NumPy float
    for scalar inputs, an ndarray for array inputs.
    """
    # np.asarray(..., dtype=float) accepts scalars, numeric strings and
    # array-likes, where the previous float(...) coercion rejected any array.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would make arcsin return NaN; clip it back into range.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [11]:
# Euclidean distance between two points, computed directly on their lat/lon values
def euc_dist(lat1, lon1, lat2, lon2):
    """Straight-line distance between (lat1, lon1) and (lat2, lon2).

    Every argument is coerced with float(), so only scalars (or numeric
    strings) are accepted. The result is in the same units as the inputs.
    """
    lat_gap = float(lat1) - float(lat2)
    lon_gap = float(lon1) - float(lon2)
    return np.sqrt(lat_gap ** 2 + lon_gap ** 2)

In [12]:
def dist(var1, var2):
    """One-dimensional Euclidean distance: the square root of (var1 - var2) squared.

    Works for anything that supports subtraction, ** and np.sqrt (scalars or
    NumPy arrays); the result is floating point.
    """
    delta = var1 - var2
    return np.sqrt(delta ** 2)

In [13]:
# Pairwise distances between the first 200 units of `unit_df`.
#
# Each record is [id1, id2, lat/lon distance, sqft distance, cpm distance].
# Every ordered pair is produced, including a unit paired with itself (an
# earlier draft skipped index1 == index2).
# Ideas carried over from that draft:
#   - use haversine_np instead of euc_dist for the geographic distance;
#   - fold all features into one distance: subtract the comparison row,
#     square, sum, then take the square root.

# NOTE(review): pdist is imported but not used in this cell.
from scipy.spatial.distance import pdist

# Number of leading rows of unit_df that are compared against each other.
PAIR_SAMPLE_SIZE = 200

# Materialise the (index, row) pairs once. Calling iterrows() inside the
# outer loop rebuilt a Series for every row on every outer iteration.
sample_rows = list(unit_df[:PAIR_SAMPLE_SIZE].iterrows())

output_array = []
for index1, row1 in sample_rows:
    for index2, row2 in sample_rows:
        # Euclidean distance on the lat/lon pair
        e_dist = euc_dist(row1['lat'], row1['lon'], row2['lat'], row2['lon'])
        # One-dimensional distances between areas and between CPMs
        sqft_dist = dist(row1['sqft'], row2['sqft'])
        cpm_dist = dist(row1['cpm'], row2['cpm'])
        output_array.append([index1, index2, e_dist, sqft_dist, cpm_dist])

print(output_array[:10])

[[468328, 468328, 0.0, 0.0, 0.0], [468328, 107547, 13.863046912785807, 200.0, 10.530000000000001], [468328, 316324, 43.034656879682736, 150.0, 11.04], [468328, 2065349, 20.958691434172053, 50.0, 12.3], [468328, 430511, 9.162208888631612, 200.0, 8.83], [468328, 111959, 21.743975598715735, 50.0, 8.51], [468328, 353039, 5.79296048687543, 0.0, 3.860000000000001], [468328, 2065369, 20.963177595370816, 150.0, 11.97], [468328, 159440, 4.6461397812054805, 0.0, 3.8499999999999996], [468328, 307979, 15.748167361687674, 425.0, 10.77]]


In [14]:
# First pair record: [id1, id2, lat/lon distance, sqft distance, cpm distance].
output_array[0]

[468328, 468328, 0.0, 0.0, 0.0]

In [15]:
# Turn the list of pair records into a DataFrame with one named column per field.
new_df = pd.DataFrame(output_array, columns=['id1', 'id2', 'lat_lon_distance', 'sqft_distance', 'cpm_distance'])

In [16]:
# Preview the first pair rows.
new_df.head()

Unnamed: 0,id1,id2,lat_lon_distance,sqft_distance,cpm_distance
0,468328,468328,0.0,0.0,0.0
1,468328,107547,13.863047,200.0,10.53
2,468328,316324,43.034657,150.0,11.04
3,468328,2065349,20.958691,50.0,12.3
4,468328,430511,9.162209,200.0,8.83


In [17]:
from sklearn.preprocessing import StandardScaler
# @TODO use min max scaler
# subtract from 1 for the geo dist
scaler = StandardScaler()
# Standardise each distance column on its own: (raw column, scaled column).
# The same scaler object is re-fitted for every column in this order, so it
# ends up fitted on cpm_distance.
for raw_col, scaled_col in (
    ('lat_lon_distance', 'scaled_geo_dist'),
    ('sqft_distance', 'scaled_sqft'),
    ('cpm_distance', 'scaled_cpm'),
):
    # fit_transform expects a 2-D (n_samples, 1) array
    column_2d = new_df[raw_col].values.reshape(-1, 1)
    new_df[scaled_col] = scaler.fit_transform(column_2d)
new_df.head(30)


# pairwise distance function - euclidean
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.paired_euclidean_distances.html#sklearn.metrics.pairwise.paired_euclidean_distances

Unnamed: 0,id1,id2,lat_lon_distance,sqft_distance,cpm_distance,scaled_geo_dist,scaled_sqft,scaled_cpm
0,468328,468328,0.0,0.0,0.0,-1.423307,-1.096156,-0.689267
1,468328,107547,13.863047,200.0,10.53,-0.057425,-0.223412,0.539425
2,468328,316324,43.034657,150.0,11.04,2.81676,-0.441598,0.598934
3,468328,2065349,20.958691,50.0,12.3,0.641686,-0.87797,0.745957
4,468328,430511,9.162209,200.0,8.83,-0.520584,-0.223412,0.341061
5,468328,111959,21.743976,50.0,8.51,0.719057,-0.87797,0.303721
6,468328,353039,5.79296,0.0,3.86,-0.852545,-1.096156,-0.238863
7,468328,2065369,20.963178,150.0,11.97,0.642128,-0.441598,0.707451
8,468328,159440,4.64614,0.0,3.85,-0.965538,-1.096156,-0.24003
9,468328,307979,15.748167,425.0,10.77,0.128309,0.758426,0.567429


In [18]:
# How often each distinct lat/lon distance value occurs among the pair rows.
new_df.lat_lon_distance.value_counts()

0.000000     206
9.660645       8
15.201322      8
19.293360      8
16.768599      4
            ... 
13.544386      2
5.338624       2
4.605615       2
10.328213      2
6.118058       2
Name: lat_lon_distance, Length: 19307, dtype: int64

In [1]:
# NOTE: the output recorded for this cell is
# "ModuleNotFoundError: No module named 'turicreate'", so the import failed
# and the call below did not run in that session.
import turicreate
# turicreate.item_similarity_recommender
# Build an item-similarity recommender from the pairwise table, passing
# id1 as the user column and id2 as the item column.
m = turicreate.item_similarity_recommender.create(new_df,
                                                user_id='id1',
                                                item_id='id2')

ModuleNotFoundError: No module named 'turicreate'