In [1]:
from IPython.display import HTML
import base64
from itertools import combinations
import math
import os
import pandas as pd
import sys

In [2]:
# Location of the Site Sonar CSV export to analyse
site_sonar_csv = 'https://dl.dropboxusercontent.com/s/yax3qabtyqy9q7p/site_sonar.csv'

# To keep, a location needs to be 100+ miles away from the next closest location
dist_threshold = 100

# To keep, a location needs to be predicted to make $250,000+
prediction_threshold = 250000

In [3]:
def all_location_pairs(loc_coords, loc_predictions):
    """Build a DataFrame of the distance between every pair of locations.

    Each location pair (loc1, loc2) is listed twice; once with loc1 listed
    first, and again with loc2 listed first, so every location shows up in
    the ``loc1_coords`` column once per partner.

    Parameters
    ----------
    loc_coords : iterable of tuple
        (lat, long) coordinate tuples, e.g. a Series of tuples.
    loc_predictions : mapping
        Looked up with each coordinate tuple to fill ``loc1_prediction``.

    Returns
    -------
    pandas.DataFrame
        Columns ``loc1_coords``, ``loc2_coords``, ``distance`` (the result
        of ``km_to_mi(distance(...))``) and ``loc1_prediction``.
    """
    columns = ['loc1_coords', 'loc2_coords', 'distance', 'loc1_prediction']

    # Collect plain records and build the frame once at the end.
    # DataFrame.append copied the whole frame on every call (quadratic) and
    # no longer exists in pandas 2.x.
    records = []
    for startloc_coords, endloc_coords in combinations(loc_coords, 2):
        dist = km_to_mi(distance(startloc_coords, endloc_coords))

        records.append(
            {
                'loc1_coords'     : startloc_coords,
                'loc2_coords'     : endloc_coords,
                'distance'        : dist,
                'loc1_prediction' : loc_predictions[startloc_coords]
            }
        )

        # Mirror row, so the second location is also counted as "loc1"
        records.append(
            {
                'loc1_coords'     : endloc_coords,
                'loc2_coords'     : startloc_coords,
                'distance'        : dist,
                'loc1_prediction' : loc_predictions[endloc_coords]
            }
        )

    # Passing the columns explicitly keeps the schema intact even when fewer
    # than two locations were supplied and no records were produced.
    return pd.DataFrame(records, columns=columns)

def create_download_link(df, title = "Download Whitespace Analysis CSV", filename = "whitespace_results.csv"):
    """Return an HTML anchor that downloads a DataFrame as a CSV file.

    The CSV text is base64-encoded and embedded in the link itself as a
    ``data:`` URI.  Limited to files ~2MB or less.

    Taken from: https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel
    """
    # DataFrame -> CSV text -> bytes -> base64 bytes -> base64 text
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()

    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    anchor = anchor_template.format(payload=encoded_csv, title=title, filename=filename)
    return HTML(anchor)

# Originally used geopy for this, but avoiding that dependency
def distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long), in degrees
    destination : tuple of float
        (lat, long), in degrees

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)
    a = (sin_half_dlat * sin_half_dlat +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         sin_half_dlon * sin_half_dlon)

    # Floating-point rounding can leave `a` a hair outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c
    
def get_next_biggest_conflict(loc_pairs, prediction_threshold):
    """Finds the coordinates of the location that should be dropped next.

    Using a DataFrame generated by all_location_pairs, returns the coordinates of
    the location in that DataFrame that has the most violations of the distance
    threshold and, among locations tied on that count, the lowest predicted
    revenue.

    Parameters
    ----------
    loc_pairs : pandas.DataFrame
        Needs the ``loc1_coords`` and ``loc1_prediction`` columns.
    prediction_threshold : number
        Not used by the selection; kept so existing callers keep working.

    Returns
    -------
    The ``loc1_coords`` value to drop, or None when ``loc_pairs`` has no rows.
    """
    # Nothing conflicts, so there is nothing to drop
    if loc_pairs.empty:
        return None

    # Get the set of locations with the most conflicts
    conflict_counts = loc_pairs['loc1_coords'].value_counts()
    maxcount_conflicts = conflict_counts[conflict_counts == conflict_counts.max()]

    # Take predictions from the frame itself rather than from a notebook-level
    # global, so the function only depends on its arguments.
    predictions = dict(zip(loc_pairs['loc1_coords'], loc_pairs['loc1_prediction']))

    # Of those, find the one with the lowest predicted revenue, and return it.
    drop_coords = None
    drop_prediction = math.inf
    for coords in maxcount_conflicts.index:
        prediction = predictions[coords]
        if prediction < drop_prediction:
            drop_prediction = prediction
            drop_coords = coords

    return drop_coords

def km_to_mi(km):
    """Convert kilometres to miles, rounded to two decimal places."""
    miles_per_km = 0.621371  # equivalently, 1 mi == 1.60934 km
    return round(km * miles_per_km, 2)

def make_coords_tuple(row):
    """Return the row's 'Latitude' and 'Longitude' values as a (latitude, longitude) tuple."""
    latitude = row['Latitude']
    longitude = row['Longitude']
    return latitude, longitude

In [4]:
# Load the Site Sonar export into a DataFrame
site_sonar = pd.read_csv(site_sonar_csv)

# Report how many locations we start with...
print("Number of locations from original Site Sonar data:", len(site_sonar), sep="\n")

# ...and show them (the bare last expression renders the frame)
print("Details of locations from original Site Sonar data:")
site_sonar

Number of locations from original Site Sonar data:
113
Details of locations from original Site Sonar data:


Unnamed: 0,POI Category,POI Sub Category,Site Name,Latitude,Longitude,Address,City,State,Zip Code,Country,Total Zeustimate
0,Art Supply,Misc Art Supply,Art Mart,40.017719,-105.279221,1222 Pearl Street,Boulder,CO,80302,United States,272869.46
1,Art Supply,Misc Art Supply,Binders Art Supplies & Frames,35.153278,-80.839859,3330 Piedmont Road,Charlotte,NC,28211,United States,239456.12
2,Art Supply,Misc Art Supply,Blick Art & Craft,40.100616,-83.091103,6486 Sawmill Road,Columbus,OH,43235,United States,263714.25
3,Art Supply,Misc Art Supply,Blick Art & Craft,42.050552,-88.030792,1975 East Golf Road,Schaumburg,IL,60173,United States,275192.25
4,Art Supply,Misc Art Supply,Blick Art & Craft,45.011761,-93.176765,2389 Fairview Avenue,Roseville,MN,55113,United States,273543.50
5,Art Supply,Misc Art Supply,Blick Art & Craft,36.170265,-115.206131,290 South Decatur Boulevard,Las Vegas,NV,89107,United States,246996.73
6,Art Supply,Misc Art Supply,Blick Art Materials,42.344147,-71.102364,401 Park Drive,Boston,MA,2215,United States,268256.08
7,Art Supply,Misc Art Supply,Blick Art Materials,41.691280,-72.845215,341 Cooke Street,Plainville,CT,6062,United States,346798.61
8,Art Supply,Misc Art Supply,Blick Art Materials,40.726540,-73.994064,1 5 Bond Street,New York,NY,10012,United States,323826.14
9,Art Supply,Misc Art Supply,Blick Art Materials,39.950542,-75.163033,1330 Chestnut Street,Philadelphia,PA,19107,United States,346974.98


In [5]:
# Keep only the locations whose predicted revenue meets our threshold.
# The threshold is a minimum, so a prediction exactly at the threshold is kept.
site_sonar = site_sonar[site_sonar['Total Zeustimate'] >= prediction_threshold]

# Identify each remaining location by its (Latitude, Longitude) tuple, and
# map that tuple to the location's predicted revenue
site_sonar_coords = site_sonar.apply(make_coords_tuple, axis=1)
site_sonar_predictions = dict(zip(site_sonar_coords, site_sonar['Total Zeustimate']))

# Create a DataFrame containing all pairs of locations and the distances between them
all_pairs = all_location_pairs(site_sonar_coords, site_sonar_predictions)

print("Number of locations meeting the prediction constraint:")
print(len(site_sonar))

print("Details of locations meeting the prediction constraint:")
site_sonar

Number of locations meeting the prediction constraint:
88
Details of locations meeting the prediction constraint:


Unnamed: 0,POI Category,POI Sub Category,Site Name,Latitude,Longitude,Address,City,State,Zip Code,Country,Total Zeustimate
0,Art Supply,Misc Art Supply,Art Mart,40.017719,-105.279221,1222 Pearl Street,Boulder,CO,80302,United States,272869.46
2,Art Supply,Misc Art Supply,Blick Art & Craft,40.100616,-83.091103,6486 Sawmill Road,Columbus,OH,43235,United States,263714.25
3,Art Supply,Misc Art Supply,Blick Art & Craft,42.050552,-88.030792,1975 East Golf Road,Schaumburg,IL,60173,United States,275192.25
4,Art Supply,Misc Art Supply,Blick Art & Craft,45.011761,-93.176765,2389 Fairview Avenue,Roseville,MN,55113,United States,273543.50
6,Art Supply,Misc Art Supply,Blick Art Materials,42.344147,-71.102364,401 Park Drive,Boston,MA,2215,United States,268256.08
7,Art Supply,Misc Art Supply,Blick Art Materials,41.691280,-72.845215,341 Cooke Street,Plainville,CT,6062,United States,346798.61
8,Art Supply,Misc Art Supply,Blick Art Materials,40.726540,-73.994064,1 5 Bond Street,New York,NY,10012,United States,323826.14
9,Art Supply,Misc Art Supply,Blick Art Materials,39.950542,-75.163033,1330 Chestnut Street,Philadelphia,PA,19107,United States,346974.98
10,Art Supply,Misc Art Supply,Blick Art Materials,40.557156,-75.489006,3152 Lehigh Street,Allentown,PA,18103,United States,317991.87
11,Art Supply,Misc Art Supply,Blick Art Materials,40.243580,-76.902519,836 Market Street,Lemoyne,PA,17043,United States,364719.90


In [6]:
# Start from every pair of locations that sits closer together than the distance threshold
tooclose_pairs = all_pairs[all_pairs['distance'] < dist_threshold]

# Repeat until no "too close" pair is left: pick the next-biggest violation
# of our distance and revenue constraints and remove that location everywhere.
while len(tooclose_pairs) > 0:
    drop_coords = get_next_biggest_conflict(tooclose_pairs, prediction_threshold)

    # Discard every pair that involves the dropped location, on either side
    unaffected = (tooclose_pairs['loc1_coords'] != drop_coords) & (tooclose_pairs['loc2_coords'] != drop_coords)
    tooclose_pairs = tooclose_pairs[unaffected]

    # Remove the location from the coordinate list, the prediction lookup and the main table
    site_sonar_coords = site_sonar_coords[site_sonar_coords != drop_coords]
    del site_sonar_predictions[drop_coords]
    is_dropped_site = (site_sonar['Latitude'] == drop_coords[0]) & (site_sonar['Longitude'] == drop_coords[1])
    site_sonar = site_sonar[~is_dropped_site]

In [7]:
# Report how many locations survived both filters, then show them
print("Number of locations meeting both prediction and distance constraints:", len(site_sonar), sep="\n")

print("Details of locations meeting both prediction and distance constraints:")
site_sonar

Number of locations meeting both prediction and distance constraints:
27
Details of locations meeting both prediction and distance constraints:


Unnamed: 0,POI Category,POI Sub Category,Site Name,Latitude,Longitude,Address,City,State,Zip Code,Country,Total Zeustimate
0,Art Supply,Misc Art Supply,Art Mart,40.017719,-105.279221,1222 Pearl Street,Boulder,CO,80302,United States,272869.46
4,Art Supply,Misc Art Supply,Blick Art & Craft,45.011761,-93.176765,2389 Fairview Avenue,Roseville,MN,55113,United States,273543.5
7,Art Supply,Misc Art Supply,Blick Art Materials,41.69128,-72.845215,341 Cooke Street,Plainville,CT,6062,United States,346798.61
14,Art Supply,Misc Art Supply,Blick Art Materials,37.988411,-87.492943,1364 North Green River Road,Evansville,IN,47715,United States,384119.18
19,Art Supply,Misc Art Supply,Blick Art Materials,38.651897,-90.34005,8007 Maryland Avenue,St Louis,MO,63105,United States,263490.48
23,Art Supply,Misc Art Supply,Blick Art Materials,41.259586,-96.036148,7829 Dodge Street,Omaha,NE,68114,United States,255415.6
27,Art Supply,Misc Art Supply,Blick Art Materials,34.07621,-118.348755,7301 West Beverly Boulevard,Los Angeles,CA,90036,United States,269297.42
32,Art Supply,Misc Art Supply,Blick Art Materials Outlet,40.9212,-90.314095,695 Us Highway 150 East,Galesburg,IL,61401,United States,495409.45
41,Art Supply,Misc Art Supply,Jerry's Artarama,26.31485,-80.090889,242 South Federal Highway,Deerfield Beach,FL,33441,United States,340411.24
46,Art Supply,Misc Art Supply,Jerry's Artarama,35.824108,-78.621826,3060 Wake Forest Road,Raleigh,NC,27609,United States,265075.69


In [8]:
# Alternative, kept for reference: write the results to a local file instead.
#
# path, filename = os.path.split(site_sonar_csv)
# filename = os.path.splitext(filename)[0]
# newfilename = '%s_whitespace_recommendations.csv' % filename
# keep_csv = os.path.join(path, newfilename)
# site_sonar.to_csv(keep_csv, index=False)

# Render a download link for the final set of locations
create_download_link(df=site_sonar)