In [None]:
from IPython.display import HTML
import base64
from itertools import combinations
import math
import os
import pandas as pd
import sys

In [None]:
# URL of the Site Sonar CSV export; it is loaded further down with pd.read_csv.
site_sonar_csv = 'https://dl.dropboxusercontent.com/s/yax3qabtyqy9q7p/site_sonar.csv'
# Distance cutoff in miles: pairs of locations closer together than this are flagged as conflicts.
dist_threshold = 100           # To keep, a location needs to be 100+ miles away from the next closest location
# Cutoff compared against the 'Total Zeustimate' column when filtering locations.
prediction_threshold = 250000  # To keep, a location needs to be predicted to make $250,000+

In [None]:
def create_download_link(df, title = "Download Whitespace Analysis CSV", filename = "whitespace_results.csv"):
    """Build an HTML anchor that downloads ``df`` as a CSV file.

    The frame is serialised without its index, base64-encoded and embedded in
    a ``data:`` URI, so nothing is written to disk.  Limited to files ~2MB or less.

    Adapted from: https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel
    """
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'

    # CSV text -> bytes -> base64 bytes -> ASCII string suitable for the href attribute
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()

    return HTML(anchor_template.format(filename=filename, payload=encoded_csv, title=title))

# Originally used geopy for this, but avoiding that dependency
def distance(origin, destination):
    """
    Great-circle distance between two points, using the Haversine formula.

    Parameters
    ----------
    origin : tuple of float
        (lat, long), in degrees
    destination : tuple of float
        (lat, long), in degrees

    Returns
    -------
    distance_in_km : float
        Computed with an Earth radius of 6371 km.

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    earth_radius_km = 6371

    (start_lat, start_lon), (end_lat, end_lon) = origin, destination

    # Sine of half the latitude / longitude difference (differences converted to radians)
    sin_half_dlat = math.sin(math.radians(end_lat - start_lat) / 2)
    sin_half_dlon = math.sin(math.radians(end_lon - start_lon) / 2)

    # Haversine of the central angle between the two points
    hav = (sin_half_dlat * sin_half_dlat +
           math.cos(math.radians(start_lat)) * math.cos(math.radians(end_lat)) *
           sin_half_dlon * sin_half_dlon)
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_km * central_angle

def get_next_biggest_conflict(loc_pairs, prediction_threshold):
    """Find the coordinates of the location that should be dropped next.

    ``loc_pairs`` is a DataFrame shaped like the output of
    ``tooclose_location_pairs``: one row per too-close pair, with the columns
    ``loc1_coords`` and ``loc1_prediction`` (the predicted revenue of the
    location named in ``loc1_coords``).

    Among the locations that appear most often in ``loc1_coords`` (i.e. those
    with the most distance-threshold violations), the one with the lowest
    predicted revenue is selected.

    Parameters
    ----------
    loc_pairs : pandas.DataFrame
        Too-close location pairs, as described above.
    prediction_threshold : number
        Accepted for interface compatibility; not used by the selection.

    Returns
    -------
    tuple or None
        Coordinates of the location to drop.  ``None`` if ``loc_pairs`` is
        empty, or if no candidate has a prediction below ``math.inf``.
    """
    # Nothing conflicts, so there is nothing to drop.
    if loc_pairs.empty:
        return None

    # Locations tied for the highest number of conflicts.
    conflict_counts = loc_pairs.loc1_coords.value_counts()
    most_conflicted = conflict_counts[conflict_counts == conflict_counts.max()]

    # Predictions are read from the pairs table itself rather than from a
    # notebook-level global, so the function only depends on its arguments.
    predictions = dict(zip(loc_pairs.loc1_coords, loc_pairs.loc1_prediction))

    # Of those, find the one with the lowest predicted revenue, and return it.
    # Iterating the index (instead of the removed Series.iteritems) works on
    # both old and current pandas releases.
    drop_coords = None
    drop_prediction = math.inf
    for coords in most_conflicted.index:
        prediction = predictions[coords]
        if prediction < drop_prediction:
            drop_prediction = prediction
            drop_coords = coords

    return drop_coords

def km_to_mi(km):
    """Convert a distance in kilometres to miles, rounded to two decimal places."""
    # Conversion factors: 1 km == 0.621371 mi, 1 mi == 1.60934 km
    miles_per_km = 0.621371
    return round(km * miles_per_km, 2)

def make_coords_tuple(row):
    """Return the ``(Latitude, Longitude)`` values of a DataFrame row as a tuple."""
    latitude = row['Latitude']
    longitude = row['Longitude']
    return latitude, longitude

def tooclose_location_pairs(loc_coords, loc_predictions, dist_threshold):
    """Given a Series of location coordinate tuples, creates a DataFrame of location pairs below the distance threshold.

    Each location pair (loc1, loc2) below the threshold is listed twice; once with loc1 listed first, and again
    with loc2 listed first.  Listing both orientations lets callers count conflicts per location by looking at
    the ``loc1_coords`` column alone.

    Parameters
    ----------
    loc_coords : iterable of tuple
        (lat, long) coordinates, one entry per location.
    loc_predictions : mapping
        Maps each coordinate tuple to its predicted revenue.
    dist_threshold : number
        Pairs whose distance in miles is strictly below this value are reported.

    Returns
    -------
    pandas.DataFrame
        Columns ``loc1_coords``, ``loc2_coords``, ``distance`` (miles) and ``loc1_prediction``, with a
        default integer index.  The columns are present even when no pair is too close.
    """
    columns = ['loc1_coords', 'loc2_coords', 'distance', 'loc1_prediction']
    records = []

    for startloc_coords, endloc_coords in combinations(loc_coords, 2):
        dist = km_to_mi(distance(startloc_coords, endloc_coords))

        if dist < dist_threshold:
            # Append one record per orientation.  Storing both under the same dict key
            # (as a keyed dict would) silently discards the first orientation.
            for loc1, loc2 in ((startloc_coords, endloc_coords), (endloc_coords, startloc_coords)):
                records.append({
                    'loc1_coords'     : loc1,
                    'loc2_coords'     : loc2,
                    'distance'        : dist,
                    'loc1_prediction' : loc_predictions[loc1]
                })

    return pd.DataFrame(records, columns=columns)

In [None]:
# Load the raw Site Sonar export into a DataFrame
site_sonar = pd.read_csv(site_sonar_csv)

# Report the starting number of locations (label and count on separate lines)
print("Number of locations from original Site Sonar data:", len(site_sonar), sep="\n")

# Show the full table as the cell's output
print("Details of locations from original Site Sonar data:")
site_sonar

In [None]:
# Keep only the locations whose predicted revenue meets the threshold, i.e. is
# prediction_threshold or more (the config comment documents the rule as "$250,000+").
site_sonar = site_sonar[site_sonar['Total Zeustimate'] >= prediction_threshold]

# One (Latitude, Longitude) tuple per remaining location
site_sonar_coords = site_sonar.apply(make_coords_tuple, axis=1)
# Lookup from coordinate tuple to that location's predicted revenue
site_sonar_predictions = dict(zip(site_sonar_coords, site_sonar['Total Zeustimate']))

print("Number of locations meeting the prediction constraint:")
print(len(site_sonar))

print("Details of locations meeting the prediction constraint:")
site_sonar

In [None]:
# Build the table of location pairs that sit closer together than dist_threshold,
# together with the distance separating each pair.
tooclose_pairs = tooclose_location_pairs(
    loc_coords=site_sonar_coords,
    loc_predictions=site_sonar_predictions,
    dist_threshold=dist_threshold,
)

# Report how many pair rows were found (label and count on separate lines)
print("Number of location pairs (symmetric) violating the distance constraint:", len(tooclose_pairs), sep="\n")

# Show the pairs table as the cell's output
print("Details of location pairs (symmetric) violating the distance constraint:")
tooclose_pairs

In [None]:
# Resolve the distance conflicts greedily: on every pass, remove the location that
# get_next_biggest_conflict selects, until no too-close pairs remain.
while not tooclose_pairs.empty:
    drop_coords = get_next_biggest_conflict(tooclose_pairs, prediction_threshold)

    # Discard every pair that involves the dropped location, on either side
    unaffected = (tooclose_pairs.loc1_coords != drop_coords) & (tooclose_pairs.loc2_coords != drop_coords)
    tooclose_pairs = tooclose_pairs[unaffected]

    # Remove the location from the coordinate Series, the prediction lookup and the main table
    site_sonar_coords = site_sonar_coords[site_sonar_coords != drop_coords]
    site_sonar_predictions.pop(drop_coords)
    same_latitude = site_sonar.Latitude == drop_coords[0]
    same_longitude = site_sonar.Longitude == drop_coords[1]
    site_sonar = site_sonar[~(same_latitude & same_longitude)]

In [None]:
# Report how many locations survived both filters (label and count on separate lines)
print("Number of locations meeting both prediction and distance constraints:", len(site_sonar), sep="\n")

# Show the final table as the cell's output
print("Details of locations meeting both prediction and distance constraints:")
site_sonar

In [None]:
# Alternative (disabled): write the results to a CSV file next to the input instead of
# rendering a download link.
# NOTE(review): site_sonar_csv is a URL in this notebook, so the path derived below would
# also be a URL; presumably this was written for a local input file. Confirm before enabling.
#
# path, filename = os.path.split(site_sonar_csv)
# filename = os.path.splitext(filename)[0]
# newfilename = '%s_whitespace_recommendations.csv' % filename
# keep_csv = os.path.join(path, newfilename)
# site_sonar.to_csv(keep_csv, index=False)

# Render a download link for the remaining locations as the cell's output.
create_download_link(site_sonar)