In [None]:
import infostop
import pandas as pd
import numpy as np

import osmnx as ox
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
ox.config(use_cache=True, log_console=True)
print(ox.__version__)

import networkx as nx
import geopandas as gpd
import multiprocessing as mp

from descartes import PolygonPatch
from shapely.geometry import Polygon, MultiPolygon

import folium
from folium.plugins import Fullscreen, HeatMapWithTime, TimestampedGeoJson
from folium.plugins import TimestampedGeoJson, HeatMap, HeatMapWithTime

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import plotly_express as px
import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

In [None]:
from IPython.display import display, HTML, Markdown

def print_df(df):
    """Show ``df`` in the notebook output area as an HTML table.

    The frame is serialised with ``DataFrame.to_html`` and passed to
    IPython's ``display``; whatever ``display`` returns is handed back.
    """
    rendered = HTML(df.to_html())
    return display(rendered)

# Load the stop table in one pipeline: the raw 'loc' column is exposed as
# 'label', 'start'/'end' are parsed as epoch milliseconds, and the two
# identifier columns are turned into strings.
stops = (
    pd.read_csv('stops.csv')
    .rename(columns={'loc': 'label'})
    .assign(
        start=lambda frame: pd.to_datetime(frame['start'], unit='ms'),
        end=lambda frame: pd.to_datetime(frame['end'], unit='ms'),
        # int first, then str -- presumably to avoid a float-style suffix
        # in the label text; TODO confirm against the raw file.
        label=lambda frame: frame['label'].apply(int).apply(str),
        user=lambda frame: frame['user'].apply(str),
    )
)
print_df(stops.head())

In [None]:
# Keep stops that start on 2014-01-01 (end bound exclusive) and belong to
# users '0', '1' or '2'; copy so later edits do not touch `stops`.
oneday = stops.loc[
    stops['start'].ge(pd.to_datetime('2014-01-01'))
    & stops['start'].lt(pd.to_datetime('2014-01-02'))
    & stops['user'].isin(['0', '1', '2'])
].copy()
print_df(oneday.head())

In [None]:
# Render a short Markdown summary of the one-day subset, including its shape.
# The trailing backslash on the first line is a continuation *inside* the
# string literal, so the leading spaces of the next line are part of the
# rendered text.
display(Markdown("<font color=green> <font size=4>'oneday' dataframe has 3 people's information which has the user labels of 0, 1 and 2.\
                 \nShape of this dataframe is {}.".format(oneday.shape)))

In [None]:
# One [lat, lon] pair per stop feeds the heat layer.
loc_data = oneday[['lat', 'lon']].values.tolist()

map_hooray = folium.Map(
    location=[55.636413, 11.298542],
    zoom_start=3,
    tiles='Stamen Toner',
)
HeatMap(loc_data, radius=20, max_zoom=30).add_to(map_hooray)
map_hooray

In [None]:
# Median latitude / longitude per label, computed in a single groupby.
# NOTE: the 'lot_median' spelling is kept on purpose -- a later cell renames
# that exact column name back to 'lon'.
oneday_median = (
    oneday.groupby(by=['label'])[['lat', 'lon']]
    .median()
    .reset_index()
    .rename(columns={'lat': 'lat_median', 'lon': 'lot_median'})
)
print_df(oneday_median.head())

In [None]:
def percent_missing(df):
    """Summarise the share of null values in each column of ``df``.

    Returns a new frame with the columns ``column_name`` and
    ``percent_missing`` (0-100), ordered by ascending ``percent_missing``
    and carrying a fresh 0..n-1 index.
    """
    null_share = df.isnull().sum() * 100 / len(df)
    summary = pd.DataFrame({'column_name': df.columns,
                            'percent_missing': null_share})
    return summary.sort_values('percent_missing').reset_index(drop=True)

missing_df_median = percent_missing(oneday_median)
missing_df_median

In [None]:
print('{} rows dropped after finding median latitudes and longitudes for each label.'.format(oneday.shape[0]-oneday_median.shape[0]))

In [None]:
# Attach the per-label medians to every stop, then order chronologically
# (ties broken by label) with a clean 0..n-1 index.
oneday_real = (
    oneday.merge(oneday_median, how='left', on='label')
    .sort_values(by=['start', 'end', 'label'])
    .reset_index(drop=True)
)
print_df(oneday_real.head())

<font color = green>
<font size = 4>

- Now, median coordinates are found for each label.
- Only the median coordinates will be kept; the original per-stop coordinates will be dropped.

In [None]:
# Replace the raw coordinates with the per-label medians and add a
# "lat, lon" text column.
oneday_real = (
    oneday_real
    .drop(columns=['lat', 'lon'])
    .rename(columns={'lat_median': 'lat', 'lot_median': 'lon'})
    .assign(geo=lambda frame: frame["lat"].map(str) + ", " + frame["lon"].map(str))
)
print_df(oneday_real.head())

In [None]:
%who DataFrame

In [None]:
# Rebind `oneday_median` to a copy of the merged frame and drop the
# intermediates from the namespace.
# NOTE(review): this cell is not re-runnable -- `oneday_real` is deleted on
# the last line, so a second execution raises NameError on the `.copy()` call.
del oneday_median
oneday_median = oneday_real.copy()
del missing_df_median, oneday, oneday_real

In [None]:
missing_df_median = percent_missing(oneday_median)
missing_df_median

In [None]:
del missing_df_median
print_df(oneday_median.head())

In [None]:
# Bounding box spanned by the median stop coordinates.
south, north = oneday_median['lat'].agg(['min', 'max'])
west, east = oneday_median['lon'].agg(['min', 'max'])

# OSM features to download: every amenity, retail/commercial land use,
# and bus stops.
tags = dict(
    amenity=True,
    landuse=['retail', 'commercial'],
    highway='bus_stop',
)

gdf = ox.geometries_from_bbox(north=north, south=south, east=east, west=west, tags=tags)
print(gdf.shape)

# Preview the banks, showing only columns that are fully populated for them.
print_df(gdf.loc[gdf['amenity'] == 'bank'].dropna(axis=1, how='any').head())

In [None]:
ax = gdf.plot()

### According to GeoPandas [link here](https://geopandas.org/gallery/create_geopandas_from_pandas.html), __*longitude*__ is __*X*__ and __*latitude*__ is __*Y*__.

In [None]:
# Stops become point geometries: x = longitude, y = latitude.
oneday_gdf = gpd.GeoDataFrame(
    oneday_median,
    geometry=gpd.points_from_xy(x=oneday_median['lon'], y=oneday_median['lat']),
)

# Reduce every OSM feature to its centroid and expose the centroid's
# coordinates as plain columns.
gdf['center_point'] = gdf['geometry'].map(lambda geom: geom.centroid)
gdf['gdf_lon'] = gdf['center_point'].map(lambda pt: pt.x)
gdf['gdf_lat'] = gdf['center_point'].map(lambda pt: pt.y)

print_df(gdf[['geometry', 'center_point', 'gdf_lat', 'gdf_lon']].head())

<font size=4>
<font color=green>

- My haversine-related test functions are as follows:

In [None]:
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        The distance in meters. The same value (and the central angle in
        radians) is also printed, as before; previously the function
        returned ``None``, so ``print(haversine(...))`` ended with "None".
    """
    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    r = 6371000  # Radius of earth in meters. Use 3956 for miles
    distance_m = c * r
    print('Result in radians:', c, '\n')
    print('Result in m:', distance_m, '\n')
    return distance_m

print(haversine(11.98057675, 56.00953381, 12.5777652, 55.6180708))

from sklearn.metrics.pairwise import haversine_distances
from math import radians

def haversine2(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points via scikit-learn's
    ``haversine_distances`` (cross-check for ``haversine``).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        The distance between the two points in meters. The full 2x2
        pairwise matrix (radians and meters) is still printed; previously
        the function returned ``None``.
    """
    # haversine_distances is fed [latitude, longitude] pairs in radians.
    row0_in_radians = [radians(lat1), radians(lon1)]
    row1_in_radians = [radians(lat2), radians(lon2)]
    result = haversine_distances([row0_in_radians, row1_in_radians])
    print('Result in radians:\n',result)
    print('Result in m:\n',result * 6371000)
    # Off-diagonal entry = distance between the first and the second point.
    return float(result[0, 1]) * 6371000
    
print(haversine2(11.98057675, 56.00953381, 12.5777652, 55.6180708))

In [None]:
from sklearn.neighbors import BallTree
from shapely.geometry import Point
import functools
import operator
import numpy as np
from math import radians


def ball_distance(df, gdf, distance_to_point_meters):
    """
    For every point in ``df`` find the closest ``gdf`` centroid that lies
    within ``distance_to_point_meters`` (great-circle distance).

    Parameters
    ----------
    df : frame with a ``geometry`` column of point objects; ``.y`` / ``.x``
        are read as latitude / longitude in degrees.
    gdf : frame with a ``center_point`` column of point objects, read the
        same way, and a ``geometry`` column that is left out of the result.
    distance_to_point_meters : search radius in meters.

    Returns
    -------
    gdf_final : ``df`` (index reset) side by side with the non-``geometry``
        columns of the matched ``gdf`` row, plus ``dist`` (radians) and
        ``dist_mt`` (meters). Rows of ``df`` without any neighbour inside
        the radius are kept and carry missing values in those columns.
    dist : list with one entry per row of ``df`` -- distance in radians to
        the match, or NaN when there is none.
    idx : list with one entry per row of ``df`` -- position of the match in
        ``gdf``, or NaN when there is none.

    Notes
    -----
    The previous version dropped the NaN entries from ``idx`` / ``dist``
    before concatenating, which shifted every later match onto the wrong
    ``df`` row as soon as one row had no neighbour. It also looked the
    matches up with ``gdf.loc``, although the tree returns row positions,
    not index labels.
    """
    earth_radius = 6371000  # meters
    radian_radius = float(distance_to_point_meters / earth_radius)

    # The haversine metric works on [latitude, longitude] pairs in radians.
    array_GDF_rad = np.radians(
        np.array([[geo.y, geo.x] for geo in gdf.center_point], dtype=float).reshape(-1, 2)
    )
    array_DF_rad = np.radians(
        np.array([[geo.y, geo.x] for geo in df.geometry], dtype=float).reshape(-1, 2)
    )

    # The tree is built on the OSM centroids and queried with the user
    # points. leaf_size is left at its default: it only trades speed for
    # memory and has no influence on the query result.
    btree = BallTree(array_GDF_rad, metric='haversine')
    idx_per_row, dist_per_row = btree.query_radius(
        X=array_DF_rad, r=radian_radius, return_distance=True, sort_results=True
    )

    # sort_results=True puts the nearest hit first; rows without any hit
    # keep a NaN placeholder so the lists stay aligned with df.
    idx = [int(hits[0]) if len(hits) else np.nan for hits in idx_per_row]
    dist = [float(hits[0]) if len(hits) else np.nan for hits in dist_per_row]
    dist_mt = [dt_r * earth_radius for dt_r in dist]

    # Positional lookup of the matched rows, re-labelled with the df row
    # each one belongs to, then expanded to one row per df row.
    matched_rows = [row for row, hit in enumerate(idx) if hit == hit]  # NaN != NaN
    matched_positions = [idx[row] for row in matched_rows]
    neighbours = gdf.loc[:, gdf.columns != 'geometry'].iloc[matched_positions].copy()
    neighbours.index = matched_rows
    neighbours = neighbours.reindex(range(len(df)))

    gdf_final = pd.concat(
        [
            df.reset_index(drop=True),
            neighbours,
            pd.Series(dist, name='dist'),
            pd.Series(dist_mt, name='dist_mt'),
        ],
        axis=1,
    )

    return gdf_final, dist, idx

distance_gdf, distances, indices = ball_distance(oneday_gdf, gdf, 100)  
imp_params = ['label','start','end','user','lat','lon','geo','geometry', 'center_point','gdf_lat','gdf_lon','dist','dist_mt']
print_df(distance_gdf[imp_params].head())

### We can also check the distance between two locations with [this link](https://www.geodatasource.com/distance-calculator).

In [None]:
lat1, lon1, lat2, lon2 = distance_gdf.loc[0, 'lat'], distance_gdf.loc[0, 'lon'], distance_gdf.loc[0, 'gdf_lat'], distance_gdf.loc[0, 'gdf_lon']
print('For the lat:{} and lon:{} from user df:\nMatching with osmnx lat:{} and lon:{} nearest:\n'.format(lat1, lon1, lat2, lon2))
print(haversine(lat1, lon1, lat2, lon2))
print(haversine2(lat2, lon2, lat1, lon1))

<font size=5>
<font color=green>

-  The __*'ball_distance'*__ function doesn't work because of the __*'r'*__ parameter in the __*'query_radius'*__ method.

In [None]:
import functools
import operator

def ball_nearest(df, gdf):
    """
    For every point in ``df`` find the single nearest ``gdf`` centroid
    (great-circle distance), without any radius limit.

    Parameters
    ----------
    df : frame with a ``geometry`` column of point objects; ``.y`` / ``.x``
        are read as latitude / longitude in degrees.
    gdf : frame with a ``center_point`` column of point objects, read the
        same way, and a ``geometry`` column that is left out of the result.

    Returns
    -------
    gdf_final : ``df`` (index reset) side by side with the non-``geometry``
        columns of its nearest ``gdf`` row, plus ``dist`` (radians) and
        ``dist_mt`` (meters).
    dist : list of distances in radians, one per row of ``df``.
    idx : list of positions of the nearest rows in ``gdf``, one per row of
        ``df``.

    Notes
    -----
    The previous version concatenated the global ``oneday_gdf`` instead of
    the ``df`` argument and looked the neighbours up with ``gdf.loc``,
    although the tree returns row positions, not index labels.
    """
    earth_radius = 6371000  # meters

    # The haversine metric works on [latitude, longitude] pairs in radians.
    array_DF_rad = np.radians(
        np.array([[geo.y, geo.x] for geo in df.geometry], dtype=float).reshape(-1, 2)
    )
    array_GDF_rad = np.radians(
        np.array([[geo.y, geo.x] for geo in gdf.center_point], dtype=float).reshape(-1, 2)
    )

    # leaf_size is left at its default: it only trades speed for memory and
    # has no influence on the query result.
    btree = BallTree(array_GDF_rad, metric='haversine')
    # Query will be conducted for the 'User' data.
    dist, idx = btree.query(array_DF_rad, k=1)
    # k=1 yields one column per array; flatten to plain Python lists.
    idx = idx.ravel().tolist()
    dist = dist.ravel().tolist()
    dist_mt = [dt_r * earth_radius for dt_r in dist]

    gdf_final = pd.concat(
        [
            df.reset_index(drop=True),
            gdf.loc[:, gdf.columns != 'geometry'].iloc[idx].reset_index(drop=True),
            pd.Series(dist, name='dist'),
            pd.Series(dist_mt, name='dist_mt'),
        ],
        axis=1,
    )

    return gdf_final, dist, idx

nearest_gdf, distances, indices = ball_nearest(oneday_gdf, gdf)
print_df(nearest_gdf[imp_params].head())

In [None]:
lat1, lon1, lat2, lon2 = nearest_gdf.loc[0, 'lat'], nearest_gdf.loc[0, 'lon'], nearest_gdf.loc[0, 'gdf_lat'], nearest_gdf.loc[0, 'gdf_lon']
print('For the lat:{} and lon:{} from user df:\nMatching with osmnx lat:{} and lon:{} nearest:\n'.format(lat1, lon1, lat2, lon2))
print(haversine(lat1, lon1, lat2, lon2))
print(haversine2(lat2, lon2, lat1, lon1))

<font size=5>
<font color=green>

-  The __*'ball_nearest'*__ function works. If we want to restrict matches to a radius, the data frame can be filtered by distance either inside the function or afterwards, outside the function.

## Example from the stackoverflow link. 
### Exact example doesn't work but I fixed it in the next cell.

<font size=4>

    
[Click for the link](https://stackoverflow.com/questions/56862277/interpreting-sklearn-haversine-outputs-to-kilometers)

In [None]:
from sklearn.neighbors import BallTree
# Experiment 1: radius query with a 10 m search radius.
earth_radius = 6371000 # meters in earth
test_radius = 10 # meters
# The haversine metric works in radians, so the radius is divided by the earth radius.
radian_radius = float(test_radius/earth_radius)
test_points = [[32.027240,41.981876],[-81.093190,-87.969982]]
# Degrees -> radians for both coordinates of each test point.
test_points_rad = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in test_points]

# The tree holds only the first test point; the second one is the query.
tree = BallTree(np.array([test_points_rad[0]]), metric = 'haversine')
# NOTE(review): per the scikit-learn docs query_radius(return_distance=True)
# yields (indices, distances) in that order -- confirm for the installed version.
index__, dis__ = tree.query_radius([test_points_rad[1]], r=radian_radius, return_distance  = True)
print(index__)
print(dis__)

In [None]:
from sklearn.neighbors import BallTree
# Experiment 2: same two points, with a radius 0.2 m larger than the one
# used in the following cell (14172980.6) -- presumably just above the
# great-circle distance between the points; TODO confirm.
earth_radius = 6371000 # meters in earth
test_radius = 14172980.8 # meters

test_points = [[32.027240,41.981876],[-81.093190,-87.969982]]
# Degrees -> radians for both coordinates of each test point.
test_points_rad = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in test_points]

# The tree holds only the first test point; the second one is the query.
tree = BallTree(np.array([test_points_rad[0]]), metric = 'haversine')
# The radius is converted from meters to radians inline.
index__, dis__ = tree.query_radius([test_points_rad[1]], r=test_radius/earth_radius, return_distance  = True)
print(index__)
print(dis__)

In [None]:
from sklearn.neighbors import BallTree
# Experiment 3: same two points, with a radius 0.2 m smaller than the one
# used in the previous cell (14172980.8) -- presumably just below the
# great-circle distance between the points; TODO confirm.
earth_radius = 6371000 # meters in earth
test_radius = 14172980.6 # meters

test_points = [[32.027240,41.981876],[-81.093190,-87.969982]]
# Degrees -> radians for both coordinates of each test point.
test_points_rad = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in test_points]

# The tree holds only the first test point; the second one is the query.
tree = BallTree(np.array([test_points_rad[0]]), metric = 'haversine')
# The radius is converted from meters to radians inline.
index__, dis__ = tree.query_radius([test_points_rad[1]], r=test_radius/earth_radius, return_distance  = True)
print(index__)
print(dis__)

<font size=4>
<font color=green>

-  The __*'query_radius'*__ method didn't work in our case even though all the steps are correct. The reason is that it somehow pushes the tree into making wrong decisions.

<font color = green>
<font size = 5>

__*USE THIS TO RE-PRODUCE IPYNB WITHOUT OUTPUT*__
    
<font size=3>
<font color = blue>
jupyter nbconvert my_input_notebook.ipynb --to notebook --ClearOutputPreprocessor.enabled=True --stdout > my_output_notebook.ipynb