In [1]:
import folium
import pickle
import pandas as pd
import numpy as np
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load ./data/data.p if it comes from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open('./data/data.p', 'rb') as data_file:
    df = pickle.load(data_file)

# Derived columns used by the grouping and encounter logic below.
# Assumes df['time'] holds datetime-like objects exposing .hour and .timestamp().
df['hour'] = df['time'].apply(lambda d: d.hour)
df['timestamp'] = df['time'].apply(lambda d: d.timestamp())


def get_enconters(data,distance_threshold=5,n_rows_threshold = 4,time_threshold=10*60,epsilon = 1e-3):
    """Summarise runs of temporally close rows as encounters.

    The rows are ordered by ``timestamp`` and cut into runs wherever two
    neighbouring rows are ``time_threshold`` or more apart. Every run with
    strictly more than ``n_rows_threshold`` rows is reduced to one summary row,
    which is kept only if its weighted mean distance is at most
    ``distance_threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``timestamp``, ``distance_estimate``, ``RSSI``,
        ``latitude``, ``longitude``, ``user``, ``participant_identifier`` and
        ``hour``. The frame is not modified.
    distance_threshold : float
        Largest weighted mean distance for a run to count as an encounter.
    n_rows_threshold : int
        Frames with fewer rows than this yield no encounters; a run needs more
        rows than this to be summarised.
    time_threshold : float
        Gap (in the units of ``timestamp``) at which a run is split.
    epsilon : float
        Small offset that keeps both weight terms strictly positive.

    Returns
    -------
    pandas.DataFrame
        One row per encounter with the columns ``owner``,
        ``participant_identifier``, ``start_time``, ``end_time``,
        ``mean_distance``, ``mean_latitude``, ``mean_longitude`` and ``hour``.
        ``owner``, ``participant_identifier`` and ``hour`` are taken from the
        first row of the run.
    """
    columns = ['owner','participant_identifier','start_time','end_time','mean_distance',
               'mean_latitude','mean_longitude','hour']
    if data.shape[0] < n_rows_threshold:
        return pd.DataFrame([], columns=columns)

    data = data.sort_values('timestamp').reset_index(drop=True)
    timestamps = data['timestamp'].values
    n_rows = data.shape[0]

    encounters = []
    run_start = 0
    # i is the exclusive end of the candidate run; letting it reach n_rows makes
    # sure the last run is evaluated as well instead of being silently dropped.
    for i in range(1, n_rows + 1):
        if i < n_rows and timestamps[i] - timestamps[i - 1] < time_threshold:
            continue  # row i still belongs to the current run

        if i - run_start > n_rows_threshold:
            # Positional slice: rows run_start .. i-1 only, so the first row of
            # the following run does not leak into these statistics.
            run = data.iloc[run_start:i]
            distances = run['distance_estimate'].values
            rssi = run['RSSI'].values
            # Each weight is the inverse of (RSSI offset + 100 * distance offset).
            weights1 = rssi + np.abs(np.min(rssi)) + epsilon
            weights2 = 100 * (distances - np.min(distances)) + epsilon
            weights = 1 / (weights1 + weights2)
            mean_distance = np.average(distances, weights=weights)
            if mean_distance <= distance_threshold:
                first = run.iloc[0]
                encounters.append([first['user'], first['participant_identifier'],
                                   timestamps[run_start], timestamps[i - 1], mean_distance,
                                   np.mean(run['latitude'].values),
                                   np.mean(run['longitude'].values), first['hour']])
        run_start = i

    return pd.DataFrame(encounters, columns=columns)

def generateBaseMap(default_location=[35.14953, -90.04898], default_zoom_start=12):
    """Build a fresh folium map to which layers can be added.

    :param default_location: coordinates passed to ``folium.Map`` as ``location``.
    :param default_zoom_start: value passed to ``folium.Map`` as ``zoom_start``.
    :return: the new ``folium.Map``, created with ``control_scale=True``.
    """
    map_options = dict(
        location=default_location,
        control_scale=True,
        zoom_start=default_zoom_start,
    )
    return folium.Map(**map_options)

# Run the encounter extraction once per (user, participant_identifier, hour) group.
data_parsed = (
    df
    .groupby(by=['user', 'participant_identifier', 'hour'], as_index=False)
    .apply(get_enconters)
)

In [2]:
from folium.plugins import HeatMap,MarkerCluster
# A constant 'count' column lets a group-by sum act as a per-coordinate tally.
data_parsed['count'] = 1
df_copy = df.copy()
df_copy['count'] = 1

base_map = generateBaseMap()

# Alternative kept for reference: heat map of the raw readings instead of the encounters.
# HeatMap(data=df_copy[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(base_map)

# Heat map of encounters: one [latitude, longitude, count] entry per distinct position.
HeatMap(
    data=(
        data_parsed[['mean_latitude', 'mean_longitude', 'count']]
        .groupby(['mean_latitude', 'mean_longitude'])
        .sum()
        .reset_index()
        .values
        .tolist()
    ),
    radius=8,
    max_zoom=13,
).add_to(base_map)

<folium.plugins.heat_map.HeatMap at 0x24ef3de8b88>

In [3]:
# Render the map; the heat-map layer was attached to it in the previous cell.
base_map

In [4]:
# Build the frames for the animated heat map: for every hour, one list of
# [latitude, longitude, count] entries from the raw readings and one from the
# extracted encounters.
# NOTE(review): two frames are appended per hour, so the animation has twice as
# many steps as there are hours and alternates raw/encounter data - confirm that
# this is intended rather than a leftover of switching between the two sources.
df_hour_list = []
for hour in df_copy.hour.sort_values().unique():
    raw_points = (
        df_copy.loc[df_copy.hour == hour, ['latitude', 'longitude', 'count']]
        .groupby(['latitude', 'longitude'])
        .sum()
        .reset_index()
        .values
        .tolist()
    )
    # 'count' has to be selected here as well; without it the sum has nothing to
    # aggregate and the encounter frame carries no weights.
    encounter_points = (
        data_parsed.loc[data_parsed.hour == hour, ['mean_latitude', 'mean_longitude', 'count']]
        .groupby(['mean_latitude', 'mean_longitude'])
        .sum()
        .reset_index()
        .values
        .tolist()
    )
    df_hour_list.append(raw_points)
    df_hour_list.append(encounter_points)

In [5]:
from folium.plugins import HeatMapWithTime
# Start from a fresh, slightly zoomed-out map and animate the frames collected in df_hour_list.
base_map = generateBaseMap(default_zoom_start=11)
HeatMapWithTime(
    df_hour_list,
    radius=10,
    gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'},
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=True,
).add_to(base_map)
base_map

In [None]:
# Quick sanity check: (rows, columns) of the encounter table.
data_parsed.shape

In [None]:
import numpy as np
import pandas as pd
from geopy.distance import great_circle
from shapely.geometry.multipoint import MultiPoint
from sklearn.cluster import DBSCAN, MeanShift, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import OPTICS
# from sklearn.cluster import OPTICS

# Multiplied with KM_PER_RADIAN in gps_clustered to form the divisor of
# GEO_FENCE_DISTANCE (presumably a metres-to-kilometres factor - TODO confirm).
EPSILON_CONSTANT = 1000
# NOTE(review): LATITUDE, LONGITUDE, ACCURACY and GPS_ACCURACY_THRESHOLD are not
# referenced anywhere in the visible code; they look like column positions and
# an accuracy cut-off - confirm whether they are still needed.
LATITUDE = 0
LONGITUDE = 1
ACCURACY = -1
GPS_ACCURACY_THRESHOLD = 41.0
# Used in gps_clustered to turn the geo-fence distance into the haversine `eps`
# (value matches the mean Earth radius in kilometres).
KM_PER_RADIAN = 6371.0088
# Neighbourhood radius for DBSCAN before conversion (presumably metres - TODO confirm).
GEO_FENCE_DISTANCE = 50
# Passed to DBSCAN as `min_samples`.
MINIMUM_POINTS_IN_CLUSTER = 10


def get_centermost_point(cluster: object) -> tuple:
    """Return the member of ``cluster`` that lies closest to the cluster centroid.

    :param cluster: non-empty sequence of coordinate pairs; each pair is handed
        to ``MultiPoint`` and to ``great_circle`` unchanged.
    :return: the selected coordinate pair as a tuple.
    :rtype: tuple
    """
    # Build the geometry once; the centroid's x/y follow the order of the input pairs.
    centroid_geometry = MultiPoint(cluster).centroid
    centroid = (centroid_geometry.x, centroid_geometry.y)
    # Choose the actual data point with the smallest great-circle distance to the centroid.
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

def gps_clustered(data, min_rows=45):
    """Cluster encounter positions with DBSCAN and attach a centroid per cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``mean_latitude`` and ``mean_longitude``. The frame is
        left untouched; all new columns are written to a copy.
    min_rows : int
        Frames with fewer rows are not clustered and produce an empty result.

    Returns
    -------
    pandas.DataFrame
        The input rows (with a fresh 0..n-1 index) plus ``labels`` (the DBSCAN
        label of each row), ``centroid_latitude`` and ``centroid_longitude``
        (the cluster member chosen by ``get_centermost_point``). Rows that
        DBSCAN marks as noise (label ``-1``) belong to no cluster, so their
        centroid columns are NaN. If ``data`` has fewer than ``min_rows`` rows
        an empty frame with the same columns is returned.
    """
    output_columns = list(data.columns) + ['labels', 'centroid_latitude', 'centroid_longitude']
    if data.shape[0] < min_rows:
        return pd.DataFrame([], columns=output_columns)

    coords = np.float64(data[['mean_latitude', 'mean_longitude']].values)

    # The haversine metric works on radians, so the geo-fence distance is
    # divided by EPSILON_CONSTANT * KM_PER_RADIAN to obtain `eps`.
    epsilon = GEO_FENCE_DISTANCE / (EPSILON_CONSTANT * KM_PER_RADIAN)
    db = DBSCAN(eps=epsilon, min_samples=MINIMUM_POINTS_IN_CLUSTER,
                algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    labels = db.labels_

    # One representative point per real cluster; the noise label -1 is skipped
    # because those points do not form a cluster.
    centroid_by_label = {
        label: get_centermost_point(coords[labels == label])
        for label in np.unique(labels)
        if label != -1
    }
    no_centroid = (np.nan, np.nan)
    centroids = [centroid_by_label.get(label, no_centroid) for label in labels]

    # Work on a copy so the caller's frame does not silently gain columns.
    clustered = data.copy().reset_index(drop=True)
    clustered['labels'] = labels
    clustered['centroid_latitude'] = [centroid[0] for centroid in centroids]
    clustered['centroid_longitude'] = [centroid[1] for centroid in centroids]
    return clustered

In [None]:
!conda install shapely