# Curate_V2_GPSDerived_Clusters_Data

In [4]:
import datetime as dt
import itertools
import numpy as np
import pandas as pd
import subprocess

import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
syn = synapseclient.Synapse()
syn.login()

from synapseclient import Activity, File, Schema, Table, as_table_columns
from tabulate import tabulate
from tqdm import tqdm

Welcome, Abhishek Pratap!



In [None]:
from plotly import tools
import plotly.graph_objs as go
import plotly.figure_factory as figf
import plotly.io as pio
from plotly.offline import init_notebook_mode, iplot


# get the latest source files from the GSCAP repo
try:
    t = subprocess.call(['./get_gscap_source.sh']); del t
except:
    p = subprocess.Popen(
        ["powershell.exe", "./get_gscap_source.ps1"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
import gps
import utils

init_notebook_mode()

InteractiveShell.ast_node_interactivity = 'all'

tqdm.pandas()

In [6]:
sid = 'syn15667781'
df = pd.read_csv(syn.get(sid).path).rename(columns={
    'username':'participant_id',
})
df.head()

Unnamed: 0,participant_id,cid,name,lat,lon,categories,max_duration,min_duration,mean_duration,std_duration,total_duration,times_entered,mean_ti_between_visits
0,EN00590,home,home,41.66319,-87.47607,home,5.683,0.0,2.267,2.027,342.322,151,13.417
1,EN00590,work,work,41.46633,-87.30726,work,7.633,0.0,2.151,2.36,77.45,36,56.281
2,EN00590,x0,not found,41.65877,-87.46267,none,2.117,0.0,0.426,0.563,39.167,92,22.078
3,EN00590,x1,not found,41.66083,-87.46868,none,2.25,0.0,0.09,0.45,2.25,25,70.095
4,EN00590,x10,not found,41.6026,-87.27194,none,0.017,0.0,0.008,0.012,0.017,2,3.708


### Send through Google/Yelp for context

We only want to spend money finding the clusters with semantic context, this excludes the home and work locations. To create a set of those we first split the set of clusters by these values.

In [None]:
home_and_work = df.loc[
    (df.cid == 'home') |
    (df.cid == 'work')
].copy()

df_ = df.loc[
    ~((df.cid == 'home') |
    (df.cid == 'work'))
].copy()
assert len(home_and_work) + len(df_) == len(df)

del df
df = df_
del df_

print(f'{len(df)} calls will be made per API.')

The GSCAP scripts require API requests to be in a specific datatype, a PlaceRequest. So, we create a list of PlaceRequests to process. This will change in future versions to not require the specific datatype.

In [None]:
# setup the API requests to Google
gmap_requests = [
    gps.PlaceRequest(
        lat=r.lat, lon=r.lon, radius=50, 
        source=gps.ApiSource.GMAPS, 
        rankby=gps.GmapsRankBy.PROMINENCE
    )
    for r in df.itertuples()
]

# setup the API requests to Yelp
yelp_requests = [
    gps.PlaceRequest(
        lat=r.lat, lon=r.lon, radius=50, 
        source=gps.ApiSource.YELP, 
        rankby=gps.YelpRankBy.BEST_MATCH
    )
    for r in df.itertuples()
]

# setup a progress bar
pbar, qu = utils.progress_bar(tqdm(total=2*len(df)))

# process each series of requests
gmap_results = gps.request_nearby_places(gmap_requests, n_jobs=-1, cache_only=True, progress_qu=qu)
yelp_results = gps.request_nearby_places(yelp_requests, n_jobs=-1, cache_only=True, progress_qu=qu)

# terminate, join and print cache status to maintain API awareness
pbar.terminate(); pbar.join(); del pbar, qu
print(f'hits: {gmap_results["hits"]}, misses: {gmap_results["misses"]}')
print(f'hits: {yelp_results["hits"]}, misses: {yelp_results["misses"]}')

Combine the results into a single data frame for comparison.

In [None]:
# extract the results
gr = gmap_results['request']
yr = yelp_results['request']

# generate category value counts
grvc = gr.major_categories.value_counts()
yvc = yr.major_categories.value_counts()

# create an index based on the category for outer joins
idx = (set(grvc.index).union(set(yvc.index)))

# create the data frame and join with the Google results
results = pd.DataFrame(index=idx).join(grvc, how='outer')
results = results.rename(columns={'major_categories':'Google'})

# join with the Yelp results
results = results.join(yvc, how='outer')
results = results.rename(columns={'major_categories':'Yelp'})

# replace all na values with zero
results = results.fillna(0).sort_index()

In [None]:
print(f'Google could not identify {int(results.loc["none"].Google)} of {len(df)} locations')
print(f'Yelp could not identify {int(results.loc["none"].Yelp)} of {len(df)} locations')

m = np.max([np.max(results.Google), np.max(results.Yelp)])

cols = [i for i in results.index if i != 'none']
iplot(go.Figure(
    [go.Bar(name=c, x=cols, y=results.loc[cols, c]) for c in results.columns],
    go.Layout(
        title=f'Google Places returns more uniformly distributed results',
        yaxis=dict(title='Number of Results'),
        xaxis=dict(title='Category')
    )
))

Of note is how tuned Yelp's dataset is for places within the category of dining_out. Google is aware of 'every' type while Yelp is much more focused on 'activities'. For this reason, I chose to prioritize the Google results as they're much more evenly distributed across the categories. Although Google returned many more places, there are still some that Yelp found but Google did not. For those, I take whatever results Yelp provided. 

In [None]:
# merge and drop columns
df_ = gr.merge(yr, on=['lat', 'lon', 'radius'])\
    .drop(columns=['dtRetrieved_x', 'dtRetrieved_y', 'radius'])

# create a new column, taking first whatever categories and name Google provided
df_['categories'] = df_.major_categories_x
df_['name'] = df_.name_x
df_['source'] = df_.source_x

# for those where Google did not find anything, assign the Yelp results
xmask = (df_.name_x == 'not found') | (df_.name_x == 'not found in cache')
ymask = (df_.name_y != 'not found') & (df_.name_y != 'not found in cache')

df_.loc[xmask & ymask, 'name']       = df_.loc[xmask & ymask, 'name_y']
df_.loc[xmask & ymask, 'categories'] = df_.loc[xmask & ymask, 'major_categories_y']
df_.loc[xmask & ymask, 'source']     = df_.loc[xmask & ymask, 'source_y']

# take only the required columns
df_ = df_.reindex(columns=['lat', 'lon', 'name', 'categories'])

# # drop any duplicates
df_ = df_.loc[[not b for b in df.duplicated(subset=['lat', 'lon'])]]

# drop the name and categories columns because we'll rejoin it with the looked up values
df = df.drop(columns=['name', 'categories'], errors='ignore')

# left join with the main dataframe
df = df.merge(df_, on=['lat', 'lon'], how='left')

# resort the columns excluding any pii
prefix = ['participant_id', 'cid', 'categories']
pii = ['lat', 'lon', 'name']
to_keep = prefix+sorted(list(set(df.columns)-set(prefix)-set(pii)))
df = df.reindex(columns=to_keep)

del df_

Recombine our results with the home and work clusters.

In [None]:
# drop the pii columns
home_and_work = home_and_work.reindex(columns=to_keep)

# concatenate the dataframes
df = pd.concat([home_and_work, df], axis=0).sort_values(by=['participant_id', 'cid'])

del home_and_work

### Set provenance and upload to Synapse

In [None]:
t = syn.delete(
    syn.tableQuery('select * from syn17023313')
)

In [None]:
final = syn.store(Table(
    Schema(
            name='Passive Cluster Entries Brighten V2',
            columns=as_table_columns(df), 
            parent='syn10848316'),
        df
    )
)

In [5]:
final = syn.setProvenance(
    'syn17116695',
    activity=Activity(
        name='Prepare clusters for public release',
        description='Query Google and Yelp APIs for location information, remove PII, and upload as table',
        used=[sid],
        executed=[
            dict(
                name='Curate_V2_GPSDerived_Clusters_Data',
                url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_V2_GPSDerived_Clusters_Data.ipynb'
            )
        ]
    )
)

NameError: name 'sid' is not defined

### Create data dictionary

In [None]:
cheat = []
for c in df.columns:
    cheat.append(
        (c, str(df[c].dtype)\
             .replace('object', 'str')\
             .replace('float64', 'float')\
             .replace('int64', 'int')\
             .replace('datetime64[ns]', 'DateTime')
        )
    )
    
headers = ['#', 'Variable Name', 'Data Type', 'Description']

descriptions = [
    'Unique ID',
    'Unique ID of participant\'s identified location',
    'Semantic category of location',
    'Maximum time in hours spent at location',
    'Mean time in hours spent at location',
    'Mean time in hours between visits to location',
    'Minimum time in hours spent at location',
    'Standard deviation of time in hours spent at location',
    'Number of times the research participant entered the cluster',
    'Total time in hours the research participant spent in the cluster'
]

cheat = pd.DataFrame(
    cheat, 
    columns=headers[1:-1],
    index=list(range(1, len(cheat)+1))
)
cheat['Description'] = descriptions

print(tabulate(
    cheat,
    headers=headers,
    tablefmt='orgtbl'
).replace('+', '|'))