In [35]:
import bamboolib as bam
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from tqdm.notebook import tqdm
tqdm.pandas()

from gps_utils import *
# Parameters
# Passed to DBSCAN as `min_samples` in the clustering cell below.
MIN_PTS = 3

# Clustering + merging data points based on clusters

In [36]:
# Load the cleaned GPS table and build a lookup from each minute_id to its
# positional row number (a repeated minute_id keeps its last position).
gps = pd.read_csv(r'files/test/cleaned_gps.csv', sep=',', decimal='.')
minute_id_to_index = {}
for position, minute_id in enumerate(gps['minute_id'].values.tolist()):
    minute_id_to_index[minute_id] = position

## Cluster the rest

In [37]:
# DBSCAN neighbourhood radius for the haversine metric, expressed in radians:
# 0.05 km (50 m) divided by the Earth's radius in km (0.01 km would be 10 m).
eps = 0.05 / 6371
# One tuple per cluster run: (first minute_id, last minute_id, all minute_ids, label).
# NOTE(review): labels are built as f'{DAY}_{label}', so they are only unique
# within a single month — revisit before adding more entries to MONTHS.
cluster_to_stop = []
MONTHS = ["201901"]
DAYS = [f"{i+1:0>2}" for i in range(31)]
for MONTH in tqdm(MONTHS):
    for DAY in DAYS:
        # Filter by day and keep only rows that have a position. The explicit
        # copy makes the column assignments below act on an independent frame
        # instead of a slice of `gps` (avoids SettingWithCopyWarning).
        gps_drop = gps.loc[gps['minute_id'].str.startswith(MONTH + DAY, na=False)]
        gps_drop = gps_drop.dropna(subset=['latitude']).copy()
        # Remove special places
        # gps_drop = gps_drop.loc[(gps_drop['special_place'].isna())]

        # Days without any usable row (e.g. a day number the month does not
        # have) cannot be clustered: DBSCAN.fit rejects empty input.
        if gps_drop.empty:
            continue

        # Clustering
        clustering = DBSCAN(eps=eps, min_samples=MIN_PTS, algorithm='ball_tree', metric='haversine')
        clustering.fit(np.radians(gps_drop[['latitude', 'longitude']]))
        gps_drop['cluster_label'] = clustering.labels_

        # Boolean mask of core samples, built by index instead of a per-row
        # membership test against `core_sample_indices_`.
        core_mask = np.zeros(len(gps_drop), dtype=bool)
        core_mask[clustering.core_sample_indices_] = True
        gps_drop['core'] = core_mask

        # Keep core samples only.
        # gps_drop = gps_drop.loc[~(gps_drop['cluster_label'] == -1)]
        gps_drop = gps_drop.loc[gps_drop['core']]
        # print(f"Outliers from {MONTH + DAY}:", clustering.labels_[clustering.labels_ == -1].size)
        if gps_drop.empty:
            # No core sample on this day, hence no cluster to record.
            continue

        # Merge consecutive rows that share a cluster label into one run.
        run_id = (gps_drop['cluster_label'].shift() != gps_drop['cluster_label']).cumsum()
        clusters = gps_drop.groupby(run_id).agg(
            time_duration=('minute_id', time_duration),
            start=('minute_id', 'first'),
            end=('minute_id', 'last'),
            mean_spead=('speed', 'mean'),
            minute_id=('minute_id', list_all),
            label=('cluster_label', 'first'))
        clusters = clusters.reset_index()
        # clusters = clusters.loc[~(clusters['time_duration'] < 3)]

        # Record every run that belongs to a real cluster (label -1 is DBSCAN noise).
        for start, end, minute_ids, label in zip(clusters["start"], clusters["end"],
                                                 clusters["minute_id"], clusters["label"]):
            if label != -1:
                cluster_to_stop.append((start, end, minute_ids, f'{DAY}_{label}'))

  0%|          | 0/1 [00:00<?, ?it/s]

## Classify initial data points as stop/move

In [38]:
def blank_column(value, length):
    """Return a list that holds `value` repeated `length` times."""
    return [value] * length

# Start with every row marked as "move" and without a cluster label.
stop_values = blank_column(False, len(gps))
cluster_label_values = blank_column("", len(gps))

# Every row between a cluster's first and last minute (inclusive, by row
# position) becomes a stop that carries the cluster's label.
for first_minute, last_minute, _minute_ids, label in tqdm(cluster_to_stop):
    lo = minute_id_to_index[first_minute]
    hi = minute_id_to_index[last_minute]
    span = max(hi - lo + 1, 0)
    stop_values[lo:hi + 1] = [True] * span
    cluster_label_values[lo:hi + 1] = [label] * span

gps = gps.assign(
    stop=stop_values,
    cluster_label=cluster_label_values,
)

  0%|          | 0/441 [00:00<?, ?it/s]

# Post Processing

## Smoothing

In [39]:
def smooth(series, window_size=3):
    """Majority-vote smoothing of a sequence of labels.

    Each element is replaced by the most common value inside the window of
    `window_size` elements around it. The sequence is padded on both sides by
    repeating its first / last element so that edge windows are full-sized.
    Ties are resolved in favour of the value counted first, and the element
    itself is always counted first.

    Args:
        series: iterable of hashable labels (list, pandas Series, ...).
        window_size: number of elements in every voting window.

    Returns:
        A list with one smoothed label per input element; `[]` for empty input.
    """
    # Imported here so the function does not depend on a star import
    # happening to expose `Counter`.
    from collections import Counter

    values = list(series)
    if not values:
        # Nothing to pad from: the original indexing would raise IndexError.
        return []

    half = window_size // 2
    padded = [values[0]] * half + values + [values[-1]] * half

    smoothed_series = []
    for i in range(len(values)):
        centre = i + half
        # Centre element first so it wins ties, then its left and right neighbours.
        window = [padded[centre]] + padded[i:centre] + padded[centre + 1:i + window_size]
        assert len(window) == window_size, "Not equal window size"
        smoothed_series.append(Counter(window).most_common(1)[0][0])

    return smoothed_series

In [40]:
# Smooth stop/move label
gps["stop"] = smooth(gps["stop"])
gps["cluster_label"] = smooth(gps["cluster_label"])
# Keep the cluster label only on rows that are still flagged as a stop after
# smoothing; every other row gets an empty label. Vectorised replacement for
# the former row-by-row `progress_apply`.
gps["stop_label"] = gps["cluster_label"].where(gps["stop"].astype(bool), "")

  0%|          | 0/44640 [00:00<?, ?it/s]

## Change to ImageID index and recalculate movement

In [41]:
# Split images into different rows: one row per image id.
image_lists = gps["ImageID"].str.split(",")
gps = gps.assign(ImageID=image_lists).explode("ImageID").reset_index()

# Strip list punctuation and whitespace, then drop rows left without an image id.
cleaned_ids = gps["ImageID"].str.replace(r'(\[|\]|\'|\s)', '', regex=True)
gps["ImageID"] = cleaned_ids.mask(cleaned_ids == "", np.nan)
gps = gps[gps["ImageID"].notna()]

# Merge into one file with the visual concepts
visual = pd.read_csv(r'../../original_data/lsc22_visual_concepts.csv', sep=',', decimal='.')
# start from here
# Right join: every GPS/image row is kept, with visual concepts attached where available.
both = visual.merge(gps, how="right", on='ImageID')
both = both.drop(columns=['Unnamed: 0_x', 'index'])

In [42]:
from vision_utils import *

# Text prompts scored against each image, mapped to the movement label that is
# stored in `both["movement"]`.
moves = {"I am sitting on an airplane": "Airplane",
         "I am in a car": "Car",
         "I am in an airport": "Inside",
         "I am walking outside or on the street": "Walking Outside",
         "I am on public transport": "Public Transport",
         "I am inside a building or a house": "Inside"}

# Assign movement
both["movement"] = [None] * len(both)
both["movement_prob"] = [0] * len(both)
skipped_images = 0  # rows left at the defaults above
for i, row in tqdm(both.iterrows(), total=len(both)):
    image_features = get_stop_embeddings([row["ImageID"]])
    if image_features is None:
        # No embedding for this image (get_checkin further down treats None
        # the same way). Previously this case only surfaced as a RuntimeError
        # from torch.tensor(None).
        skipped_images += 1
        continue
    try:
        image_features = torch.tensor(image_features).cuda().float()
    except RuntimeError:
        # Best effort: an image whose features cannot be turned into a tensor
        # keeps the default movement instead of aborting the whole pass.
        skipped_images += 1
        continue
    movement, prob = movement_mode(list(moves.keys()), image_features)
    both.loc[i, "movement"] = moves[movement]
    both.loc[i, "movement_prob"] = prob

if skipped_images:
    print(f"Movement not assigned for {skipped_images} of {len(both)} images")

  0%|          | 0/42640 [00:00<?, ?it/s]

In [43]:
# NOTE(review): `stops` is first assigned in a later cell of this notebook, so
# this display only works with leftover kernel state and fails on
# Restart & Run All — confirm whether the cell is still needed.
stops

In [54]:
# NOTE(review): `reset` is not defined in the visible cells; presumably it
# comes from one of the star imports (gps_utils / vision_utils) — confirm what
# state it resets before relying on this cell.
reset()

In [55]:
# NOTE(review): `back_up` is not defined in the visible cells — presumably it
# snapshots the current columns; confirm.
back_up()
theta = 0  # only referenced by the commented-out overrides below
# Majority-vote the per-image movement over 3 consecutive images, then flag
# the rows whose smoothed movement is "Inside".
both["movement"] = smooth(both["movement"], window_size=3)
both["inside"] = both["movement"].eq("Inside")
# both.loc[(both["inside"] == False) & (both["movement_prob"] > theta), 'stop'] = False
# both.loc[(both["inside"] == False) & (both["movement_prob"] > theta), 'stop_label'] = ""

## Remove short stops

In [56]:
# A new segment starts wherever the stop label or the stop flag differs from
# the previous row; the running count of those changes is the segment id.
label_changed = both['stop_label'].shift() != both['stop_label']
flag_changed = both['stop'].shift() != both['stop']
segment_id = (label_changed | flag_changed).cumsum()

# One row per segment, with its positions, images and dominant labels.
stops = (
    both.groupby(segment_id)
    .agg(
        inside=('inside', 'first'),
        lat=('latitude', 'mean'),
        lon=('longitude', 'mean'),
        all_lon=('longitude', list_all),
        all_lat=('latitude', list_all),
        images=('ImageID', list_all),
        stop=('stop', 'first'),
        stop_label2=('stop_label', most_common),
        movement=('movement', most_common),
        duration=('ImageID', image_time_duration),
    )
    .reset_index()
    # stops = stops.drop(columns=['stop_label'])
    .rename(columns={'stop_label2': 'stop_label'})
)

In [57]:
# Segments shorter than this are not treated as stops. The value is compared
# with whatever `image_time_duration` returns — presumably minutes; confirm.
MIN_STOP_DURATION = 3

# `.copy()` detaches the column subset from the previous frame so the
# assignment below cannot raise SettingWithCopyWarning.
stops = stops[['stop', 'movement'] + ['lat', 'lon', 'stop_label', 'all_lat', 'all_lon', 'images', 'duration']].copy()
stops.loc[stops['duration'] < MIN_STOP_DURATION, 'stop'] = False
bam.plot(stops, 'stop')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

## Adjusting boundaries

In [58]:
all_image_ids = list(both["ImageID"])
# Row position of each image id. `setdefault` keeps the FIRST position of a
# repeated id, which is what `list.index` returned; the dict turns every
# lookup from a linear scan into a constant-time access.
image_position = {}
for position, image_id in enumerate(all_image_ids):
    image_position.setdefault(image_id, position)

stop_values = [False] * len(both)
cluster_label_values = [""] * len(both)
boundaries = [None] * len(both)

for i, row in tqdm(stops.iterrows(), total=len(stops)):
    images = row["images"]
    is_stop = row["stop"]
    if is_stop:
        # Mark the first and last image of a stop. For a single-image stop
        # both marks hit the same row and "end" wins.
        boundaries[image_position[images[0]]] = "start"
        boundaries[image_position[images[-1]]] = "end"

    # Copy the segment's stop flag (and its label, for stops) to every image in it.
    for image in images:
        position = image_position[image]
        stop_values[position] = is_stop
        if is_stop:
            cluster_label_values[position] = row["stop_label"]

both = both.assign(stop=stop_values,
                   stop_label=cluster_label_values,
                   boundary=boundaries)

  0%|          | 0/553 [00:00<?, ?it/s]

In [59]:
# Adjust stop boundaries with the per-image movement label: a stop is extended
# over neighbouring unlabelled "Inside" images and trimmed where its own edge
# images are not "Inside".
#
# Fixes compared with the previous version of this cell:
#   * the backward trimming loop mixed negative (-j) and positive (j) indices,
#     so it tested one row and modified another; plain positive indices are
#     used throughout now;
#   * both trimming loops compared against cluster_label_values[i] after
#     blanking that very entry, so they stopped after one row (or ran on
#     through blank rows); the label is captured before any modification;
#   * row 0 was never visited by either pass.
# NOTE(review): a boundary row whose label is already "" still propagates ""
# as before — confirm whether such rows should be skipped.
stop_values = list(both["stop"])
cluster_label_values = list(both["stop_label"])
movements = list(both["movement"])
boundaries = list(both["boundary"])
n_rows = len(both)

# Forward: walk the "end" boundaries from the first row to the last.
for i in tqdm(range(n_rows)):
    if boundaries[i] != "end":
        continue
    label = cluster_label_values[i]

    # Checking if the cluster should end later (still inside)
    j = i + 1
    while j < n_rows and movements[j] == "Inside" and cluster_label_values[j] == "":
        stop_values[j] = True
        cluster_label_values[j] = label
        j += 1

    # Checking if the cluster should end earlier (not inside anymore)
    j = i
    while j >= 0 and movements[j] != "Inside" and cluster_label_values[j] == label:
        stop_values[j] = False
        cluster_label_values[j] = ""
        j -= 1

# Backward: walk the "start" boundaries from the last row to the first.
for i in tqdm(range(n_rows - 1, -1, -1)):
    if boundaries[i] != "start":
        continue
    label = cluster_label_values[i]

    # Checking if the cluster should start sooner (going inside already)
    j = i - 1
    while j >= 0 and movements[j] == "Inside" and cluster_label_values[j] == "":
        stop_values[j] = True
        cluster_label_values[j] = label
        j -= 1

    # Checking if the cluster should start later (not inside yet)
    j = i
    while j < n_rows and movements[j] != "Inside" and cluster_label_values[j] == label:
        stop_values[j] = False
        cluster_label_values[j] = ""
        j += 1

both = both.assign(stop=stop_values,
                   stop_label=cluster_label_values)

  0%|          | 0/42639 [00:00<?, ?it/s]

  0%|          | 0/42639 [00:00<?, ?it/s]

In [60]:
# # Smooth stop/move label
# both["stop"] = smooth(both["stop"])
# both["stop_label"] = smooth(both["stop_label"])
# Rows that are inside but belong to no labelled stop get the generic "INSIDE" label.
unlabelled_inside = both["stop_label"].eq("") & both["inside"].eq(True)
both.loc[unlabelled_inside, "stop_label"] = "INSIDE"

# Final stop/move

In [61]:
# A new segment starts wherever the stop label differs from the previous row.
segment_id = (both['stop_label'].shift() != both['stop_label']).cumsum()

# One row per segment. The group key is named 'stop_label', so reset_index()
# exposes it under that name; it is dropped before the aggregated label
# (stop_label2) takes over the column name.
stops = (
    both.groupby(segment_id)
    .agg(
        inside=('inside', 'first'),
        lat=('latitude', 'mean'),
        lon=('longitude', 'mean'),
        all_lon=('longitude', list_all),
        all_lat=('latitude', list_all),
        images=('ImageID', list_all),
        stop=('stop', 'first'),
        stop_label2=('stop_label', most_common),
        movement=('movement', most_common),
        duration=('ImageID', image_time_duration),
    )
    .reset_index()
    .drop(columns=['stop_label'])
    .rename(columns={'stop_label2': 'stop_label'})
)

In [64]:
def calculate_distance(all_lat, all_lon, lat, lon, default=50):
    """Largest distance between a reference point and a set of points.

    Args:
        all_lat: latitudes of the points.
        all_lon: longitudes of the points, paired with `all_lat` by position.
        lat: latitude of the reference point.
        lon: longitude of the reference point.
        default: value returned when no usable distance exists
            (previously hard-coded to 50; presumably metres — confirm
            against `distance`).

    Returns:
        The maximum usable distance, or `default` when there is none.
    """
    usable = []
    for point_lat, point_lon in zip(all_lat, all_lon):
        d = distance(point_lat, point_lon, lat, lon)
        # Unchanged behaviour: falsy results (None, 0) are ignored.
        if not d:
            continue
        # NaN is truthy and would make max() depend on element order; it is
        # the only value that differs from itself.
        # NOTE(review): assumes `distance` may yield NaN for missing
        # coordinates — confirm.
        if d != d:
            continue
        usable.append(d)
    if usable:
        return max(usable)
    return default
    
def _segment_max_radius(segment):
    """Largest distance between a segment's points and its mean position."""
    return calculate_distance(segment['all_lat'],
                              segment['all_lon'],
                              segment['lat'],
                              segment['lon'])


stops["max_radius"] = stops.progress_apply(_segment_max_radius, axis=1)

  0%|          | 0/627 [00:00<?, ?it/s]

In [65]:
# Keep the summary columns in a fixed order; reset_index() stores the previous
# row index in an 'index' column.
summary_columns = ['inside', 'stop', 'movement', 'stop_label',
                   'lat', 'lon', 'all_lat', 'all_lon', 'max_radius', 'images', 'duration']
stops = stops[summary_columns].reset_index()
bam.plot(stops, 'stop')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

# Verification

In [68]:
# Minimum movement probability for the movement label alone to decide stop/move.
theta = 0.9
def get_checkin(row, logging=False):
    """Re-evaluate the stop flag of one segment from its images.

    Reads segment `row` of the global `stops` frame, scores the segment's
    images against the movement prompts in `moves`, and writes the resulting
    stop flag back. For segments that end up as moves, the detected movement
    and its probability are stored as well.

    Args:
        row: index label of the segment in `stops`.
        logging: when True, print the detected movement and its probability.

    Returns:
        None. `stops` is modified in place; nothing changes when no image
        embeddings are available for the segment.
    """
    import ast

    stop = stops.loc[row, "stop"]
    max_radius = stops.loc[row, "max_radius"]
    images = stops.loc[row, "images"]
    if isinstance(images, str):
        # The list arrives as its Python repr when `stops` was reloaded from
        # CSV. literal_eval parses that repr directly, unlike the former
        # quote-swapping + json.loads, which also needed a `json` import that
        # this notebook never makes explicitly.
        images = ast.literal_eval(images)
    image_features = get_stop_embeddings(images)
    if image_features is None:
        return

    # Get transport move
    image_features = torch.tensor(image_features).cuda().float()
    movement, movement_prob = movement_mode(list(moves.keys()), image_features)
    movement = moves[movement]
    if logging:
        print("Movement:", movement, movement_prob)

    # NOTE(review): `moves` maps the airport prompt to "Inside", so "Airport"
    # never occurs with the current mapping — confirm whether it is still needed.
    indoors = movement in ["Inside", "Airport"]
    if movement_prob > theta:
        # Confident prediction: the movement label alone decides.
        stop = indoors
    elif max_radius < 100 and indoors:  # Low probability but small distance
        stop = True

    stops.loc[row, "stop"] = stop
    if not stop:
        stops.loc[row, "movement"] = movement
        stops.loc[row, "movement_prob"] = movement_prob

# Re-check every segment; lower `num` (e.g. to 20) for a quick trial run.
num = len(stops)
for row_number in tqdm(range(num)):
    get_checkin(row_number)

  0%|          | 0/627 [00:00<?, ?it/s]

In [69]:
# Final column order. Columns not listed here (such as the movement_prob
# column written by get_checkin) are dropped; reset_index() stores the
# previous row index in an 'index' column.
final_columns = ['inside', 'stop', 'movement', 'stop_label',
                 'lat', 'lon', 'max_radius', 'all_lat', 'all_lon', 'images', 'duration']
stops = stops[final_columns].reset_index()
bam.plot(stops, 'stop')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

# Save

In [70]:
# Persist the segment table and the per-image table.
for frame, target in ((stops, "files/test/stops.csv"),
                      (both, "files/test/both.csv")):
    frame.to_csv(target)

In [29]:
# Spot check: all images taken between 15:00 and 15:59 on 2019-01-01.
# NOTE(review): the ori_* columns are not created in any visible cell
# (presumably by back_up()), and this cell's execution count is out of
# sequence — confirm it still runs after Restart & Run All.
inspect_columns = ['ImageID', 'minute_id', 'movement', 'stop', 'stop_label',
                   'ori_movement', 'ori_stop', 'ori_stop_label', 'inside', 'boundary']
# `.copy()` so the relabelling below edits this excerpt only and does not
# raise SettingWithCopyWarning.
test = both.loc[both['minute_id'].str.startswith('20190101_15', na=False), inspect_columns].copy()
test.loc[(test["stop_label"] == "") & (test["inside"] == True), "stop_label"] = "INSIDE"
test

                     ImageID      minute_id movement   stop stop_label  \
445  20190101_150016_000.jpg  20190101_1500   Inside   True       01_5   
446  20190101_150048_000.jpg  20190101_1500   Inside   True       01_5   
447  20190101_150120_000.jpg  20190101_1501   Inside   True       01_5   
448  20190101_150152_000.jpg  20190101_1501   Inside   True       01_5   
449  20190101_150224_000.jpg  20190101_1502   Inside   True       01_5   
..                       ...            ...      ...    ...        ...   
551  20190101_155727_000.jpg  20190101_1557      Car  False              
552  20190101_155759_000.jpg  20190101_1557      Car  False              
553  20190101_155831_000.jpg  20190101_1558      Car  False              
554  20190101_155903_000.jpg  20190101_1559      Car  False              
555  20190101_155935_000.jpg  20190101_1559      Car  False              

    ori_movement  ori_stop ori_stop_label  inside boundary  
445       Inside      True           01_5    True 