In [1]:
from tqdm.notebook import tqdm
import bamboolib as bam
import pandas as pd
tqdm.pandas()
from gps_utils import *
from map_apis import *
from vision_utils import *




In [2]:
from datetime import datetime, timezone, timedelta
# Language libraries
from unidecode import unidecode
from googletrans import Translator, constants
from langdetect import detect
import re

# Read Files

In [3]:
# Input locations, relative to the notebook's working directory.
# checkin_file is parsed with json.load; stop_file is read with pd.read_csv.
checkin_file = '../../original_data/checkins.json'
stop_file = 'files/test/stops.csv'

In [35]:
# Load the stop table and add the output columns that get_checkin fills in.
stops = pd.read_csv(stop_file, sep=',', decimal='.')
# Optional filters (disabled): leave out stops without images / coordinates
# stops['images'] = stops['images'].astype('string')
# stops = stops.loc[~(stops['images'].isin(['[]']))]
# stops = stops.loc[~(stops['lat'].isna())]
# reset_index() keeps the previous index as an 'index' column.
stops = stops.reset_index()

# Per-row default values; kept as module-level names for other cells.
EMPTY_STRINGS = ["" for i in range(len(stops))]
ZEROS = [0 for i in range(len(stops))]
FALSES = [False for i in range(len(stops))]

stops = stops.assign(checkin=EMPTY_STRINGS,
                     original_name=EMPTY_STRINGS,
                     in_checkin=FALSES,
                     place_id=EMPTY_STRINGS,
                     categories=EMPTY_STRINGS,
                     # Float default: get_checkin later stores float scores here, and
                     # writing floats into an int64 column is a deprecated upcast.
                     prob=0.0,
                     parent=EMPTY_STRINGS,
                     parent_id=EMPTY_STRINGS)
                     # movement=EMPTY_STRINGS,
                     # movement_prob=ZEROS)

In [5]:
# Read the raw check-in export; the context manager closes the file handle.
with open(checkin_file, encoding="utf-8") as checkin_fp:
    checkins = json.load(checkin_fp)

# Flatten each raw check-in to the venue fields used downstream.
# The list is reversed afterwards; get_nearby_checkins bisects on the
# resulting times, so it presumably ends up in ascending time order
# (i.e. the export is newest-first) — TODO confirm.
named_checkins = [{
                   "name": checkin["venue"]["name"],
                   "place_id": checkin["venue"]["id"],
                   "latitude": checkin["venue"]["latitude"],
                   "longitude": checkin["venue"]["longitude"],
                   "regions": [checkin["venue"][key] for key in ["city", "state", "country"] if key in checkin["venue"]],
                   # fromtimestamp() without tz yields a naive datetime in the local
                   # timezone of the machine; "timeZoneOffset" is stored but not applied.
                   "time": datetime.fromtimestamp(checkin["createdAt"]), #not sure
                   "timeZoneOffset": checkin["timeZoneOffset"]} for checkin in checkins][::-1]
# Filter time
named_checkins = [checkin for checkin in named_checkins if checkin["time"].year in [2019, 2020]]
# Placeholder returned when no place can be assigned to a stop.
NULL_CHECKIN = {"name": "Unknown Place", "place_id": "No Places Found", "categories": [], "parent": "", "parent_id": ""}

## Enrich checkins

In [6]:
# Enrich every check-in in place with categories, description, parent,
# children (first 20 only) and classified photo URLs from the places API.
for checkin in tqdm(named_checkins):
    categories_res = get_categories(checkin['place_id'])
    checkin["categories"] = [cat['name'] for cat in categories_res["categories"]]
    if "description" in categories_res:
        checkin["description"] = categories_res["description"]
    else:
        checkin["description"] = ""

    has_related = "related_places" in categories_res
    if has_related and "parent" in categories_res["related_places"]:
        checkin["parent"] = categories_res["related_places"]["parent"]["name"]
        checkin["parent_id"] = categories_res["related_places"]["parent"]["fsq_id"]
    else:
        checkin["parent"] = ""
        checkin["parent_id"] = ""

    checkin["children"] = []
    if has_related and "children" in categories_res["related_places"]:
        for child in categories_res["related_places"]["children"][:20]:
            # Children without an id cannot be looked up and are dropped.
            if "fsq_id" in child:
                child_res = get_categories(child["fsq_id"])
                if "description" in child_res:
                    child["description"] = child_res["description"]
                child["categories"] = [cat['name'] for cat in child_res["categories"]]
                checkin["children"].append(child)

    photos_res = get_photos(checkin["place_id"])
    # Guard the optional "classifications" key, matching find_named_checkins_nearby;
    # an unclassified photo would otherwise raise KeyError and abort the whole loop.
    checkin["indoor_photos"] = [photo["prefix"] + "original" + photo["suffix"] for photo in photos_res
                                if 'classifications' in photo and 'indoor' in photo["classifications"]]
    checkin["outdoor_photos"] = [photo["prefix"] + "original" + photo["suffix"] for photo in photos_res
                                 if 'classifications' in photo and 'outdoor' in photo["classifications"]]

  0%|          | 0/2256 [00:00<?, ?it/s]

## Various checkin functions

In [7]:
# Translating
# Shared translator instance and names that must never be translated.
translator = Translator()
text_exceptions = {
    "Adapt Centre @ Dcu": "Adapt Centre @ Dcu",
    "777": "777",
    "Nip@tuck": "Nip@tuck",
}


def to_english(text, debug=False):
    """Return an ASCII rendering of `text`, translating it when it is not English.

    Falsy input is returned unchanged, and entries of `text_exceptions` are
    returned as mapped. Otherwise the language is detected, non-"en" text is
    passed through `translator.translate` (presumably targeting English by
    default — confirm against googletrans), and the result is transliterated
    with `unidecode`.

    Any exception is swallowed and the current text returned as-is, unless
    `debug` is true, in which case the failure is printed and re-raised.
    """
    if not text:
        return text
    if text in text_exceptions:
        return text_exceptions[text]
    try:
        if detect(text) != "en":
            text = translator.translate(text).text
        return unidecode(text)
    except Exception as e:
        if debug:
            print(f"Error translating \"{text}\"")
            raise e
    # Best effort: fall back to whatever text we have at this point.
    return text

## Assign checkins

In [8]:
def change_airport_parents(possible_checkins, weights, existed_place_ids):
    """Re-parent airport-related candidates to their top-most ancestor.

    For every candidate that `detect_airport` flags and that has a parent, the
    parent chain is followed through `get_categories` until a place without a
    further parent is reached; the candidate's "parent"/"parent_id" are then
    overwritten in place. If the resulting parent id is itself a candidate,
    that candidate's weight is incremented.

    Args:
        possible_checkins: candidate dicts (mutated in place).
        weights: list of weights parallel to `existed_place_ids` (mutated in place).
        existed_place_ids: place ids, positionally aligned with `weights`.

    Returns:
        The same `possible_checkins` and `weights` objects.
    """
    for checkin in possible_checkins:
        if not detect_airport(checkin):
            continue

        parent = checkin["parent"]
        parent_id = checkin["parent_id"]
        if parent:
            # Only query the API once we know there is a parent to look up.
            visited = {parent_id}
            categories_res = get_categories(parent_id)
            while "related_places" in categories_res and "parent" in categories_res["related_places"]:
                ancestor = categories_res["related_places"]["parent"]
                if ancestor["fsq_id"] in visited:
                    # Cyclic parent links would otherwise loop forever.
                    break
                parent = ancestor["name"]
                parent_id = ancestor["fsq_id"]
                visited.add(parent_id)
                categories_res = get_categories(parent_id)
            checkin["parent"] = parent
            checkin["parent_id"] = parent_id

        if parent_id in existed_place_ids:
            weights[existed_place_ids.index(parent_id)] += 1
    return possible_checkins, weights

In [9]:
def get_nearby_parents(nearbys, possible_checkins, weights, existed_place_ids):
    """Merge nearby places (and the parents of all candidates) into the candidate list.

    Each raw nearby result is normalised with `parse_checkin`. Airport-related
    places are re-parented to their top-most ancestor. Places whose id is not
    yet known are appended with weight 1. Afterwards every candidate's parent
    that is not yet a candidate is fetched and appended with weight 1.

    Args:
        nearbys: raw nearby-place results.
        possible_checkins: candidate dicts (extended in place).
        weights: weights parallel to `possible_checkins` (extended in place).
        existed_place_ids: ids parallel to `possible_checkins` (extended in place).

    Returns:
        `(possible_checkins, weights, existed_place_ids)`, the same objects.
    """
    for checkin in nearbys:
        checkin = parse_checkin(checkin)
        if detect_airport(checkin):
            parent = checkin["parent"]
            parent_id = checkin["parent_id"]
            if parent:
                # Only query the API once we know there is a parent to look up.
                visited = {parent_id}
                categories_res = get_categories(parent_id)
                while "related_places" in categories_res and "parent" in categories_res["related_places"]:
                    ancestor = categories_res["related_places"]["parent"]
                    if ancestor["fsq_id"] in visited:
                        # Cyclic parent links would otherwise loop forever.
                        break
                    parent = ancestor["name"]
                    parent_id = ancestor["fsq_id"]
                    visited.add(parent_id)
                    categories_res = get_categories(parent_id)
                checkin["parent"] = parent
                checkin["parent_id"] = parent_id

        if checkin["place_id"] not in existed_place_ids:
            existed_place_ids.append(checkin["place_id"])
            possible_checkins.append(checkin)
            weights.append(1) #TODO!
        # else:
            # weights[existed_place_ids.index(checkin["place_id"])] += 1

    # Add parent to the list.
    # The list grows while it is being iterated, so parents appended here are
    # visited as well and their own parents get added in turn.
    for checkin in possible_checkins:
        if checkin["parent_id"] and checkin["parent_id"] not in existed_place_ids:
            if checkin["parent"] != checkin["name"]:
                categories_res = get_categories(checkin["parent_id"])
                new_checkin = {"name": checkin["parent"],
                               "place_id": checkin["parent_id"],
                               "categories": [cat['name'] for cat in categories_res["categories"]],
                               "parent": "",
                               "parent_id": ""}
                if "description" in categories_res:
                    new_checkin["description"] = categories_res["description"]
                if "related_places" in categories_res and "parent" in categories_res["related_places"]:
                    new_checkin["parent"] = categories_res["related_places"]["parent"]["name"]
                    new_checkin["parent_id"] = categories_res["related_places"]["parent"]["fsq_id"]
                existed_place_ids.append(checkin["parent_id"])
                possible_checkins.append(new_checkin)
                weights.append(1)
    return possible_checkins, weights, existed_place_ids

## Fill in rows

In [10]:
# Maps first-person prompt sentences to short movement labels.
# In the visible cells this is only referenced by the commented-out
# movement-mode code inside get_checkin.
moves = {"I am sitting on an airplane": "Airplane",
         "I am in a car": "Car",
         "I am in an airport": "Inside",
         "I am walking outside or on the street": "Walking Outside",
         "I am on public transport": "Public Transport",
         "I am inside a building or a house": "Inside"}

In [11]:
def detect_airport(checkin):
    """Return True when a place looks airport-related.

    The word "airport" is searched case-insensitively in the joined category
    names, then in the place name, and finally in the parent name. The parent
    is only consulted when neither of the first two matches.
    """
    keyword = "airport"
    joined_categories = ", ".join(checkin["categories"]).lower()
    if keyword in joined_categories:
        return True
    if keyword in checkin["name"].lower():
        return True
    return keyword in checkin["parent"].lower()

In [12]:
def find_closest_index(a, x): # to the right only
    """Index of the first element of sorted list `a` that is >= `x`.

    The result is clamped to the last valid index, so a value beyond the end
    maps to `len(a) - 1` (which is -1 for an empty list).
    """
    last_index = len(a) - 1
    return min(bisect.bisect_left(a, x), last_index)

In [13]:
# Check-in times, parallel to named_checkins; bisected by get_nearby_checkins.
checkin_times = [checkin["time"] for checkin in named_checkins]

def get_nearby_checkins(start, end, lat=None, lon=None, max_radius=500, time_limit=timedelta(hours=1)):
    """Collect the user's own check-ins around a time window, plus their children.

    Check-ins are taken from the index of the first one at/after `start` up to
    and including the index of the first one at/after `end + time_limit`
    (both clamped to the last check-in). When a position is given, check-ins
    farther than `max(500, max_radius)` from it are dropped. For every
    remaining non-airport check-in its enriched children are added as extra
    candidates, linked back to that check-in as parent.

    NOTE(review): because indices are clamped, a window that lies entirely
    after the last check-in still selects that last check-in; only the
    distance filter limits it. Confirm this is intended.

    Args:
        start, end: datetimes bounding the stop.
        lat, lon: optional position used for the distance filter.
        max_radius: radius for the distance filter (never below 500).
        time_limit: slack added after `end`.

    Returns:
        `(candidates, weights)` where every weight is 1.
    """
    if not checkin_times:
        # Nothing to search; avoids indexing with the clamped index -1.
        return [], []

    start_ind = find_closest_index(checkin_times, start)
    end_ind = find_closest_index(checkin_times, end + time_limit)
    # Explicit None test: a latitude of 0.0 is a valid coordinate.
    has_position = lat is not None and lon is not None
    radius = max(500, max_radius)

    base_checkins = []
    for i in range(start_ind, end_ind + 1):
        checkin = named_checkins[i]
        if has_position and distance(lat, lon, checkin["latitude"], checkin["longitude"]) > radius:
            continue
        base_checkins.append(checkin)

    children_checkins = []
    for checkin in base_checkins:
        if detect_airport(checkin):
            continue
        for child in checkin["children"]:
            # The child dicts are shared with named_checkins and updated in place.
            child["place_id"] = child["fsq_id"]
            child["parent"] = checkin["name"]
            child["parent_id"] = checkin["place_id"]
            children_checkins.append(child)

    candidates = base_checkins + children_checkins
    return candidates, [1] * len(candidates)

In [14]:
def image_to_date(image_id):
    """Parse the capture time encoded in an image file name.

    The name must match `YYYYmmdd_HHMMSS_000.jpg` exactly; anything else
    raises ValueError from `datetime.strptime`.
    """
    filename_format = "%Y%m%d_%H%M%S_000.jpg"
    return datetime.strptime(image_id, filename_format)

In [15]:
def find_named_checkins_nearby(images, image_features, stop, lat, lon, max_radius, logging=False):
    """Choose the most plausible named place for a stop.

    Candidates come from the user's own check-ins around the stop's time
    window and from the nearby-places API (plus parents). Each candidate is
    described by a text label that is encoded with the global CLIP `model`
    and compared against `image_features`; when "IMG" is in the global
    `MODES`, candidate photo embeddings are averaged into the score as well.
    When "REL" is in `MODES` and the best softmax score is not above 0.75,
    scores are summed over each place and the candidates that name it as
    parent, and the best of those sums wins.

    Args:
        images: image file names of the stop; first/last give the time window.
        image_features: tensor of image embeddings for the stop (assumed to be
            row-per-image and normalised — TODO confirm).
        stop: falsy values short-circuit to the null result.
        lat, lon: stop position.
        max_radius: search radius; nearby places use `max(400, max_radius)`.
        logging: print intermediate candidates and scores.

    Returns:
        `(checkin, score, in_checkin)`: the chosen candidate dict, its score
        divided by its weight, and whether the winner's index falls among the
        candidates that came from the user's own check-ins. Returns
        `(NULL_CHECKIN, 0, False)` when `stop` is falsy or no nearby place is
        within range.
    """
    start = image_to_date(images[0])
    end = image_to_date(images[-1])
    if not stop:
        return NULL_CHECKIN, 0, False

    # Use checkins
    possible_checkins, weights = get_nearby_checkins(start, end, lat, lon, max_radius)
    existed_place_ids = [checkin["place_id"] for checkin in possible_checkins]
    num_nearby_checkins = len(possible_checkins)
    possible_checkins, weights = change_airport_parents(possible_checkins, weights, existed_place_ids)

    # Fetch once and reuse for logging instead of calling the API twice.
    all_nearbys = get_nearby_places(round(lat, 3), round(lon, 3))["results"]
    if logging:
        print(images)
        print([(checkin["name"], checkin["distance"]) for checkin in all_nearbys])
        print("Max radius:", max_radius)
    # Filter nearbys by distance
    nearbys = [checkin for checkin in all_nearbys if checkin["distance"] < max(400, max_radius)]

    if not nearbys:
        # NOTE(review): candidates gathered from the user's own check-ins are
        # discarded here as well — confirm that this is intended.
        if logging:
            print(images)
            print([checkin["name"] for checkin in all_nearbys])
            print("Max radius:", max_radius)
        return NULL_CHECKIN, 0, False

    # Get parents for nearbys
    if logging:
        print("Nearbys:")
        print([checkin["name"] for checkin in nearbys])

    possible_checkins, weights, existed_place_ids = get_nearby_parents(nearbys, possible_checkins, weights, existed_place_ids)
    if logging:
        print("Expanded nearbys:")
        print([(checkin["name"], checkin["parent"], weight) for checkin, weight in zip(possible_checkins, weights)])

    # parent_weights[k] lists candidate k itself plus every candidate whose parent is k.
    parent_weights = [[i] for i in range(len(weights))]
    for i, checkin in enumerate(possible_checkins):
        if checkin["parent_id"] in existed_place_ids:
            parent_weights[existed_place_ids.index(checkin["parent_id"])].append(i)

    checkin_labels = []
    all_embeddings = []
    embedding_index = []

    for i, checkin in enumerate(possible_checkins):
        # Strip bracketed/parenthesised suffixes from the name before translating.
        name = to_english(re.sub(r"[\(\[].*?[\)\]]", "", checkin["name"]))
        all_label = "I am in a " + ", ".join(checkin["categories"]) + " called " + name
        checkin_labels.append(all_label)
        if "IMG" in MODES:
            photos_res = get_photos(checkin["place_id"])
            checkin["indoor_photos"] = [photo["prefix"] + "original" + photo["suffix"] for photo in photos_res if 'classifications' in photo and 'indoor' in photo["classifications"]]
            checkin["outdoor_photos"] = [photo["prefix"] + "original" + photo["suffix"] for photo in photos_res if 'classifications' in photo and 'outdoor' in photo["classifications"]]

            checkin_embeddings, _, _ = get_embeddings(checkin)
            if checkin_embeddings is not None:
                embedding_index.append(i)
                all_embeddings.append(np.mean(checkin_embeddings, axis=0))

    # Text similarity
    text_tokens = clip.tokenize(checkin_labels, truncate=True).cuda()
    with torch.no_grad():
        text_features = model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)
    mean_similarities = (image_features @ text_features.T).mean(dim=0)

    # Image similarity: averaged with the text score where photos exist.
    full_image_similarities = torch.clone(mean_similarities)
    if "IMG" in MODES:
        if all_embeddings:
            checkin_embeddings = torch.tensor(np.stack(all_embeddings)).cuda().float()
            image_similarities = (image_features @ checkin_embeddings.T).mean(dim=0)
            for i, similarity in zip(embedding_index, image_similarities):
                full_image_similarities[i] = (similarity + full_image_similarities[i]) / 2

    weighted_similarities = torch.tensor(weights) * full_image_similarities.cpu()
    mean_probs, mean_labels = (100 * weighted_similarities).softmax(dim=-1).topk(min(len(checkin_labels), 5))

    if logging:
        print("All checkins:")
        print([(checkin["name"], checkin["parent"]) for checkin in possible_checkins])
        print("Photos available:")
        print([possible_checkins[i]["name"] for i in embedding_index])
        print("Weighted similarities")
        print([(weights[label], checkin_labels[label], prob.numpy()) for label, prob in zip(mean_labels, mean_probs)])

    if "REL" in MODES and not mean_probs[0].numpy() > 0.75:
        # Considering parents: sum the scores of each place and its children.
        parent_similarities = []
        for ids in parent_weights:
            sim = 0
            for i in ids:
                sim += weighted_similarities[i]
            parent_similarities.append(sim)
        parent_similarities = torch.tensor(parent_similarities)
        mean_probs, mean_labels = parent_similarities.topk(min(len(checkin_labels), 5))

        if logging:
            print("Considering parents weights")
            print([(weights[label], checkin_labels[label], prob.numpy()) for label, prob in zip(mean_labels, mean_probs)])

    best = mean_labels[0]
    return possible_checkins[best], (mean_probs[0]/weights[best]).numpy(), best.numpy() < num_nearby_checkins

In [16]:
def get_checkin(row, logging=False):
    """Resolve the place for one row of the global `stops` frame, in place.

    Nothing is written when the stop has no image embeddings or is not
    flagged as a stop. Otherwise the "checkin" column is set to:
    "Unknown Place" when the latitude is NaN; one of two hard-coded personal
    locations when the position is within 100 (units of `distance`) of them;
    or the result of `find_named_checkins_nearby`, in which case the name,
    parent, categories, score and in_checkin columns are filled as well.

    NOTE(review): the two known-place coordinates are hard-coded personal
    locations; consider moving them to a configuration cell.
    The movement-mode classification that used to adjust `stop` is currently
    disabled (see `moves`).

    Args:
        row: index label of the row in `stops`.
        logging: forwarded to `find_named_checkins_nearby`.

    Returns:
        None.
    """
    import ast  # local import: only needed to parse the stringified image list

    stop = stops.loc[row, "stop"]
    lat = stops.loc[row, "lat"]
    lon = stops.loc[row, "lon"]
    max_radius = stops.loc[row, "max_radius"]
    images = stops.loc[row, "images"]
    if isinstance(images, str):
        # The CSV stores the Python repr of a list; literal_eval is its exact
        # inverse and, unlike quote-swapping into JSON, survives apostrophes.
        images = ast.literal_eval(images)
    image_features = get_stop_embeddings(images)

    if image_features is None:
        return
    image_features = torch.tensor(image_features).cuda().float()
    if not stop:
        return

    if np.isnan(lat):
        stops.loc[row, "checkin"] = "Unknown Place"
        return

    if distance(lat, lon, 53.38998, -6.1457602) < 100:
        stops.loc[row, "checkin"] = "HOME"
    elif distance(lat, lon, 53.386859863999995, -6.147444621999999) < 100:
        stops.loc[row, "checkin"] = "Charm Hand & Foot Spa"
    else:
        checkin, prob, in_checkin = find_named_checkins_nearby(images, image_features, stop, lat, lon, max_radius, logging=logging)
        stops.loc[row, "checkin"] = checkin["name"]
        stops.loc[row, "original_name"] = checkin["name"]
        stops.loc[row, "parent"] = checkin["parent"]
        stops.loc[row, "parent_id"] = checkin["parent_id"]
        stops.loc[row, "categories"] = ", ".join(checkin["categories"])
        stops.loc[row, "prob"] = prob
        stops.loc[row, "in_checkin"] = in_checkin

## Start

In [36]:
# Scoring modes read by find_named_checkins_nearby: "IMG" adds photo-embedding
# similarity, "REL" enables the parent-aggregation fallback.
MODES = ["IMG", "REL"]
# stops = stops.loc[stops['first'].str.startswith('20190126', na=False)]
# stops = stops.reset_index()
num = len(stops)
# num = 20
# Resolve every row; get_checkin writes its result back into `stops`.
for i in tqdm(range(num)):
    get_checkin(i)

  0%|          | 0/627 [00:00<?, ?it/s]

In [37]:
# Cross-tabulate ground-truth vs. predicted stop flags.
# NOTE(review): `metadata` is not defined in any cell visible in this notebook —
# confirm where it is loaded before relying on Restart & Run All.
confusion_matrix = pd.crosstab(metadata['stop_GT'], metadata['stop'], rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)

      stop                checkin categories movement parent  in_checkin  \
0     True                   HOME              Inside              False   
1    False                                        Car              False   
2     True  Charm Hand & Foot Spa              Inside              False   
3    False                                        Car              False   
4     True          Unknown Place              Inside              False   
..     ...                    ...        ...      ...    ...         ...   
622   True  Charm Hand & Foot Spa              Inside              False   
623  False                                        Car              False   
624   True          Unknown Place              Inside              False   
625  False                                        Car              False   
626   True                   HOME              Inside              False   

    original_name        lat       lon stop_label   max_radius  \
0                  53

In [38]:
# NOTE(review): import sits mid-notebook; cohen_kappa_score is imported but not used in this cell.
from sklearn.metrics import cohen_kappa_score, accuracy_score
# Fraction of rows whose predicted stop flag equals the ground truth.
# NOTE(review): `metadata` is not defined in the visible cells.
accuracy_score(metadata['stop_GT'], metadata['stop'])

     stop                       checkin                   categories movement  \
451  True                          HOME                                Inside   
453  True         Charm Hand & Foot Spa                                Inside   
457  True         Charm Hand & Foot Spa                   Nail Salon   Inside   
459  True                          HOME                                Inside   
461  True         Charm Hand & Foot Spa                   Nail Salon   Inside   
462  True         Charm Hand & Foot Spa                                Inside   
464  True       Cois Bá Family Practice              Doctor's Office   Inside   
470  True          Omni Shopping Centre                Shopping Mall   Inside   
472  True         Charm Hand & Foot Spa                                Inside   
474  True                          HOME                                Inside   
476  True         Charm Hand & Foot Spa                                Inside   
477  True         Charm Hand

In [20]:
# stops.to_csv("files/semantic_stops.csv")

In [39]:
import importlib
import agg_stops
# Re-import the module so that the current version of agg_stops is used.
importlib.reload(agg_stops)

# Replaces `stops` with the aggregated frame returned by agg_stop
# (defined outside this notebook).
stops = agg_stops.agg_stop(stops)

  0%|          | 0/627 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

# Reclassify Unknown Place

In [40]:
# stops = pd.read_csv('files/agg_stops.csv', sep=',', decimal='.')
def calculate_distance(checkin, all_lat, all_lon, lat, lon):
    """Search radius to use when re-classifying a stop.

    For rows labelled "Unknown Place" this is the largest distance between the
    stop's centre (`lat`, `lon`) and any of its recorded points
    (`all_lat`, `all_lon`). Zero, missing and NaN distances are ignored. Every
    other row, and unknown rows without a usable distance, get the default 50.
    """
    import math  # local import: only needed for the NaN filter

    if checkin == "Unknown Place":
        dists = [distance(lt, ln, lat, lon) for (lt, ln) in zip(all_lat, all_lon)]
        # NaN is truthy and makes max() depend on element order, so drop it explicitly.
        dists = [d for d in dists if d and not math.isnan(d)]
        if dists:
            return max(dists)
    return 50
    
# Recompute the search radius row by row (with a progress bar).
stops["max_radius"] = stops.progress_apply(
    lambda stop_row: calculate_distance(
        stop_row['checkin'],
        stop_row['all_lat'],
        stop_row['all_lon'],
        stop_row['lat'],
        stop_row['lon'],
    ),
    axis=1,
)

  0%|          | 0/487 [00:00<?, ?it/s]

In [41]:
# Number of rows still labelled "Unknown Place".
sum(stops['checkin'] == 'Unknown Place')

64

In [42]:
# Second pass: re-run the lookup for rows still labelled "Unknown Place".
# NOTE(review): `i` is a position from enumerate, while get_checkin indexes
# `stops` with .loc (labels) — this assumes the index is 0..n-1; confirm.
for i, checkin in tqdm(enumerate(stops['checkin']), total=len(stops)):
    if checkin == "Unknown Place":
        get_checkin(i)

  0%|          | 0/487 [00:00<?, ?it/s]

In [25]:
# stops.to_csv("files/final_stops.csv")

In [43]:
# Aggregate again now that some unknown rows may have been renamed.
stops = agg_stops.agg_stop(stops)
# stops['checkin'] = stops['checkin'].progress_apply(to_english)

  0%|          | 0/487 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [44]:
# Interactive bamboolib plot of the 'stop' column.
bam.plot(stops, 'stop')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

In [45]:
# Persist the current stop table (the index is written as the first column).
stops.to_csv("files/test/final_com.csv")

In [29]:
# Keep rows whose stop flag is not False, then inspect one day (2019-01-26).
stops_only = stops.loc[stops['stop'].ne(False)]
test = stops_only.loc[stops_only['first'].str.startswith('20190126', na=False)]
test

     new_name  stop                       checkin  \
318       319  True         Charm Hand & Foot Spa   
320       321  True         Charm Hand & Foot Spa   
322       323  True                          HOME   
324       325  True         Charm Hand & Foot Spa   
326       327  True       Cois Bá Family Practice   
328       329  True          Omni Shopping Centre   
330       331  True         Charm Hand & Foot Spa   
332       333  True                          HOME   
334       335  True         Charm Hand & Foot Spa   
336       337  True         Gullivers Retail Park   
338       339  True          Musgrave Marketplace   
340       341  True  Dublin City University (DCU)   
341       342  True         Charm Hand & Foot Spa   
343       344  True                      Kopitiam   
345       346  True    Clare Hall Shopping Centre   
347       348  True                          HOME   

                      categories parent        lat       lon  \
318                               

# Merge places

In [None]:
# Stops with coordinates, excluding rows labelled as transport.
df = stops.loc[stops['stop'] == True].dropna(subset=['lat', 'lon'])
df = df.loc[~df['checkin'].isin(['Car', 'Airplane'])]
df

In [None]:
# Inspect rows whose 'first' value starts with 20190101.
test = stops.loc[stops['first'].str.startswith('20190101', na=False)]
test

In [None]:
# df = pd.read_csv('temp/stops_updated.tsv', sep='\t')
# df = pd.concat([df, df_new])


# df = df[['best_name_google', 'found', 'checkin', 'cluster_label'] + ['first', 'last'] + ['lat', 'lon', 'stop', 'best_label_google', 'best_prob_google', 'best_place_id_google', 'occurances']]
# df = df.loc[~df['first'].isin(['nan'])]
# NOTE(review): 'occurances' and 'cluster_label' are not created in the visible
# cells; presumably they come from agg_stops — confirm.
df = df.sort_values(by=['first'], ascending=[True])
# This cast is overwritten by the group count on the next line.
df['occurances'] = df['occurances'].astype('Int64')
# Number of rows sharing each cluster label.
df['occurances'] = df.groupby(['cluster_label'])['cluster_label'].transform('count')
# Drop rows whose first/last image is the literal string 'nan' or missing.
df = df.loc[~(df['first'].isin(['nan']))]
df = df.loc[~(df['last'].isin(['nan']))]
df = df.dropna(subset=['lat', 'first', 'last'])
df = df.sort_values(by=['first'], ascending=[True])
# Drop the old 'index' column; reset_index() then adds a fresh one.
df = df.drop(columns=['index'])
df = df.reset_index()
df


In [None]:
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
gps = pd.read_csv(r'/home/tlduyen/LSC22/process/VAISL/files/cleaned_gps.csv', sep=',', decimal='.')
# Split images into different rows:
gps["ImageID"]=gps["ImageID"].str.split(",")
gps = gps.explode("ImageID").reset_index()
# Strip list brackets, quotes and whitespace left over from the stringified list.
gps["ImageID"] = gps["ImageID"].str.replace(r'(\[|\]|\'|\s)', '', regex=True)
# Rows without an image id are dropped.
gps.loc[gps['ImageID'] == "", 'ImageID'] = np.nan
gps = gps.loc[gps['ImageID'].notna()]
gps

In [None]:
# Rows where the assigned check-in equals the 'best_name' column.
# NOTE(review): 'best_name' is not created in the visible cells — confirm its source.
corrects = stops.loc[stops["best_name"] == stops["checkin"]]
corrects

In [None]:
def first_last_to_list(first, last):
    """Number of images from `first` to `last`, inclusive.

    Both arguments are image references that may carry list punctuation; they
    are stripped, reduced to the part before the first dot, given a ".jpg"
    suffix and looked up in the global `image_id_to_index`.
    """
    def position(image_ref, strip_chars):
        return image_id_to_index[image_ref.strip(strip_chars).split('.')[0] + ".jpg"]

    start = position(first, '[], ')
    end = position(last, '[] ')
    return end - start + 1

def average(first, last, default, col):
    """Mean of column `col` of the global `both` over the images first..last.

    NaN entries are replaced by `default` before averaging. The image
    references are resolved to positions through `image_id_to_index`.
    """
    def position(image_ref, strip_chars):
        return image_id_to_index[image_ref.strip(strip_chars).split('.')[0] + ".jpg"]

    start = position(first, '[], ')
    end = position(last, '[] ')
    values = []
    for value in both.iloc[start:end+1][col]:
        values.append(default if np.isnan(value) else value)
    return sum(values) / len(values)

    
# Image count per stop, then coordinates re-averaged over the stop's image range
# (the current lat/lon serve as fallback for NaN samples).
df["len"] = df.progress_apply(lambda x: first_last_to_list(x["first"], x["last"]), axis=1)
df["lat"] = df.progress_apply(lambda x: average(x["first"], x["last"], x["lat"], "latitude"), axis=1)
df["lon"] = df.progress_apply(lambda x: average(x["first"], x["last"], x["lon"], "longitude"), axis=1)

df = df.sort_values(by=['first'], ascending=[True])

In [None]:
# Gap (in images) between each stop and the next one; the last row keeps -1.
df["gap"] = [-1 for i in range(len(df))]

def next_gap(i):
    """Store the image count between row i's last and row i+1's first image in df['gap']."""
    # Single positional assignment on the frame itself; the chained form
    # df['gap'].iloc[i] = ... may write to a temporary copy.
    df.iloc[i, df.columns.get_loc('gap')] = first_last_to_list(df.iloc[i]["last"], df.iloc[i + 1]["first"])

for i in tqdm(range(len(df) - 1)):
    next_gap(i)

In [None]:
# Largest gaps first.
df = df.sort_values(by=['gap'], ascending=[False])
# df = df[['lat', 'lon', 'best_name_google', 'found', 'checkin', 'len'] + ['gap'] + ['cluster_label', 'first', 'last', 'stop', 'best_label_google', 'best_prob_google', 'best_place_id_google', 'occurances', 'core', 'name_occurances']]
df

In [None]:
# Re-cluster stops by position and give every noise point its own label.
re_merge = True
if re_merge:
    # Haversine works on radians; eps is 0.01 km (10 meters) over the Earth radius in km.
    stop_clustering = DBSCAN(eps=0.01/6371, min_samples=2, algorithm='ball_tree', metric='haversine') #0.01 = 10 meters
    stop_clustering.fit(np.radians(df[['lat', 'lon']]))
    df['cluster_label'] = stop_clustering.labels_
    core_positions = set(stop_clustering.core_sample_indices_)
    df['core'] = [position in core_positions for position in range(len(df))]

    # Noise points (-1) receive fresh labels starting after the largest cluster id.
    count = max(df['cluster_label']) + 1
    print("Max:", count)
    new_labels = []
    for label in df['cluster_label']:
        if label != -1:
            new_labels.append(str(label))
            continue
        new_labels.append(str(count))
        count += 1
    df["cluster_label"] = new_labels
    df['occurances'] = df.groupby(['cluster_label'])['cluster_label'].transform('count')

In [None]:
# One row per (cluster, check-in name): mean position and the cluster's size,
# ordered by size (descending) and then by cluster label.
merged_stops = (
    df.groupby(['cluster_label', 'checkin'])
    .agg(lat=('lat', 'mean'),
         lon=('lon', 'mean'),
         occurances=('occurances', 'first'))
    .reset_index()
    .sort_values(by=['occurances', 'cluster_label'], ascending=[False, True])
    .reset_index(drop=True)
)
merged_stops

In [None]:
# Persist the per-stop table and the merged per-place table.
df.to_csv('files/semantic_stops.csv')
merged_stops.to_csv('files/merged_stops.csv')

In [None]:
# FOR MYSCEAL
# Export (ASCII name, (lat, lon)) pairs for the map UI, as JSON and as a JS module.
map_visualisation = []

for _, row in merged_stops.iterrows():
    map_visualisation.append((unidecode(row["checkin"]), (row["lat"], row["lon"])))

# Context managers make sure both files are flushed and closed.
with open("../files/backend/map_visualisation.json", 'w') as f:
    json.dump(map_visualisation, f)
# NOTE(review): absolute, user-specific output path; consider making it configurable.
with open("/home/tlduyen/LSC2020/LSC2020/ui/src/commonplace.js", 'w') as f:
    f.write("var commonPlace=" + json.dumps(map_visualisation) + ";\n\nexport default commonPlace;")