In [3]:
import bamboolib as bam
from tqdm.notebook import tqdm
from collections import defaultdict
import pandas as pd
import multiprocess as mp

import numpy as np
tqdm.pandas()

# Read files

In [2]:
# Load the cleaned GPS table (',' separator and '.' decimal are the pandas defaults).
gps = pd.read_csv(r'files/cleaned_gps.csv')

# Map every minute_id to its row position in the table as loaded;
# if a minute_id repeats, the last position is the one kept.
minute_ids = gps['minute_id'].tolist()
minute_id_to_index = dict(zip(minute_ids, range(len(minute_ids))))

In [3]:
# Turn the comma-joined ImageID text into one row per image.
gps = (
    gps.assign(ImageID=gps["ImageID"].str.split(","))
    .explode("ImageID")
    .reset_index()
)

# Strip list punctuation, quotes and whitespace left over from the split.
cleaned_ids = gps["ImageID"].str.replace(r'(\[|\]|\'|\s)', '', regex=True)

# Treat empty ids as missing, then drop every row that has no image id.
gps["ImageID"] = cleaned_ids.mask(cleaned_ids == "", np.nan)
gps = gps.loc[gps["ImageID"].notna()]

In [4]:
# Attach the visual concepts to every GPS row. The right join keeps all rows of
# `gps`, including images that have no visual-concept entry.
visual = pd.read_csv(r'../../original_data/lsc22_visual_concepts.csv')
both = visual.merge(gps, how="right", on='ImageID')

# Parallel lists of ids, in the row order of `both`.
all_minute_ids = both['minute_id'].tolist()
all_image_ids = both['ImageID'].tolist()

# image id -> row position in `both` (the last occurrence wins for repeated ids).
image_id_to_index = {}
for position, image_id in enumerate(all_image_ids):
    image_id_to_index[image_id] = position

# minute id -> every image id recorded for that minute, in row order.
minute_id_to_images = defaultdict(list)
for position, minute_id in enumerate(all_minute_ids):
    minute_id_to_images[minute_id].append(all_image_ids[position])

# VAISL results

In [6]:
# Load the semantic stops table (',' separator and '.' decimal are the pandas defaults).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
stops = pd.read_csv(
    r'/home/tlduyen/LSC22/process/VAISL/files/semantic_stops.csv'
)

In [7]:
import json
# cluster_to_name = [(metadata.index.values[:782].tolist(), "HOME", 53.38998, -6.1457602, True)]
# One entry per usable row of `stops`: (image_ids, name, lat, lon, is_stop).
cluster_to_name = []
all_images = set()

# `"movement" in row` tests the row's index labels, i.e. the frame's columns,
# so the answer is the same for every row and can be computed once.
has_movement = "movement" in stops.columns

for index, row in stops.iterrows():
    try:
        # read_csv parses empty / "nan" cells as float NaN, so check for a missing
        # value as well as the literal string before calling str methods on it.
        if pd.isna(row["first"]) or row["first"] == "nan":
            continue
        # Normalise the first/last image names to "<stem>.jpg" and look up their
        # row positions in `both`.
        start = image_id_to_index[row["first"].strip('[], ').split('.')[0] + ".jpg"]
        end = image_id_to_index[row["last"].strip('[] ').split('.')[0] + ".jpg"]
        assert start <= end, "wrong order"
        # Every image between the first and last one (inclusive) belongs to this cluster.
        image_ids = all_image_ids[start : end+1]
        # Non-stop rows are named after their movement type when that column exists;
        # everything else keeps the check-in name.
        if has_movement and not row["stop"]:
            name = row["movement"]
        else:
            name = row["checkin"]
        cluster_to_name.append((image_ids, name, row["lat"], row["lon"], row["stop"]))
    except Exception:
        # Show the offending row, then re-raise with the original traceback intact.
        print(row)
        raise

# Ground Truth

In [133]:
# Load the ground-truth segment table (',' separator and '.' decimal are the pandas defaults).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df = pd.read_csv(
    r'/home/tlduyen/LSC22/process/VAISL/files/segment_df.csv'
)

In [134]:
# One entry per usable row of `df`: (image_ids, name, lat, lon, is_stop).
# Every ground-truth segment is recorded with is_stop=True.
cluster_to_name = []
all_images = set()

for index, row in df.iterrows():
    try:
        # read_csv parses empty / "nan" cells as float NaN, so check for a missing
        # value as well as the literal string before calling str methods on it.
        if pd.isna(row["first"]) or row["first"] == "nan":
            continue
        # Normalise the first/last image names to "<stem>.jpg" and look up their
        # row positions in `both`.
        start = image_id_to_index[row["first"].strip('[], ').split('.')[0] + ".jpg"]
        end = image_id_to_index[row["last"].strip('[] ').split('.')[0] + ".jpg"]
        assert start <= end, "wrong order"
        # Every image between the first and last one (inclusive) belongs to this segment.
        image_ids = all_image_ids[start : end+1]
        cluster_to_name.append((image_ids, row["checkin"], row["lat"], row["lon"], True))
    except Exception:
        # Show the offending row, then re-raise with the original traceback intact.
        print(row)
        raise

# Assign results

In [8]:
# classify() looks rows up by image id, so ImageID must be the index.
# Guarded so that re-running this cell does not fail once ImageID has already
# been moved into the index.
if both.index.name != "ImageID":
    both = both.set_index("ImageID")


def classify(params):
    """Expand one cluster into per-image result rows.

    Parameters
    ----------
    params : tuple
        ``(index, cluster)`` as produced by ``enumerate(cluster_to_name)``; the
        index is not used. ``cluster`` is
        ``(image_ids, name, centre_lat, centre_lon, is_stop)``.

    Returns
    -------
    list of list
        One ``[image_id, name, lat, lon, is_stop]`` entry per image id.

    Notes
    -----
    For a non-stop cluster each image takes its own latitude/longitude from
    ``both``. For a stop cluster an image with a non-NaN latitude also takes its
    own coordinates, and those coordinates then replace the cluster centre for
    the following images of the same cluster that have no latitude.
    """
    _, cluster = params
    image_ids, name, centre_lat, centre_lon, is_stop = cluster
    results = []
    for image_id in image_ids:
        latitude = both.loc[image_id, "latitude"]
        if not is_stop or not np.isnan(latitude):
            centre_lat = latitude
            centre_lon = both.loc[image_id, "longitude"]
        results.append([image_id, name, centre_lat, centre_lon, is_stop])
    return results


# Fan the clusters out over all cores; imap_unordered means the per-cluster
# results come back in arbitrary order.
with mp.Pool(mp.cpu_count()) as pool:
    results = list(tqdm(pool.imap_unordered(classify, enumerate(cluster_to_name)), total=len(cluster_to_name)))
# Flatten the per-cluster lists into one list of per-image rows.
results = [r for res in results for r in res]
len(results)

  0%|          | 0/9206 [00:00<?, ?it/s]

723329

In [9]:
# Transpose the per-image rows into one tuple per field.
image_ids_all, new_names, lats, lons, is_stops = zip(*results)

# Seed the output columns: coordinates default to the recorded GPS values,
# names to None, and the stop flag to the sentinel "ERR".
both["new_lat"] = both["latitude"]
both["new_long"] = both["longitude"]
both["new_name"] = [None] * len(both)
both["stop"] = ["ERR"] * len(both)


def get_column(params):
    """Build the full list of values for one output column of ``both``.

    Parameters
    ----------
    params : tuple
        ``(label, values)`` where ``label`` is a column of ``both`` and
        ``values`` is aligned with ``image_ids_all``.

    Returns
    -------
    list
        One value per row of ``both``: the classified value for rows whose
        position appears in ``image_id_to_index`` for a classified image,
        otherwise the value already stored in ``both[label]``.
    """
    label, values = params
    # Read the existing column once instead of materialising a whole row
    # Series per position with both.iloc[i][label].
    current = both[label].tolist()
    overrides = {
        image_id_to_index[image_id]: value
        for image_id, value in zip(image_ids_all, values)
    }
    return [overrides.get(position, fallback) for position, fallback in enumerate(current)]


# imap (ordered) keeps the results in the same order as the task list below.
with mp.Pool(mp.cpu_count()) as pool:
    rr = list(tqdm(pool.imap(get_column, [("new_name", new_names),
                                          ("new_lat", lats),
                                          ("new_long", lons),
                                          ("stop", is_stops)]), total=4))

  0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Overwrite the leading entries of every computed column with fixed "HOME" values,
# then store the columns on `both`.
# NOTE(review): the count of 240 rows is hard-coded — confirm it still matches the data.
HOME_ROW_COUNT = 240
home_values = ("HOME", 53.38998, -6.1457602, True)
target_columns = ("new_name", "new_lat", "new_long", "stop")

for position, home_value in enumerate(home_values):
    rr[position] = [home_value] * HOME_ROW_COUNT + rr[position][HOME_ROW_COUNT:]

for column_name, column_values in zip(target_columns, rr):
    both[column_name] = column_values

In [11]:
# Fill gaps in the assigned coordinates with the most recent earlier value.
for coord_column in ("new_lat", "new_long"):
    both[coord_column] = both[coord_column].ffill()

# Move the current index back into a regular column and blank out missing names.
both = both.reset_index()
both['new_name'] = both['new_name'].fillna('')

In [1]:
# Load the stops table (',' separator and '.' decimal are the pandas defaults).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
stops = pd.read_csv(
    r'/home/tlduyen/LSC22/process/VAISL/files/stops.csv'
)

NameError: name 'pd' is not defined

In [27]:
# Keep only the stops whose 'images' text contains the literal substring
# '20190208_17' (case-insensitive; missing values count as no match).
mentions_prefix = stops['images'].str.contains('20190208_17', case=False, regex=False, na=False)
stops = stops.loc[mentions_prefix]
stops

       Unnamed: 0  index  inside   stop         movement stop_label  \
0               0      0    True   True           Inside       01_0   
1               1      1    True   True           Inside     INSIDE   
2               2      2    True   True           Inside       01_0   
3               3      3    True   True           Inside     INSIDE   
4               4      4    True   True           Inside       01_0   
...           ...    ...     ...    ...              ...        ...   
11449       11449  11449   False   True  Walking Outside       30_0   
11450       11450  11450   False  False  Walking Outside        NaN   
11451       11451  11451   False   True           Inside       30_0   
11452       11452  11452   False  False              Car        NaN   
11453       11453  11453   False   True           Inside       30_0   

             lat       lon  max_radius  \
0      53.389991 -6.145720    9.810891   
1            NaN       NaN   50.000000   
2      53.389997 -6.1

In [16]:
# Hand the stops table to the project helper and replace `both` with its result.
# NOTE(review): presumably this packages the inline "Assign results" steps of this
# notebook — confirm against scripts/agg_stops.py.
from scripts.agg_stops import assign_to_images
both = assign_to_images(stops)

  0%|          | 0/9203 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
# Reload the previously saved metadata (',' separator and '.' decimal are the pandas defaults).
both = pd.read_csv(
    r'files/final_metadata.csv'
)

In [9]:
# Keep only the rows whose ImageID contains the literal substring '20190208_174'
# (case-insensitive; missing values count as no match).
id_matches = both['ImageID'].str.contains('20190208_174', case=False, regex=False, na=False)
both = both.loc[id_matches]
both

       Unnamed: 0.1                  ImageID  Unnamed: 0  \
54341         54341  20190208_174030_000.jpg       54341   
54342         54342  20190208_174102_000.jpg       54342   
54343         54343  20190208_174134_000.jpg       54343   
54344         54344  20190208_174206_000.jpg       54344   
54345         54345  20190208_174238_000.jpg       54345   
54346         54346  20190208_174310_000.jpg       54346   
54347         54347  20190208_174342_000.jpg       54347   
54348         54348  20190208_174414_000.jpg       54348   
54349         54349  20190208_174446_000.jpg       54349   
54350         54350  20190208_174611_000.jpg       54350   
54351         54351  20190208_174643_000.jpg       54351   
54352         54352  20190208_174715_000.jpg       54352   
54353         54353  20190208_174747_000.jpg       54353   
54354         54354  20190208_174819_000.jpg       54354   
54355         54355  20190208_174851_000.jpg       54355   
54356         54356  20190208_174923_000

# Get city names

In [13]:
# NOTE(review): star import — get_cities / get_countries are expected to come from
# here; an explicit import would make that dependency visible.
from scripts.map_apis import *


def _rounded_position(row, ndigits=3):
    """Return the row's (new_lat, new_long) rounded to `ndigits` decimals."""
    return round(row['new_lat'], ndigits), round(row['new_long'], ndigits)


# Resolve a city and a country for every row from its rounded coordinates.
both['city'] = both.progress_apply(lambda row: get_cities(*_rounded_position(row)), axis=1)
both['country'] = both.progress_apply(lambda row: get_countries(*_rounded_position(row)), axis=1)

  0%|          | 0/723329 [00:00<?, ?it/s]

  0%|          | 0/723329 [00:00<?, ?it/s]

In [14]:
from tzwhere import tzwhere
tz = tzwhere.tzwhere(forceTZ=True)


def _timezone_for_row(row):
    """Look up the timezone name for the row's coordinates rounded to 4 decimals."""
    latitude = round(row['new_lat'], 4)
    longitude = round(row['new_long'], 4)
    return tz.tzNameAt(latitude, longitude, forceTZ=True)


# One timezone lookup per row, with a progress bar.
both["new_timezone"] = both.progress_apply(_timezone_for_row, axis=1)


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



  0%|          | 0/723329 [00:00<?, ?it/s]

In [15]:
# Persist the final metadata. The positional index is not written: when the file
# is read back it would otherwise come back as an extra unnamed column
# (e.g. "Unnamed: 0.1").
both.to_csv('files/final_metadata.csv', index=False)

ModuleNotFoundError: No module named 'gps_utils'