In [1]:
# Import libraries
from sklearn.cluster import DBSCAN
import numpy as np
import bamboolib as bam
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from dateutil import tz
import json
import geopy.distance
from nltk import ngrams
import joblib
import os

# CREATE DICT

In [8]:
# Load the collection of images to exclude and the per-image metadata table.
# The context manager closes the JSON file as soon as it has been parsed.
with open("files/unhelpful_images.json") as unhelpful_file:
    unhelpful_images = json.load(unhelpful_file)
metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')

In [9]:
# Replace missing check-in and city values with empty strings so the string
# operations applied to these columns later never receive NaN.
for text_column in ("checkin", "city"):
    metadata[text_column] = metadata[text_column].fillna("")

In [None]:
# Administrative qualifiers that are stripped from place names.
# The list is ordered longest-first so that a long phrase (e.g.
# "Municipal District") is removed before any shorter phrase it contains
# (e.g. "District"). The sort is stable, so equal-length entries keep the
# order in which they are written here.
words_to_remove = sorted(
    [
        "Province",
        "Área metropolitana de Madrid y Corredor del Henares",
        "Community of",
        "The Municipal District of",
        "Kreis",
        "Landkreis",
        "Regional Unit",
        "Municipal Unit",
        "Municipality",
        "Administrative District",
        "Region of",
        "Provence-Alpes-Côte d'Azur",
        "Municipal Borough District",
        "Subdistrict Administrative Organization",
        "Subdistrict",
        "District",
        "Distretto di",
        "Municipal District",
        "City",
        "Land ",
        "Urban agglomeration",
    ],
    key=len,
    reverse=True,
)

In [None]:
def remove_extra(city):
    """Strip administrative qualifiers from a comma-separated place string.

    Every comma-separated part has each entry of ``words_to_remove`` and the
    substring "of " deleted and is then trimmed. Parts that end up empty or
    that repeat an earlier part are dropped. The remaining parts are joined
    with ", " in their original order.
    """
    cleaned_parts = []
    for part in city.split(","):
        for qualifier in words_to_remove:
            part = part.replace(qualifier, "")
        part = part.replace("of ", "").strip()
        if not part or part in cleaned_parts:
            continue
        cleaned_parts.append(part)
    return ", ".join(cleaned_parts)

metadata["city"] = metadata["city"].apply(remove_extra)

## New timezone processing

In [10]:
import geojson

# Load the countries GeoJSON; the context manager closes the file once parsed.
with open("files/countries.geojson") as countries_file:
    country_geojson = geojson.load(countries_file)

In [11]:
# Distinct values of the "country" column (may include NaN for missing rows).
all_countries = set(metadata["country"])

In [12]:
# Keep only the features whose ADMIN name occurs in the metadata, plus the two
# names that are needed for the aliases created at the end of this cell.
wanted_names = all_countries | {"United Kingdom", "South Korea"}
geojson_data = {}
for country in country_geojson["features"]:
    name = country["properties"]["ADMIN"]
    if name not in wanted_names:
        continue
    geojson_data[name] = country
# Expose the same features under the alternative names "Korea" and "England".
geojson_data["Korea"] = geojson_data["South Korea"]
geojson_data["England"] = geojson_data["United Kingdom"]

In [13]:
# Persist the selected country features. Using `with` guarantees that the
# output is flushed and the handle closed when the cell finishes.
with open("files/backend/countries.json", "w") as countries_out:
    json.dump(geojson_data, countries_out)

In [15]:
# Carry the last known timezone forward over rows where it is missing.
metadata["new_timezone"] = metadata["new_timezone"].ffill()
# Missing country / OCR text becomes an empty string.
for text_column in ("country", "OCR"):
    metadata[text_column] = metadata[text_column].fillna("")
# location_info holds the categories for rows flagged as a stop and the
# check-in value otherwise; anything still missing becomes an empty string.
metadata["location_info"] = metadata.apply(
    lambda row: row["categories"] if row["stop"] else row["checkin"],
    axis=1,
).fillna("")

In [16]:
# `tqdm.auto` selects the notebook widget or the console progress bar as
# appropriate and replaces the deprecated `tqdm_notebook` alias.
from tqdm.auto import tqdm
# Maps each image's full key (see to_full_key) to its assembled record.
info_dict = {}
def to_full_key(image):
    """Build the key ``<first 6 chars>/<next 2 chars>/<image>`` for an image name.

    Example: "20190101_164846_000.jpg" -> "201901/01/20190101_164846_000.jpg".
    """
    outer_folder = image[:6]
    inner_folder = image[6:8]
    return "{}/{}/{}".format(outer_folder, inner_folder, image)

def to_local_time(utc_time, time_zone):
    """Convert an aware datetime to the zone named by ``time_zone``.

    NOTE(review): ``tz.gettz`` is documented to return None for a name it
    cannot resolve, in which case ``astimezone`` would use the system's local
    zone - confirm that every ``time_zone`` value is a valid zone name.
    """
    target_zone = tz.gettz(time_zone)
    return utc_time.astimezone(target_zone)

# Seconds elapsed since 00:00:00 for an object exposing hour/minute/second
def seconds_from_midnight(time):
    """Return ``hour * 3600 + minute * 60 + second`` for ``time``."""
    minutes_elapsed = time.hour * 60 + time.minute
    return minutes_elapsed * 60 + time.second

# Build the exclusion lookup once: membership is tested for every row, and a
# set keeps each test O(1) even when the JSON file holds a plain list.
unhelpful_lookup = set(unhelpful_images)
# The UTC zone is identical for every row, so resolve it a single time.
utc_zone = tz.gettz('UTC')

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image = row['ImageID']
    # Skip rows whose id is not a string (e.g. NaN) and excluded images.
    if not isinstance(image, str) or image in unhelpful_lookup:
        continue
    image = to_full_key(image)
    # minute_id has minute resolution; "00" supplies the seconds for parsing.
    utc_time = datetime.strptime(row["minute_id"] + "00", "%Y%m%d_%H%M%S").replace(tzinfo=utc_zone)
    local_time = to_local_time(utc_time, row["new_timezone"])
    info_dict[image] = {
        "image_path": image,
        "minute_id": row["minute_id"],
        "time": local_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "utc_time": utc_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "weekday": local_time.strftime("%A").lower(),
        # NOTE: falls back to an empty string (not an empty list) when Tags is
        # missing; kept as-is because consumers of this field are not visible here.
        "descriptions": row['Tags'].lower().split(',') if isinstance(row['Tags'], str) else "",
        "address": row["city"],
        # The check-in name is only used as the location for rows flagged as a stop.
        "location": row["checkin"] if row["stop"] else "None",
        "location_info": row["location_info"],
        "gps": {"lat": row["new_lat"],
                "lon": row["new_lng"]},
        "region": row["city"].lower().split(', '),
        "country": row["country"].lower(),
        "ocr": row["OCR"].split(', '),
        "timestamp": utc_time.timestamp(),
        "seconds_from_midnight": seconds_from_midnight(local_time)
    }

  0%|          | 0/723329 [00:00<?, ?it/s]

In [None]:
from unidecode import unidecode

# Transliterate the textual location fields of every record with unidecode.
fields_to_fix = ["address", "location", "region"]
for record in info_dict.values():
    for field in fields_to_fix:
        value = record[field]
        if isinstance(value, str):
            record[field] = unidecode(value)
        elif isinstance(value, list):
            record[field] = [unidecode(s) for s in value]
        elif np.isnan(value):
            # A NaN value is replaced by the "NONE" placeholder.
            record[field] = "NONE"
        else:
            # Anything else is unexpected: report it and leave it untouched.
            print(field, value)

In [17]:
# Spot-check: display the record stored under one full image key.
info_dict["201901/06/20190106_171105_000.jpg"]

{'image_path': '201902/08/20190208_172845_000.jpg',
 'minute_id': '20190208_1728',
 'time': '2019/02/08 20:28:00+0300',
 'utc_time': '2019/02/08 17:28:00+0000',
 'weekday': 'friday',
 'descriptions': ['text', 'person', 'indoor', 'store'],
 'address': 'Turkey, Marmara',
 'location': 'Istanbul Ataturk Airport',
 'location_info': 'Airport',
 'gps': {'lat': 40.984292, 'lon': 28.8156077},
 'region': ['turkey', 'marmara'],
 'country': 'turkey',
 'ocr': ['gặp,lại,ain,TRÀ,CHANH,CROS,KHUY,Chupa,Chúps,TMINT,DOUDLEMIN,ININTIN'],
 'timestamp': 1549646880.0,
 'ocr_score': {}}

In [18]:
# Load the group -> scenes -> images segmentation; the context manager closes
# the file once it has been parsed.
with open('files/group_segments.json') as groups_file:
    groups = json.load(groups_file)

def _nearest_located_images(all_groups, start_id, step):
    """Return every image of the closest group that has a known location.

    Starting next to group ``start_id``, walk the ``G_<id>`` keys in direction
    ``step`` (-1 for earlier groups, +1 for later ones) and return the images
    of all scenes of the first group whose "location" is not "NONE". Returns
    an empty list when the walk runs out of groups.
    """
    neighbour_id = start_id + step
    # Stop when the key is missing instead of relying on index arithmetic:
    # the previous bound `<= len(groups) - 1` never considered the last group
    # as an "after" candidate, while the "before" walk assumed ids start at 1.
    while f"G_{neighbour_id}" in all_groups:
        neighbour = all_groups[f"G_{neighbour_id}"]
        if neighbour["location"] != "NONE":
            return [image for scene in neighbour["scenes"] for image in scene[1]]
        neighbour_id += step
    return []


assigned = []
count = 0
for group_name in groups:
    # Group keys look like "G_<number>"; the number orders the groups.
    group_id = int(group_name.split('_')[-1])
    before = _nearest_located_images(groups, group_id, -1)
    after = _nearest_located_images(groups, group_id, 1)
    for scene_name, images in groups[group_name]["scenes"]:
        for image in images:
            if image in info_dict:
                info_dict[image]["scene"] = scene_name
                info_dict[image]["group"] = group_name
                # Each record gets its own copy of (at most) the first ten images.
                info_dict[image]["before"] = before[:10]
                info_dict[image]["after"] = after[:10]
                count += 1
                assigned.append(image)
            else:
                # Image is listed in a group but has no record in info_dict.
                print("Skipping", image)

assigned_images = set(assigned)
print(len(assigned_images), len(info_dict))
# Records that were never matched to a scene have no "scene"/"group"/"before"/
# "after" entries, so they are removed from info_dict.
if len(assigned_images) < len(info_dict):
    to_remove = set(info_dict) - assigned_images
    for img in to_remove:
        info_dict.pop(img)

714583 714583


In [19]:
# Spot-check: display one record after scene/group/before/after were attached.
info_dict["201901/01/20190101_164846_000.jpg"]

{'image_path': '201901/01/20190101_164846_000.jpg',
 'minute_id': '20190101_1648',
 'time': '2019/01/01 16:48:00+0000',
 'utc_time': '2019/01/01 16:48:00+0000',
 'weekday': 'tuesday',
 'descriptions': ['person',
  'food',
  'table',
  'plate',
  'indoor',
  'eating',
  'dessert',
  'meal'],
 'address': 'Dublin, Ireland, Leinster',
 'location': "Eddie Rocket's",
 'location_info': 'Burger Joint, Diner, Fast Food Restaurant',
 'gps': {'lat': 53.2828644, 'lon': -6.4222863},
 'region': ['dublin', 'ireland', 'leinster'],
 'country': 'ireland',
 'ocr': '',
 'timestamp': 1546361280.0,
 'ocr_score': {},
 'scene': 'S_250',
 'group': 'G_17',
 'before': ['201901/01/20190101_154407_000.jpg',
  '201901/01/20190101_154439_000.jpg',
  '201901/01/20190101_154511_000.jpg',
  '201901/01/20190101_154543_000.jpg',
  '201901/01/20190101_154615_000.jpg',
  '201901/01/20190101_154647_000.jpg',
  '201901/01/20190101_154719_000.jpg',
  '201901/01/20190101_154751_000.jpg',
  '201901/01/20190101_154823_000.jpg',


In [21]:
# Persist the full per-image dictionary. Using `with` guarantees the output is
# flushed and the handle closed when the cell finishes.
with open("files/info_dict.json", "w") as info_file:
    json.dump(info_dict, info_file)

# PREPARE BACKEND

In [22]:
# Distinct, lower-cased location names across all records.
locations = {img["location"].lower().strip() for img in info_dict.values()}
# "none" is the placeholder used for records without a location.
locations.discard("none")
extra = set()
# Maps each location to its search variants: the full name first, followed by
# its word n-grams from longest to shortest.
location_with_extras = {}
for loc in locations:
    if not loc:
        continue
    words = loc.split()
    variants = []
    # n is bounded by the number of words (the previous bound used the number
    # of characters, which only produced empty iterations beyond that).
    for lengram in range(2, len(words) + 1):
        for ngram in ngrams(words, lengram):
            phrase = " ".join(ngram)
            # The full name is appended exactly once below; skipping it here
            # avoids the duplicate entry the all-words n-gram used to create.
            if phrase != loc:
                variants.append(phrase)
    variants.append(loc)
    location_with_extras[loc] = variants[::-1]
with open('files/backend/locations.json', 'w') as locations_file:
    json.dump(location_with_extras, locations_file)
print(len(locations))

719


In [23]:
# Distinct, lower-cased region names across all records.
regions = {loc.lower().strip() for img in info_dict.values()
           for loc in img["region"]}
# Set iteration order is arbitrary, so the names are sorted to make the file
# identical from run to run; `with` closes the handle after writing.
with open('files/backend/regions.json', 'w') as regions_file:
    json.dump(sorted(regions), regions_file)

In [24]:
# Export the region names as a JavaScript module. The names are sorted because
# set iteration order is arbitrary and would otherwise change the file on
# every run.
with open("../UI/src/regions.js", 'w') as f:
    f.write("var regions=" + json.dumps(sorted(regions)) + ";\n\nexport default regions;")

In [30]:
def filter_dict(image):
    """Return the reduced record for ``image``.

    Only the "group", "scene", "time", "gps", "location" and "location_info"
    entries of ``info_dict[image]`` are kept; a missing entry raises KeyError.
    """
    record = info_dict[image]
    kept_keys = ("group", "scene", "time", "gps", "location", "location_info")
    return {key: record[key] for key in kept_keys}

# Reduced copy of info_dict holding only the fields selected by filter_dict.
basic_dict = {image: filter_dict(image) for image in info_dict}
# `with` guarantees the output is flushed and the handle closed.
with open('files/backend/basic_dict.json', 'w') as basic_file:
    json.dump(basic_dict, basic_file)

In [28]:
# Maps scene names and group names to a display string of their time span.
time_info = {}
def get_hour_minute(date_string):
    """Turn a "%Y/%m/%d %H:%M:00%z" timestamp into a 12-hour "HH:MMAM/PM" string.

    Example: "2019/01/01 16:48:00+0000" -> "04:48PM".
    """
    parsed = datetime.strptime(date_string, "%Y/%m/%d %H:%M:00%z")
    return format(parsed, "%I:%M%p")

def get_final_time(first_info, last_info):
    """Format a time span: a single value if both ends match, else "first - last"."""
    return first_info if first_info == last_info else f"{first_info} - {last_info}"

# Record the time span of every scene and of every group as a whole.
for group_name in groups:
    group_first_info = None
    group_last_info = None
    for scene_name, images in groups[group_name]["scenes"]:
        # Images listed in a group may have no record in info_dict; indexing
        # them directly would raise KeyError, so only known images are used.
        known_images = [image for image in images if image in info_dict]
        if not known_images:
            # Nothing to time for this scene (empty or entirely unknown).
            continue
        first_info = info_dict[known_images[0]]["time"]
        last_info = info_dict[known_images[-1]]["time"]
        if group_first_info is None:
            group_first_info = first_info
        group_last_info = last_info
        time_info[scene_name] = get_final_time(get_hour_minute(first_info), get_hour_minute(last_info))
    # A group without any timed scene gets no entry instead of crashing on None.
    if group_first_info is not None:
        time_info[group_name] = get_final_time(get_hour_minute(group_first_info), get_hour_minute(group_last_info))

# Persist the scene/group time spans. Using `with` guarantees the output is
# flushed and the handle closed when the cell finishes.
with open("files/backend/time_info.json", "w") as time_info_file:
    json.dump(time_info, time_info_file)

In [29]:
# Spot-check: display the formatted time span stored for one scene.
time_info["S_36"]

'11:02AM'