In [4]:
# Import libraries
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from dateutil import tz
import json
from nltk import ngrams

# CREATE DICT

In [12]:
# Load the collection of image ids to exclude. A context manager is used so the
# file handle is closed deterministically instead of being left to the GC.
with open("files/unhelpful_images.json") as unhelpful_file:
    unhelpful_images = json.load(unhelpful_file)

# Per-image metadata table; every later cell reads from / writes to this frame.
metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')

  metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')


In [13]:
# Replace missing check-in / city values with empty strings so the string
# operations applied to these columns later never receive NaN.
for column in ("checkin", "city"):
    metadata[column] = metadata[column].fillna("")

In [14]:
# Administrative qualifiers that are stripped from city strings.
words_to_remove = [
    "Province",
    "Área metropolitana de Madrid y Corredor del Henares",
    "Community of",
    "The Municipal District of",
    "Kreis",
    "Landkreis",
    "Regional Unit",
    "Municipal Unit",
    "Municipality",
    "Administrative District",
    "Region of",
    "Provence-Alpes-Côte d'Azur",
    "Municipal Borough District",
    "Subdistrict Administrative Organization",
    "Subdistrict",
    "District",
    "Distretto di",
    "Municipal District",
    "City",
    "Land ",
    "Urban agglomeration",
]
# Longest phrases first, so that e.g. "Municipal District" is removed before its
# substring "District". The sort is stable: equal-length entries keep list order.
words_to_remove.sort(key=len, reverse=True)

In [15]:
def remove_extra(city):
    """Strip administrative qualifiers from a comma-separated place string.

    Each comma-separated part has every phrase in the module-level
    ``words_to_remove`` deleted, then every occurrence of ``"of "`` deleted,
    and is then whitespace-stripped. Parts that end up empty, or that repeat an
    earlier part, are dropped.

    Args:
        city: Comma-separated place string.

    Returns:
        The surviving parts, in their original order, joined with ``", "``.
    """
    cleaned_parts = []
    for part in city.split(","):
        for phrase in words_to_remove:
            part = part.replace(phrase, "")
        part = part.replace("of ", "").strip()
        # Keep only non-empty parts that have not been seen yet.
        if part and part not in cleaned_parts:
            cleaned_parts.append(part)
    return ", ".join(cleaned_parts)
        
# Clean every city string element-wise with remove_extra.
metadata["city"] = metadata["city"].map(remove_extra)

## New timezone processing

In [16]:
import geojson

# Country features; only the entries whose properties["ADMIN"] name is needed
# are kept further down. The context manager closes the file after parsing.
with open("files/countries.geojson") as countries_file:
    country_geojson = geojson.load(countries_file)

In [17]:
# Distinct values of the metadata "country" column, used to pick geojson features.
all_countries = {country_name for country_name in metadata["country"].tolist()}

In [18]:
# Map country name -> geojson feature for every country that appears in the
# metadata. "United Kingdom" and "South Korea" are always kept because the
# aliases added at the end are built from them.
geojson_data = {}
for country in country_geojson["features"]:
    name = country["properties"]["ADMIN"]
    if name not in all_countries and name not in ("United Kingdom", "South Korea"):
        continue
    geojson_data[name] = country

# Alias entries that share the feature of the canonical geojson name.
geojson_data.update({
    "Korea": geojson_data["South Korea"],
    "England": geojson_data["United Kingdom"],
})

In [19]:
# Persist the selected country features. The context manager guarantees the
# file is flushed and closed once the dump finishes.
with open("files/backend/countries.json", "w") as countries_out:
    json.dump(geojson_data, countries_out)

In [20]:
# Forward-fill the timezone so rows without one inherit the previous row's value.
metadata["new_timezone"] = metadata["new_timezone"].ffill()

# Missing country / OCR values become empty strings.
for column in ("country", "OCR"):
    metadata[column] = metadata[column].fillna("")

# location_info takes "categories" when the row's "stop" value is truthy and
# "checkin" otherwise; anything still missing becomes an empty string.
metadata["location_info"] = metadata.apply(
    lambda record: record["categories"] if record["stop"] else record["checkin"],
    axis=1,
).fillna("")

In [29]:
# `tqdm.tqdm_notebook` is deprecated (it emits a TqdmDeprecationWarning asking
# for `tqdm.notebook.tqdm`), so import the notebook progress bar directly.
from tqdm.notebook import tqdm

# Per-image records, keyed by the path string produced by to_full_key.
info_dict = {}
def to_full_key(image):
    """Build the relative path key for an image file name.

    The key is ``<first six characters>/<characters 6-7>/<image>``, e.g.
    ``20200301_081459_000.jpg`` -> ``202003/01/20200301_081459_000.jpg``.
    """
    top_folder, sub_folder = image[:6], image[6:8]
    return "/".join((top_folder, sub_folder, image))

def to_local_time(utc_time, time_zone):
    """Convert an aware datetime to the zone named by ``time_zone``.

    Args:
        utc_time: Timezone-aware datetime to convert.
        time_zone: Zone name handed to ``dateutil.tz.gettz``.

    Returns:
        ``utc_time`` expressed in the requested zone.
    """
    target_zone = tz.gettz(time_zone)
    return utc_time.astimezone(target_zone)

def seconds_from_midnight(time):
    """Return the seconds elapsed since 00:00:00 according to ``time``'s clock fields.

    Only ``time.hour``, ``time.minute`` and ``time.second`` are read.
    """
    minutes_elapsed = time.hour * 60 + time.minute
    return minutes_elapsed * 60 + time.second

# Loop invariants, hoisted so they are computed once rather than for every row:
#  - the UTC tzinfo object attached to every parsed minute_id;
#  - a set of the unhelpful image ids, giving constant-time membership tests
#    (set() over a list yields its items, over a dict its keys, so `in`
#    answers the same question as it did on the original container).
utc_zone = tz.gettz('UTC')
unhelpful_image_ids = set(unhelpful_images)

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image = row['ImageID']
    # Skip rows whose ImageID is not a string, and images flagged as unhelpful.
    if not isinstance(image, str) or image in unhelpful_image_ids:
        continue

    image = to_full_key(image)
    # minute_id carries no seconds; "00" is appended so it parses with %S.
    utc_time = datetime.strptime(row["minute_id"] + "00", "%Y%m%d_%H%M%S").replace(tzinfo=utc_zone)
    local_time = to_local_time(utc_time, row["new_timezone"])
    info_dict[image] = {
        "image_path": image,
        "minute_id": row["minute_id"],
        "time": local_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "utc_time": utc_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "weekday": local_time.strftime("%A").lower(),
        # NOTE(review): this is a list when Tags is a string but "" otherwise;
        # kept as-is so the stored schema does not change.
        "descriptions": row['Tags'].lower().split(',') if isinstance(row['Tags'], str) else "",
        "address": row["city"],
        "location": row["checkin"] if row["stop"] else "None",
        "location_info": row["location_info"],
        "gps": {"lat": row["new_lat"],
                "lon": row["new_lng"]},
        "region": row["city"].lower().split(', '),
        "country": row["country"].lower(),
        "ocr": str(row["OCR"]).split(','),
        "timestamp": utc_time.timestamp(),
        "seconds_from_midnight": seconds_from_midnight(local_time)
    }

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(metadata.iterrows(), total=len(metadata)):


  0%|          | 0/723329 [00:00<?, ?it/s]

In [30]:
from unidecode import unidecode

# Transliterate the free-text location fields to ASCII with unidecode.
fields_to_fix = ["address", "location", "region"]
for image in info_dict:
    record = info_dict[image]
    for field in fields_to_fix:
        value = record[field]
        if isinstance(value, str):
            record[field] = unidecode(value)
        elif isinstance(value, list):
            record[field] = [unidecode(item) for item in value]
        elif np.isnan(value):
            # A NaN value is replaced by the "NONE" marker string.
            record[field] = "NONE"
        else:
            # Anything else is left untouched and reported for inspection.
            print(field, value)

In [5]:
# Reload info_dict from disk, replacing whatever is currently in memory.
# NOTE(review): files/info_dict.json is written by the json.dump cell further
# down, so on a fresh kernel this relies on a file saved by an earlier run.
with open("files/info_dict.json") as info_file:
    info_dict = json.load(info_file)

In [6]:
# Spot-check: display the stored record for a single image key.
# NOTE(review): the saved output includes 'scene' and 'group', which are only
# assigned by the scene-building cell below, so the loaded file presumably came
# from a previous full run - confirm when re-running on a fresh kernel.
info_dict["202003/01/20200301_081459_000.jpg"]

{'image_path': '202003/01/20200301_081459_000.jpg',
 'minute_id': '20200301_0814',
 'time': '2020/03/01 08:14:00+0000',
 'utc_time': '2020/03/01 08:14:00+0000',
 'weekday': 'sunday',
 'descriptions': ['text',
  'outdoor',
  'road',
  'sky',
  'tree',
  'street',
  'way',
  'highway',
  'car'],
 'address': 'Dublin, Ireland, Leinster',
 'location': 'None',
 'location_info': 'Car',
 'gps': {'lat': 53.37971829369007, 'lon': -6.174530699771785},
 'region': ['dublin', 'ireland', 'leinster'],
 'country': 'ireland',
 'ocr': ['09-D-27845', 'ELLL'],
 'timestamp': 1583050440.0,
 'seconds_from_midnight': 29640,
 'scene': 'S_131984',
 'group': 'G_6998'}

In [7]:
# Group -> scenes -> images segmentation. Each entry of groups[g]["scenes"] is
# unpacked below as a (scene_name, images) pair.
with open('files/group_segments.json') as groups_file:
    groups = json.load(groups_file)
scene_info = {}

assigned = []  # every image that ended up inside some scene
count = 0      # number of image -> scene assignments performed
for group_name in groups:
    valid_scenes = []
    for scene_name, images in groups[group_name]["scenes"]:
        # Keep only the images that have a record in info_dict.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        first, last = info_dict[images[0]], info_dict[images[-1]]
        valid_scenes.append(scene_name)
        scene_info[scene_name] = {
            "group": group_name,
            "images": images,
            "start_time": first["time"],
            "end_time": last["time"],
            "start_timestamp": first["timestamp"],
            "end_timestamp": last["timestamp"],
            "start_seconds_from_midnight": first["seconds_from_midnight"],
            "end_seconds_from_midnight": last["seconds_from_midnight"],
            # Elapsed time is taken from the absolute timestamps: the
            # seconds-from-midnight difference turns negative when a scene
            # crosses midnight (or a UTC-offset change). Both sources are
            # minute-resolution, so ordinary scenes get the same value.
            "duration": int(last["timestamp"] - first["timestamp"]) + 1,
        }
        # Scene-level attributes are taken from the first image of the scene.
        for key in ["location", "location_info", "region", "country", "weekday"]:
            scene_info[scene_name][key] = first[key]

        # One GPS entry per image, in image order.
        scene_info[scene_name]["gps"] = [info_dict[image]["gps"] for image in images]

        # NOTE(review): because of the `not in merged` guard every text is
        # counted exactly once, so most_common(10) returns the first ten
        # distinct OCR strings rather than the ten most frequent. Behaviour is
        # kept unchanged here - confirm which one is intended.
        merged = Counter()
        for image in images:
            for text in info_dict[image]["ocr"]:
                if text not in merged:
                    merged[text] += 1
        scene_info[scene_name]["ocr"] = [a for a, _ in merged.most_common(10)]

        # Back-reference: record on each image which scene/group it belongs to.
        for image in images:
            info_dict[image]["scene"] = scene_name
            info_dict[image]["group"] = group_name
            count += 1
            assigned.append(image)

    # A group whose scenes were all filtered out has nothing to summarise;
    # indexing valid_scenes[-1] would raise IndexError for it.
    if not valid_scenes:
        continue
    group_duration = int(
        scene_info[valid_scenes[-1]]["end_timestamp"]
        - scene_info[valid_scenes[0]]["start_timestamp"]
    ) + 1
    for scene in valid_scenes:
        scene_info[scene]["group_duration"] = group_duration

# Compare the number of distinct assigned images with the number of records.
assigned_images = set(assigned)
print(len(assigned_images), len(info_dict))
# Original author's note: "I think there's something wrong here".
# Records that were never placed in any scene are dropped from info_dict.
if len(assigned_images) < len(info_dict):
    to_remove = set(info_dict) - assigned_images
    for img in to_remove:
        del info_dict[img]

713861 713861


In [8]:
# Spot-check: print the full record built for one scene.
print(scene_info["S_131984"])

{'group': 'G_6998', 'images': ['202003/01/20200301_081459_000.jpg', '202003/01/20200301_081531_000.jpg', '202003/01/20200301_081603_000.jpg', '202003/01/20200301_081635_000.jpg', '202003/01/20200301_081707_000.jpg', '202003/01/20200301_081739_000.jpg', '202003/01/20200301_081811_000.jpg', '202003/01/20200301_081843_000.jpg', '202003/01/20200301_081915_000.jpg', '202003/01/20200301_081947_000.jpg', '202003/01/20200301_082019_000.jpg', '202003/01/20200301_082051_000.jpg', '202003/01/20200301_082123_000.jpg', '202003/01/20200301_082155_000.jpg', '202003/01/20200301_082227_000.jpg', '202003/01/20200301_082259_000.jpg', '202003/01/20200301_082331_000.jpg', '202003/01/20200301_082403_000.jpg', '202003/01/20200301_082435_000.jpg', '202003/01/20200301_082507_000.jpg', '202003/01/20200301_082552_000.jpg', '202003/01/20200301_082624_000.jpg', '202003/01/20200301_082656_000.jpg', '202003/01/20200301_082728_000.jpg', '202003/01/20200301_082800_000.jpg', '202003/01/20200301_082832_000.jpg', '202003

In [9]:
import json
# Persist both dictionaries. Context managers make sure each file is flushed
# and closed as soon as its dump completes.
with open("files/info_dict.json", "w") as info_out:
    json.dump(info_dict, info_out)
with open("files/scene_dict.json", "w") as scene_out:
    json.dump(scene_info, scene_out)

# PREPARE BACKEND

In [16]:
# Distinct, normalised location names; the "none" placeholder is not a location.
locations = {img["location"].lower().strip() for img in info_dict.values()}
locations.discard("none")
extra = set()  # unused in the visible cells; kept in case a later cell reads it
location_with_extras = {}
# Iterate in sorted order so the written file is identical between runs.
for loc in sorted(locations):
    if not loc:
        continue
    words = loc.split()
    # Word n-grams of length 2 .. len(words) - 1. The bound is the number of
    # WORDS: using the character count (len(loc)) also produced the full-length
    # n-gram, which duplicated `loc` itself in the list.
    partial_phrases = [
        " ".join(ngram)
        for lengram in range(2, len(words))
        for ngram in ngrams(words, lengram)
    ]
    # Full name first, followed by the partial phrases in reverse build order.
    location_with_extras[loc] = (partial_phrases + [loc])[::-1]

with open('files/backend/locations.json', 'w') as locations_file:
    json.dump(location_with_extras, locations_file)
print(len(locations))

718


In [17]:
# Sanity check: is "home" among the normalised location names?
"home" in locations

True

In [18]:
# Distinct, normalised region names across all images.
regions = {
    loc.lower().strip()
    for img in info_dict.values()
    for loc in img["region"]
}
# Written in sorted order so the file is identical between runs (set iteration
# order for strings is not stable across interpreter sessions); the context
# manager closes the file once the dump completes.
with open('files/backend/regions.json', 'w') as regions_file:
    json.dump(sorted(regions), regions_file)

In [20]:
# Emit the regions as a JavaScript module with a default export for the UI.
regions_module = "var regions=" + json.dumps(list(regions)) + ";\n\nexport default regions;"
with open("../UI/src/regions.js", 'w') as js_file:
    js_file.write(regions_module)

In [21]:
def filter_dict(image):
    """Return a reduced copy of ``info_dict[image]``.

    Only the keys ``group``, ``scene``, ``time``, ``gps``, ``location`` and
    ``location_info`` are kept.

    Raises:
        KeyError: if ``image`` is not in ``info_dict`` or a kept key is missing.
    """
    record = info_dict[image]
    kept_keys = ["group", "scene", "time", "gps", "location", "location_info"]
    return {key: record[key] for key in kept_keys}

# Reduced per-image records (see filter_dict) for the backend.
basic_dict = {image: filter_dict(image) for image in info_dict}
# The context manager flushes and closes the file once the dump completes.
with open('files/backend/basic_dict.json', 'w') as basic_file:
    json.dump(basic_dict, basic_file)

In [22]:
# Display time ranges, keyed by scene name and by group name.
time_info = {}


def get_hour_minute(date_string):
    """Reformat a ``%Y/%m/%d %H:%M:00%z`` time string as a 12-hour clock label.

    Args:
        date_string: Time string such as ``"2020/03/01 08:14:00+0000"``.

    Returns:
        The time of day formatted with ``%I:%M%p`` (e.g. ``"08:14AM"``).
    """
    return datetime.strptime(date_string, "%Y/%m/%d %H:%M:00%z").strftime("%I:%M%p")

def get_final_time(first_info, last_info):
    """Describe a time span: one label when both ends match, else ``"first - last"``."""
    return first_info if first_info == last_info else f"{first_info} - {last_info}"

for group_name in groups:
    group_first_info = None
    group_last_info = None
    for scene_name, images in groups[group_name]["scenes"]:
        # Same filtering as the scene-building cell: images without a record
        # in info_dict are ignored, and a scene left with no images is skipped
        # instead of failing on images[0] / a missing key.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        first_info = info_dict[images[0]]["time"]
        last_info = info_dict[images[-1]]["time"]
        # The group's span starts at its first usable scene ...
        if group_first_info is None:
            group_first_info = first_info
        # ... and ends at its last one.
        group_last_info = last_info
        time_info[scene_name] = get_final_time(get_hour_minute(first_info), get_hour_minute(last_info))
    # A group with no usable scene has no time span to report.
    if group_first_info is None:
        continue
    time_info[group_name] = get_final_time(get_hour_minute(group_first_info), get_hour_minute(group_last_info))

# The context manager flushes and closes the file once the dump completes.
with open("files/backend/time_info.json", "w") as time_info_file:
    json.dump(time_info, time_info_file)

In [23]:
# Spot-check: display the time-range label stored for one scene.
time_info["S_36"]

'11:05AM - 11:11AM'