In [6]:
# Import libraries
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from dateutil import tz
import json
from nltk import ngrams
import bamboolib as bam

# CREATE DICT

In [7]:
# Load the collection of image names to skip and the metadata table.
# The context manager closes the JSON file as soon as it has been parsed
# instead of leaving the handle open until garbage collection.
with open("files/unhelpful_images.json") as f:
    unhelpful_images = json.load(f)
metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')

In [7]:
def preprocess(metadata):
    """Select the columns used by this notebook and normalise missing values.

    Args:
        metadata: DataFrame containing at least the columns listed in
            ``columns`` below. The input frame is not modified.

    Returns:
        A new DataFrame restricted to those columns, where
        * ``checkin``, ``city``, ``country`` and ``OCR`` have NaN replaced by "",
        * ``new_timezone`` values equal to 'uninhabited' (and NaN) are filled
          from the previous row, falling back to the next known value for
          leading gaps,
        * ``location_info`` holds ``categories`` where ``stop`` is truthy and
          "" otherwise.
    """
    columns = ['ImageID', 'minute_id', 'OCR', 'stop', 'movement', 'new_lat', 'new_lng',
               'checkin', 'city', 'country', 'parent', 'new_timezone', 'Tags', 'categories']
    # Explicit copy: assigning into a column slice of the caller's frame raises
    # SettingWithCopyWarning and may or may not write through to the original.
    metadata = metadata[columns].copy()

    for column in ("checkin", "city", "country", "OCR"):
        metadata[column] = metadata[column].fillna("")

    # 'uninhabited' must become NaN (not "") for ffill to replace it; ffill only
    # fills missing values, so an empty string would have been left untouched.
    timezones = metadata["new_timezone"].mask(metadata["new_timezone"] == "uninhabited")
    # bfill covers rows before the first known timezone.
    metadata["new_timezone"] = timezones.ffill().bfill()

    # Vectorised form of `categories if stop else ""` (same truthiness rule).
    is_stop = metadata["stop"].astype(bool)
    metadata["location_info"] = metadata["categories"].where(is_stop, "").fillna("")
    return metadata
# Rebind `metadata` to the preprocessed frame; the cells below use this version.
metadata = preprocess(metadata)

In [8]:
# Administrative qualifiers that get stripped out of the comma-separated city strings.
words_to_remove = sorted(
    (
        "District",
        "Province",
        "Área metropolitana de Madrid y Corredor del Henares",
        "Community of",
        "The Municipal District of",
        "Kreis",
        "Landkreis",
        "Regional Unit",
        "Municipal Unit",
        "Municipality",
        "Administrative District",
        "Region of",
        "Provence-Alpes-Côte d'Azur",
        "Municipal Borough District",
        "Subdistrict Administrative Organization",
        "Subdistrict",
        "District",
        "Distretto di",
        "Municipal District",
        "City",
        "Land ",
        "Urban agglomeration",
    ),
    # Longest phrases first, so a phrase is removed before any shorter phrase it contains.
    key=len,
    reverse=True,
)

In [9]:
def remove_extra(city, words=None):
    """Strip administrative qualifiers from a comma-separated place string.

    Args:
        city: Comma-separated place names. Non-string values yield "".
        words: Phrases to delete from every part. Defaults to the module-level
            ``words_to_remove``; phrases are removed in the order given.

    Returns:
        The cleaned parts joined with ", ", with empty parts and repeated
        parts dropped (first occurrence wins).
    """
    if not isinstance(city, str):
        return ""
    if words is None:
        words = words_to_remove

    cleaned_parts = []
    for name in city.split(","):
        for word in words:
            name = name.replace(word, "")
        # Drop a stand-alone "of" token instead of the raw substring "of ",
        # which also matched the tail of ordinary words ("Roof Garden").
        # Re-joining the tokens also collapses the double spaces left behind
        # by the removals above and trims the ends.
        name = " ".join(token for token in name.split() if token != "of")
        if name and name not in cleaned_parts:
            cleaned_parts.append(name)
    return ", ".join(cleaned_parts)
        
# Run remove_extra over every value of the city column and store the result back.
metadata["city"] = metadata["city"].apply(remove_extra)

## New timezone processing

In [10]:
import geojson
# Parse the country features; the file is closed as soon as parsing is done.
with open("files/countries.geojson") as f:
    country_geojson = geojson.load(f)

In [11]:
# Distinct values of the country column.
all_countries = {*metadata["country"].tolist()}

In [12]:
# Keep the GeoJSON features whose ADMIN name occurs in the metadata, plus the
# two countries that are re-registered under an alias below.
geojson_data = {}
for feature in country_geojson["features"]:
    admin_name = feature["properties"]["ADMIN"]
    wanted = admin_name in all_countries or admin_name in ("United Kingdom", "South Korea")
    if not wanted:
        continue
    geojson_data[admin_name] = feature

# Alternative names that map onto the same feature objects.
geojson_data["Korea"] = geojson_data["South Korea"]
geojson_data["England"] = geojson_data["United Kingdom"]

In [14]:
# Write inside a `with` block so the file is flushed and closed deterministically.
with open("files/backend/countries.json", "w") as f:
    json.dump(geojson_data, f)

In [15]:
from tqdm import tqdm_notebook as tqdm
info_dict = {}
def to_full_key(image):
    """Prefix an image name with two folder levels taken from its own text.

    The first six characters form the first folder and characters 7-8 the
    second, e.g. '20200301_081459_000.jpg' -> '202003/01/20200301_081459_000.jpg'.
    """
    first_folder = image[:6]
    second_folder = image[6:8]
    return "{}/{}/{}".format(first_folder, second_folder, image)

def to_local_time(utc_time, time_zone):
    """Return `utc_time` expressed in the zone that dateutil resolves for `time_zone`.

    NOTE(review): `tz.gettz` yields None for a name it cannot resolve, and
    `astimezone(None)` then converts to the system's local zone - confirm that
    callers only pass valid zone names.
    """
    target_zone = tz.gettz(time_zone)
    return utc_time.astimezone(target_zone)

# Seconds elapsed since 00:00:00, computed from the hour/minute/second fields
def seconds_from_midnight(time):
    minutes_elapsed = time.hour * 60 + time.minute
    return minutes_elapsed * 60 + time.second

# Build one info_dict entry per usable image row.

# Membership is tested once per row, so look names up in a set rather than
# scanning the JSON-loaded collection each time.
# (assumes its entries are hashable image-name strings - TODO confirm)
excluded_images = set(unhelpful_images)
# Resolve the UTC zone once instead of once per row.
utc_zone = tz.gettz('UTC')

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image = row['ImageID']
    # Skip rows whose ImageID is not a string and images on the exclusion list.
    if not isinstance(image, str) or image in excluded_images:
        continue

    image = to_full_key(image)
    # minute_id has no seconds component; "00" is appended so it parses with %S.
    utc_time = datetime.strptime(row["minute_id"]+"00", "%Y%m%d_%H%M%S").replace(tzinfo=utc_zone)
    local_time = to_local_time(utc_time, row["new_timezone"])
    tags = row['Tags']
    info_dict[image] = {
        "image_path": image,
        "minute_id": row["minute_id"],
        "time": datetime.strftime(local_time, "%Y/%m/%d %H:%M:00%z"),
        "utc_time": datetime.strftime(utc_time, "%Y/%m/%d %H:%M:00%z"),
        "weekday": datetime.strftime(local_time, "%A").lower(),
        # Always a list: rows without tags get [] instead of "" so the field
        # has one type for every image.
        "descriptions": tags.lower().split(',') if isinstance(tags, str) else [],
        "address": row["city"],
        "location": row["checkin"] if row["stop"] else "---",
        "location_info": row["location_info"],
        "gps": {"lat": row["new_lat"],
                "lon": row["new_lng"]},
        "region": row["city"].lower().split(', '),
        "country": row["country"].lower(),
        "ocr": str(row["OCR"]).split(','),
        "timestamp": utc_time.timestamp(),
        "seconds_from_midnight": seconds_from_midnight(local_time)
    }

  0%|          | 0/723329 [00:00<?, ?it/s]

In [34]:
from unidecode import unidecode

fields_to_fix = ["address", "location", "region", "location_info"]


def _to_ascii(field, value):
    """Return the ASCII transliteration of `value`.

    Strings go through unidecode, lists are transliterated element by element,
    and NaN becomes the placeholder "NONE". Anything else is printed together
    with its field name and returned unchanged.
    """
    if isinstance(value, str):
        return unidecode(value)
    if isinstance(value, list):
        return [unidecode(item) for item in value]
    if np.isnan(value):
        return "NONE"
    print(field, value)
    return value


# Rewrite the selected text fields of every image record in place.
for image, record in info_dict.items():
    for field in fields_to_fix:
        record[field] = _to_ascii(field, record[field])

In [17]:
# Checkpoint the dictionary; the `with` block guarantees the file is flushed
# and closed before anything tries to read it back.
with open("files/info_dict.json", "w") as f:
    json.dump(info_dict, f)
# To resume from this checkpoint:
# import json
# info_dict = json.load(open(f"files/info_dict.json"))

In [18]:
# Display a single entry to inspect the generated fields.
info_dict["202003/01/20200301_081459_000.jpg"]

{'image_path': '202003/01/20200301_081459_000.jpg',
 'minute_id': '20200301_0814',
 'time': '2020/03/01 08:14:00+0000',
 'utc_time': '2020/03/01 08:14:00+0000',
 'weekday': 'sunday',
 'descriptions': ['text',
  'outdoor',
  'road',
  'sky',
  'tree',
  'street',
  'way',
  'highway',
  'car'],
 'address': 'Dublin, Ireland, Leinster',
 'location': '---',
 'location_info': '',
 'gps': {'lat': 53.37971829369007, 'lon': -6.174530699771785},
 'region': ['dublin', 'ireland', 'leinster'],
 'country': 'ireland',
 'ocr': ['09-D-27845', 'ELLL'],
 'timestamp': 1583050440.0,
 'seconds_from_midnight': 29640}

In [35]:
# Build scene_info (one entry per scene) from the group/scene segmentation and
# tag every image in info_dict with its scene and group.
with open('files/group_segments.json') as f:
    groups = json.load(f)
scene_info = {}

assigned = []
count = 0
for group_name in groups:
    valid_scenes = []
    for scene_name, images in groups[group_name]["scenes"]:
        # Only images that have an info_dict entry can contribute to a scene.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        valid_scenes.append(scene_name)
        first_image = info_dict[images[0]]
        last_image = info_dict[images[-1]]
        scene_info[scene_name] = {
            "group": group_name,
            "images": images,
            "start_time": first_image["time"],
            "end_time": last_image["time"],
            "start_timestamp": first_image["timestamp"],
            "end_timestamp": last_image["timestamp"],
            "start_seconds_from_midnight": first_image["seconds_from_midnight"],
            "end_seconds_from_midnight": last_image["seconds_from_midnight"],
            # Derived from absolute timestamps: subtracting seconds-from-midnight
            # values goes negative when a scene spans local midnight.
            "duration": int(last_image["timestamp"] - first_image["timestamp"]) + 1,
        }
        # Scene-level attributes are copied from the first image of the scene.
        for key in ["location", "location_info", "region", "country", "weekday"]:
            scene_info[scene_name][key] = first_image[key]

        scene_info[scene_name]["gps"] = [info_dict[image]["gps"] for image in images]

        # Count every occurrence so most_common() really ranks by frequency.
        # The previous `if text not in merged` guard capped every count at 1.
        # NOTE(review): images without OCR contribute an empty string here.
        merged = Counter()
        for image in images:
            merged.update(info_dict[image]["ocr"])
        scene_info[scene_name]["ocr"] = [text for text, _ in merged.most_common(10)]

        for image in images:
            info_dict[image]["scene"] = scene_name
            info_dict[image]["group"] = group_name
            count += 1
            assigned.append(image)

    # A group whose scenes all ended up empty has nothing to summarise.
    if not valid_scenes:
        continue
    group_start = scene_info[valid_scenes[0]]["start_timestamp"]
    group_end = scene_info[valid_scenes[-1]]["end_timestamp"]
    group_duration = int(group_end - group_start) + 1
    for scene in valid_scenes:
        scene_info[scene]["group_duration"] = group_duration

print(len(set(assigned)), len(info_dict))
# Original author's note, still unresolved: "I THINK THERE'S SOMETHING WRONG HERE"
# Images that were not assigned to any scene are removed from info_dict.
if len(set(assigned)) < len(info_dict):
    to_remove = set(info_dict.keys()).difference(assigned)
    for img in to_remove:
        del info_dict[img]

713861 713861


In [36]:
import json
# Persist both dictionaries; each file is closed (and flushed) on leaving its block.
with open("files/info_dict.json", "w") as f:
    json.dump(info_dict, f)
with open("files/scene_dict.json", "w") as f:
    json.dump(scene_info, f)

# PREPARE BACKEND

## Location infos

In [37]:
import clip
import torch
# Use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Load the "ViT-B/32" CLIP model and its preprocessing callable on that device.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

def encode_query(main_query):
    """Encode one text query with the CLIP text encoder.

    The query is tokenised, encoded without gradient tracking, divided by its
    norm along the last dimension, and returned as a NumPy array on the CPU.
    """
    tokens = clip.tokenize([main_query]).to(device)
    with torch.no_grad():
        embedding = clip_model.encode_text(tokens)
        embedding = embedding / embedding.norm(dim=-1, keepdim=True)
    return embedding.cpu().numpy()

# Unique location names, lower-cased and trimmed, over all images.
locations = {record["location"].lower().strip() for record in info_dict.values()}
# Drop the lower-cased "NONE" placeholder if it is present.
locations.discard("none")
    
# TfidfVectorizer 
# CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Corpus for the TF-IDF model: one document per location name.
# Sorted so that row indices - and therefore tie-breaking when similarities
# are ranked - are identical on every run; iterating a set of strings has no
# stable order across interpreter sessions.
train = sorted(locations)
# instantiate the vectorizer object
tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
# convert the documents into a matrix
tfidf_wm = tfidfvectorizer.fit_transform(train)

# Find the most similar locations based on tfidf
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_locations_tfidf(location, top=5):
    """Return up to `top` entries of `train` most similar to `location`.

    Similarity is the cosine between the TF-IDF vector of `location` and each
    row of `tfidf_wm`. Results are ordered from most to least similar, and
    entries whose similarity is not positive are left out.
    """
    query_vector = tfidfvectorizer.transform([location])
    scores = cosine_similarity(query_vector, tfidf_wm)
    # Indices of the `top` highest scores (ascending order at this point).
    best_indices = scores.argsort().flatten()[-top:]
    ranked = sorted(
        ((train[idx], scores[0, idx]) for idx in best_indices),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [name for name, score in ranked if score > 0]

import nltk
from nltk import FreqDist
from nltk.corpus import brown
from nltk.corpus import stopwords

def get_ngram_freqdist(n):
    """Frequency distribution of word n-grams in the Brown corpus.

    Only purely alphabetic tokens are kept; they are lower-cased and English
    stop words are removed before the n-grams of length `n` are formed.
    """
    corpus_tokens = brown.words()
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in corpus_tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in english_stops:
            kept.append(lowered)

    # FreqDist counts the n-gram tuples produced from the filtered tokens.
    return FreqDist(nltk.ngrams(kept, n))

# The 500 most frequent 2-, 3- and 4-grams, concatenated in that order.
ngrams_list = [
    gram
    for size in range(2, 5)
    for gram, _ in get_ngram_freqdist(size).most_common(500)
]

from nltk.util import ngrams
from nltk.tokenize import MWETokenizer
# Multi-word-expression tokenizer seeded with the frequent n-grams collected above.
tokenizer = MWETokenizer(ngrams_list)
# Register one extra expression by hand.
tokenizer.add_mwe(('chiang', 'mai'))
# English stop words, used below to trim candidate phrases.
stop_words = set(stopwords.words('english'))

from itertools import combinations

# For every location name, collect shorter phrases that can stand in for it.
location_with_extras = {}

# Minimum dot product between the CLIP embeddings of a location and a phrase.
CLIP_SIMILARITY_THRESHOLD = 0.8


def _trim_stop_words(words):
    """Strip stop words from both ends of `words`; the result may be empty."""
    start, end = 0, len(words)
    while end > start and words[end - 1] in stop_words:
        end -= 1
    while start < end and words[start] in stop_words:
        start += 1
    return words[start:end]


# Sorted so the key order written to locations.json is the same on every run.
locations = sorted(locations)
for loc in locations:
    if not loc:
        continue
    n_grams = []
    tokens = tokenizer.tokenize(loc.split())
    # Candidates are order-preserving sub-sequences of the tokens, from length 2
    # up to one less than the full token count.
    for lengram in range(2, len(tokens)):
        for ngram in combinations(tokens, lengram):
            # Keep alphabetic tokens only, then trim stop words at both ends.
            ngram = _trim_stop_words([word for word in ngram if word.isalpha()])
            if len(ngram) <= 1:
                continue
            word = " ".join(ngram)
            if word not in n_grams:
                n_grams.append(word)
    # Reverse so the candidates generated last are evaluated first.
    n_grams = n_grams[::-1]
    new_n_grams = []
    loc_features = encode_query(loc)
    for word in n_grams:
        word_features = encode_query(word)
        # Accept a phrase only if CLIP considers it close to the full name AND
        # the full name is among the phrase's top TF-IDF matches.
        if loc_features @ word_features.T > CLIP_SIMILARITY_THRESHOLD:
            if loc in get_similar_locations_tfidf(word, top=5):
                new_n_grams.append(word)
    location_with_extras[loc] = new_n_grams

# Hand-written overrides.
location_with_extras['porridge in front of phrommet shrine'] = ['phrommet shrine', 'phrommet']
location_with_extras['on the matthews coach in transit'] = ['matthews coach']
location_with_extras['the inn on the mile'] = ['inn on the mile']
# `with` closes (and flushes) the output file deterministically.
with open('files/backend/locations.json', 'w') as f:
    json.dump(location_with_extras, f)
print(len(locations))

719


## Others

In [38]:
# Unique region names (lower-cased, trimmed) over all images.
regions = {name.lower().strip() for record in info_dict.values()
           for name in record["region"]}
# Sorted output keeps the file stable between runs; `with` closes the handle.
with open('files/backend/regions.json', 'w') as f:
    json.dump(sorted(regions), f)

In [39]:
# Emit the regions as a JavaScript module. The names are sorted so the
# generated source file does not change order from one run to the next.
with open("../UI/src/regions.js", 'w') as f:
    f.write("var regions=" + json.dumps(sorted(regions)) + ";\n\nexport default regions;")

In [46]:
# Unique location_info entries in first-seen order.
# Each comma-separated part is trimmed BEFORE de-duplication and filtering, so
# "cafe" and " cafe" collapse into one entry and whitespace-only parts are dropped.
unique_infos = {}
for record in info_dict.values():
    for part in record['location_info'].lower().split(','):
        part = part.strip()
        if part:
            unique_infos[part] = None
location_infos = list(unique_infos)
with open('files/backend/location_info.json', 'w') as f:
    json.dump(location_infos, f)

In [2]:
import json
# Reload the saved dictionary; the file handle is closed right after parsing.
with open("files/info_dict.json") as f:
    info_dict = json.load(f)

In [9]:
def filter_dict(image):
    """Return the subset of info_dict[image] that goes into basic_dict."""
    wanted_keys = ("group", "scene", "time", "gps",
                   "location", "location_info", "country", "ocr")
    record = info_dict[image]
    return {key: record[key] for key in wanted_keys}

# Reduced per-image dictionary, written with a context manager so the file is
# closed and flushed deterministically.
basic_dict = {image: filter_dict(image) for image in info_dict}
with open('files/backend/basic_dict.json', 'w') as f:
    json.dump(basic_dict, f)

In [44]:
time_info = {}
def get_hour_minute(date_string):
    """Turn a "%Y/%m/%d %H:%M:00%z" string into 12-hour "%I:%M%p" text."""
    parsed = datetime.strptime(date_string, "%Y/%m/%d %H:%M:00%z")
    return datetime.strftime(parsed, "%I:%M%p")

def get_final_time(first_info, last_info):
    """Return a single value when both ends are equal, else "first - last"."""
    return first_info if first_info == last_info else f"{first_info} - {last_info}"

# Human-readable time range for every scene and every group.
for group_name in groups:
    group_first_info = None
    group_last_info = None
    for scene_name, images in groups[group_name]["scenes"]:
        # Same filter as when scene_info is built: images without an info_dict
        # entry would otherwise raise KeyError, and an empty scene IndexError.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        first_info = info_dict[images[0]]["time"]
        last_info = info_dict[images[-1]]["time"]
        if group_first_info is None:
            group_first_info = first_info
        group_last_info = last_info
        time_info[scene_name] = get_final_time(get_hour_minute(first_info), get_hour_minute(last_info))
    # Groups with no usable scene have no time range to report.
    if group_first_info is None:
        continue
    time_info[group_name] = get_final_time(get_hour_minute(group_first_info), get_hour_minute(group_last_info))

with open("files/backend/time_info.json", "w") as f:
    json.dump(time_info, f)