In [1]:
# Import libraries
from sklearn.cluster import DBSCAN
import numpy as np
import bamboolib as bam
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from dateutil import tz
import json
import geopy.distance
from nltk import ngrams
import joblib
import os

# CREATE DICT

In [2]:
# OCR text-area scores keyed by image. The loader below is commented out, so this
# stays empty for the rest of the notebook run.
ocr_scores = {}
# for year in os.listdir("/home/nmduy/LSC2022/LSC_Metada/OCR/text_area/"):
#     if ".json" in year:
#         ocr_scores.update(json.load(open(f"/home/nmduy/LSC2022/LSC_Metada/OCR/text_area/{year}")))

In [3]:
# Image IDs flagged as unhelpful; they are skipped when the per-image records are built.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("files/unhelpful_images.json") as unhelpful_file:
    unhelpful_images = json.load(unhelpful_file)
# Per-image metadata table that drives the rest of the notebook.
metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')

In [4]:
# Replace missing check-in and city values with empty strings so later string
# operations on these columns do not hit NaN.
for column in ('checkin', 'city'):
    metadata[column] = metadata[column].fillna("")

In [5]:
# Administrative filler phrases stripped from city strings by remove_extra.
words_to_remove = [
    "Province",
    "Área metropolitana de Madrid y Corredor del Henares",
    "Community of",
    "The Municipal District of",
    "Kreis",
    "Landkreis",
    "Regional Unit",
    "Municipal Unit",
    "Municipality",
    "Administrative District",
    "Region of",
    "Provence-Alpes-Côte d'Azur",
    "Municipal Borough District",
    "Subdistrict Administrative Organization",
    "Subdistrict",
    "District",
    "Distretto di",
    "Municipal District",
    "City",
    "Land ",
    "Urban agglomeration",
]
# Longest phrases first, so a long phrase is removed before any shorter phrase it
# contains (e.g. "Municipal District" before "District"). The sort is stable, so
# phrases of equal length keep their listed order.
words_to_remove.sort(key=len, reverse=True)

In [6]:
def remove_extra(city):
    """Strip administrative filler from a comma-separated place string.

    Each comma-separated part has every phrase in ``words_to_remove`` deleted,
    then any remaining ``"of "`` removed, and is whitespace-trimmed. Empty parts
    and repeats of an earlier part are dropped; the survivors are re-joined
    with ``", "`` in their original order.
    """
    cleaned_parts = []
    for part in city.split(","):
        for filler in words_to_remove:
            part = part.replace(filler, "")
        part = part.replace("of ", "").strip()
        if part and part not in cleaned_parts:
            cleaned_parts.append(part)
    return ", ".join(cleaned_parts)
        
# Rewrite every "city" value with its cleaned form from remove_extra.
metadata["city"] = metadata["city"].apply(remove_extra)

## New timezone processing

In [7]:
import geojson
# Country features; each one is read below through feature["properties"]["ADMIN"].
# The context manager closes the file handle once parsing is done.
with open("../original_data/countries.geojson") as countries_file:
    country_geojson = geojson.load(countries_file)

In [8]:
# Distinct values of the "country" column exactly as stored at this point
# (the column is NaN-filled and lower-cased only in later cells).
# NOTE(review): the next cell tests `name.lower() in all_countries`, which only
# matches if these values are already lower-case — confirm against the CSV.
all_countries = set(metadata["country"].tolist())

In [9]:
# Keep a country feature when its lower-cased ADMIN name appears in the metadata
# countries, or when it is one of the two names that are aliased just below.
# NOTE(review): the match is case-sensitive on the metadata side — it assumes
# all_countries holds lower-case names; confirm.
always_kept = ["United Kingdom", "South Korea"]
geojson_data = {
    feature["properties"]["ADMIN"]: feature
    for feature in country_geojson["features"]
    if feature["properties"]["ADMIN"].lower() in all_countries
    or feature["properties"]["ADMIN"] in always_kept
}
# Expose the same features under two extra names.
geojson_data["Korea"] = geojson_data["South Korea"]
geojson_data["England"] = geojson_data["United Kingdom"]

# Write the selection as a JavaScript module for the UI.
# NOTE(review): absolute, user-specific output path.
with open(f"/home/tlduyen/LSC22/lsc2020-UI/src/worldmap.js", 'w') as worldmap_file:
    worldmap_file.write(f"var worldmap={json.dumps(geojson_data)};\n\nexport {{worldmap}};")

In [10]:
# Save the selected country features for the backend; the context manager
# guarantees the file is flushed and closed.
with open("files/backend/countries.json", "w") as countries_out:
    json.dump(geojson_data, countries_out)

In [11]:
# from nltk.corpus import stopwords
# from tqdm.auto import tqdm
# stop_words = set(stopwords.words('english'))
# def process_for_ocr(word):
#     final_text = defaultdict(float)
#     final_text[word] = 1
#     for i in range(0, len(word)-1):
#         if len(word[:i+1]) > 1:
#             final_text[word[:i+1]] += (i+1) / len(word)
#         if len(word[i+1:]) > 1:
#             final_text[word[i+1:]] += 1 - (i+1)/len(word)
#     return final_text

# def create(ocr_scores):
#     tf = {}
#     idf = defaultdict(lambda: 0)
#     for image, scores in tqdm(ocr_scores.items()):
#         tf[image] = defaultdict(float)
#         word_set = set()
#         for score in scores:
#             word = score['text'].lower()
#             for subword in word.lower().split():
#                 if len(subword) > 1:
#                     splited_words = process_for_ocr(subword)
#                     for w, s in splited_words.items():
#                         if w not in stop_words:
#                             word_set.add(w)
#                             tf[image][w] += np.log(1 + score['area'] * 5000 * s)
#         for word in word_set:
#             idf[word] += 1

#     tf_idf = {}
#     print(len(tf))
#     for image in tqdm(tf):
#         tf_idf[image] = {}
#         for word in tf[image]:
#             if idf[word]:
#                 tf_idf[image][word] = tf[image][word] * np.log(len(tf) / idf[word])
#                 assert (tf_idf[image][word] >= 0), f"negative value {tf_idf[image][word]}, {tf[image][word]}, {word}, {idf[word]}, {np.log(len(tf) / idf[word])}"
#             else:
#                 tf_idf[image][word] = 0
#     return idf, tf_idf

# idf, tf_idf = create(ocr_scores)

In [12]:
# idf = dict(idf.items())
# joblib.dump((tf_idf, idf), "files/ocr_tfidf.joblib")
# tf_idf, idf = joblib.load("files/ocr_tfidf.joblib")
# Both the TF-IDF computation and the cache load are commented out, so the tables
# are empty and every image record later gets an empty "ocr_score".
tf_idf, idf = {}, {}

In [13]:
# Rows with a missing timezone inherit the value of the closest preceding row.
metadata["new_timezone"] = metadata["new_timezone"].ffill()
# Empty strings instead of NaN so .lower() / .split() work when records are built.
metadata["country"] = metadata["country"].fillna("")
metadata["OCR"] = metadata["OCR"].fillna("")
# location_info is the "categories" value when "stop" is truthy, otherwise the
# "checkin" value.
# NOTE(review): a NaN in "stop" is truthy in Python and would select "categories" — confirm
# the column has no missing values.
metadata["location_info"] = metadata.apply(lambda row: row["categories"] if row["stop"] else row["checkin"], axis=1)
metadata["location_info"] = metadata["location_info"].fillna("")

In [14]:
# `tqdm_notebook` is a deprecated alias; `tqdm.auto` selects the notebook widget
# when one is available and falls back to the console bar otherwise.
from tqdm.auto import tqdm
# Per-image records, keyed by the relative image path produced by to_full_key.
info_dict = {}
def to_full_key(image):
    """Return the relative path ``<first 6 chars>/<chars 7-8>/<image>`` for an image name.

    For names that start with a YYYYMMDD date this yields ``YYYYMM/DD/<image>``.
    """
    month_dir = image[:6]
    day_dir = image[6:8]
    return "/".join((month_dir, day_dir, image))

def to_local_time(utc_time, time_zone):
    """Convert an aware datetime to the zone that ``tz.gettz(time_zone)`` resolves to."""
    target_zone = tz.gettz(time_zone)
    return utc_time.astimezone(target_zone)

# Build one record per usable image, keyed by its relative path.
# A set makes the per-row "unhelpful" check constant-time; testing membership on
# the raw JSON container would be a linear scan per row if it is a list.
unhelpful_lookup = set(unhelpful_images)

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image = row['ImageID']
    # Skip rows without an image ID (non-string, e.g. NaN) and images flagged as unhelpful.
    if not isinstance(image, str) or image in unhelpful_lookup:
        continue

    image = to_full_key(image)
    # minute_id carries no seconds; "00" is appended so it parses with %S, then it is tagged as UTC.
    utc_time = datetime.strptime(row["minute_id"]+"00", "%Y%m%d_%H%M%S").replace(tzinfo=tz.gettz('UTC'))
    local_time = to_local_time(utc_time, row["new_timezone"])
    info_dict[image] = {
        "image_path": image,
        "minute_id": row["minute_id"],
        "time": local_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "utc_time": utc_time.strftime("%Y/%m/%d %H:%M:00%z"),
        "weekday": local_time.strftime("%A").lower(),
        # NOTE(review): the fallback is an empty string while the normal value is a list;
        # kept as-is because downstream consumers may depend on it.
        "descriptions": row['Tags'].lower().split(',') if isinstance(row['Tags'], str) else "",
        "address": row["city"],
        # The check-in name is only used when the row is a stop; otherwise the literal "None".
        "location": row["checkin"] if row["stop"] else "None",
        "location_info": row["location_info"],
        "gps": {"lat": row["new_lat"],
                "lon": row["new_lng"]},
        "region": row["city"].lower().split(', '),
        "country": row["country"].lower(),
        "ocr": row["OCR"].split(', '),
        "timestamp": utc_time.timestamp() #!TODO in es.py
    }

    # Keep only strictly positive TF-IDF weights; images without scores get an empty dict.
    info_dict[image]["ocr_score"] = {
        word: score for word, score in tf_idf.get(image, {}).items() if score > 0
    }

  0%|          | 0/723329 [00:00<?, ?it/s]

In [15]:
import os
from unidecode import unidecode

# Transliterate the text fields of every record to ASCII with unidecode.
fields_to_fix = ["address", "location", "region"]
for image in info_dict:
    record = info_dict[image]
    for field in fields_to_fix:
        value = record[field]
        if isinstance(value, str):
            record[field] = unidecode(value)
        elif isinstance(value, list):
            # List-valued fields (e.g. "region") are converted element by element.
            record[field] = [unidecode(s) for s in value]
        elif np.isnan(value):
            # Missing values become the "NONE" placeholder.
            record[field] = "NONE"
        else:
            # Anything else is unexpected; print it for manual inspection.
            print(field, value)

In [16]:
# import json 
# info_dict = json.load(open(f"files/info_dict.json"))

In [17]:
# Spot-check the record built for a single image.
info_dict["201901/06/20190106_171105_000.jpg"]

{'image_path': '201901/06/20190106_171105_000.jpg',
 'minute_id': '20190106_1711',
 'time': '2019/01/06 19:11:00+0200',
 'utc_time': '2019/01/06 17:11:00+0000',
 'weekday': 'sunday',
 'descriptions': ['table', 'food', 'plate', 'dish', 'meal'],
 'address': 'Thessaloniki, Macedonia and Thrace, Greece',
 'location': 'Eat Skaste',
 'location_info': 'Souvlaki Shop',
 'gps': {'lat': 40.6361591, 'lon': 22.9366191},
 'region': ['thessaloniki', 'macedonia and thrace', 'greece'],
 'country': 'greece',
 'ocr': [''],
 'timestamp': 1546794660.0,
 'ocr_score': {}}

In [18]:
# Group -> scenes -> images segmentation; the context manager closes the file
# handle once the JSON is parsed.
with open('files/group_segments.json') as groups_file:
    groups = json.load(groups_file)
scene_info = {}

assigned = []
count = 0
for group_name in groups:
    # Numeric suffix of the group name; not used further in this cell.
    group_id = int(group_name.split('_')[-1])
    for scene_name, images in groups[group_name]["scenes"]:
        # Only images that have a record in info_dict can be part of a scene.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        # Time bounds come from the first and last remaining image of the scene.
        scene_info[scene_name] = {
            "group": group_name,
            "images": images,
            "start_time": info_dict[images[0]]["time"],
            "end_time": info_dict[images[-1]]["time"],
            "start_timestamp": info_dict[images[0]]["timestamp"],
            "end_timestamp": info_dict[images[-1]]["timestamp"]
        }
        # Scene-level place and weekday are copied from the first image.
        for key in ["location", "location_info", "region", "country", "weekday"]:
            scene_info[scene_name][key] = info_dict[images[0]][key]

        # for key in ["gps"]:
            # scene_info[scene_name][key] = [info_dict[image][key] for image in images]

        # Back-reference each image to its scene and group.
        for image in images:
            info_dict[image]["scene"] = scene_name
            info_dict[image]["group"] = group_name
            count += 1
            assigned.append(image)

print(len(set(assigned)), len(info_dict))
# Images that have a record but belong to no scene are dropped from info_dict, so
# every remaining record carries "scene" and "group".
# NOTE(review): the original author suspected a problem here — confirm that
# silently discarding the unassigned images is intended.
if len(set(assigned)) < len(info_dict):
    to_remove = set(info_dict.keys()).difference(assigned)
    for img in to_remove:
        del info_dict[img]

713861 714583


In [19]:
# Spot-check a single scene entry.
scene_info["S_218"]

{'group': 'G_16',
 'images': ['201901/01/20190101_164638_000.jpg',
  '201901/01/20190101_164710_000.jpg',
  '201901/01/20190101_164742_000.jpg',
  '201901/01/20190101_164814_000.jpg',
  '201901/01/20190101_164846_000.jpg',
  '201901/01/20190101_164918_000.jpg',
  '201901/01/20190101_164950_000.jpg',
  '201901/01/20190101_165022_000.jpg',
  '201901/01/20190101_165054_000.jpg',
  '201901/01/20190101_165126_000.jpg',
  '201901/01/20190101_165158_000.jpg'],
 'start_time': '2019/01/01 16:46:00+0000',
 'end_time': '2019/01/01 16:51:00+0000',
 'start_timestamp': 1546361160.0,
 'end_timestamp': 1546361460.0,
 'location': "Eddie Rocket's",
 'location_info': 'Burger Joint, Diner, Fast Food Restaurant',
 'region': ['dublin', 'ireland', 'leinster'],
 'country': 'ireland',
 'weekday': 'tuesday'}

In [20]:
import json

# Persist the per-image and per-scene dictionaries; context managers make sure
# both files are flushed and closed.
with open("files/info_dict.json", "w") as info_file:
    json.dump(info_dict, info_file)
with open("files/scene_dict.json", "w") as scene_file:
    json.dump(scene_info, scene_file)

# PREPARE BACKEND

In [22]:
# Distinct lower-cased, trimmed location names; the "none" placeholder is excluded.
locations = set([img["location"].lower().strip() for img in info_dict.values()])
locations.discard("none")
# Not used in this cell; kept in case other cells rely on the name.
extra = set()
location_with_extras = {}
for loc in locations:
    if not loc:
        continue
    words = loc.split()
    candidates = []
    # Word n-grams of size 2 up to the full word count. The size is bounded by the
    # number of words, not the number of characters as before, which only produced
    # empty iterations beyond the word count.
    for lengram in range(2, len(words) + 1):
        for ngram in ngrams(words, lengram):
            candidates.append(" ".join(ngram))
    candidates.append(loc)
    # Full name first, then longer n-grams before shorter ones. dict.fromkeys drops
    # repeats while keeping order: previously the full-length n-gram duplicated the
    # name itself for every multi-word location.
    location_with_extras[loc] = list(dict.fromkeys(reversed(candidates)))
with open('files/backend/locations.json', 'w') as locations_file:
    json.dump(location_with_extras, locations_file)
print(len(locations))

718


In [23]:
# Sanity check: is "home" among the collected location names?
"home" in locations

True

In [24]:
# Distinct lower-cased, trimmed region names across all image records.
regions = set([loc.lower().strip() for img in info_dict.values()
               for loc in img["region"]])
# The context manager guarantees the file is flushed and closed.
with open('files/backend/regions.json', 'w') as regions_file:
    json.dump(list(regions), regions_file)

In [25]:
# Export the region names as a JavaScript module for the UI.
# NOTE(review): absolute, user-specific output path.
with open(f"/home/tlduyen/LSC22/lsc2020-UI/src/regions.js", 'w') as regions_js_file:
    regions_js_file.write(f"var regions={json.dumps(list(regions))};\n\nexport default regions;")

In [26]:
# Frequency of every description tag across all image records.
all_keywords_counter = Counter([w for img in info_dict.values() for w in img["descriptions"]])

# The context manager guarantees the file is flushed and closed.
with open('files/backend/all_keywords.json', 'w') as keywords_file:
    json.dump(all_keywords_counter, keywords_file)
# all_keywords_counter

In [28]:
def filter_dict(image):
    """Return the reduced record for ``image``, read from the global ``info_dict``.

    Only the keys group, scene, time, gps, location and location_info are kept;
    a missing key raises ``KeyError``.
    """
    record = info_dict[image]
    wanted_keys = ("group", "scene", "time", "gps", "location", "location_info")
    return {key: record[key] for key in wanted_keys}

# Reduced per-image records for the backend.
basic_dict = {image: filter_dict(image) for image in info_dict}
# The context manager guarantees the file is flushed and closed.
with open('files/backend/basic_dict.json', 'w') as basic_file:
    json.dump(basic_dict, basic_file)

In [29]:
# Display time range for every scene and group, filled in by the loop further down.
time_info = {}
def get_hour_minute(date_string):
    """Reformat a ``%Y/%m/%d %H:%M:00%z`` timestamp as a 12-hour clock time such as ``07:11PM``."""
    return datetime.strptime(date_string, "%Y/%m/%d %H:%M:00%z").strftime("%I:%M%p")

def get_final_time(first_info, last_info):
    """Return ``first_info`` alone when both values are equal, else ``"<first> - <last>"``."""
    return first_info if first_info == last_info else f"{first_info} - {last_info}"

# Fill time_info with a display time range for every scene and every group.
for group_name in groups:
    group_first_info = None
    group_last_info = None
    for scene_name, images in groups[group_name]["scenes"]:
        # Only images that still have a record in info_dict can supply a time;
        # indexing with the unfiltered list raises KeyError for any other image.
        images = [image for image in images if image in info_dict]
        if not images:
            continue
        first_info = info_dict[images[0]]["time"]
        last_info = info_dict[images[-1]]["time"]
        if group_first_info is None:
            group_first_info = first_info
        group_last_info = last_info
        time_info[scene_name] = get_final_time(get_hour_minute(first_info), get_hour_minute(last_info))
    # A group with no usable scene has no time range to report.
    if group_first_info is None:
        continue
    time_info[group_name] = get_final_time(get_hour_minute(group_first_info), get_hour_minute(group_last_info))

# The context manager guarantees the file is flushed and closed.
with open("files/backend/time_info.json", "w") as time_info_file:
    json.dump(time_info, time_info_file)

In [30]:
# Spot-check the formatted time range of a single scene.
time_info["S_36"]

'11:05AM - 11:11AM'