In [1]:
# Import libraries
from sklearn.cluster import DBSCAN
import numpy as np
import bamboolib as bam
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from dateutil import tz
import json
import geopy.distance
from nltk import ngrams
import joblib
import os

In [2]:
def process_for_ocr(word):
    """Weight a word and its prefix/suffix fragments.

    The whole word gets weight 1. For every split point inside the word, the
    prefix and the suffix are each credited with their share of the word's
    length; single-character fragments are ignored. A fragment produced by
    more than one split accumulates the sum of its credits.

    Returns a ``defaultdict(float)`` mapping fragment -> weight.
    """
    length = len(word)
    weights = defaultdict(float)
    weights[word] = 1
    for cut in range(1, length):
        head, tail = word[:cut], word[cut:]
        if len(head) > 1:
            weights[head] += cut / length
        if len(tail) > 1:
            weights[tail] += 1 - cut / length
    return weights

# CREATE DICT

In [3]:
# OCR scores are not loaded in this version of the notebook: `ocr_scores`
# stays an empty dict. The commented-out lines are the previous loaders
# (a single-month file, then every JSON file in the text_area directory).
# ocr_scores = json.load(open("../original_data/OCR_201901.json"))
ocr_scores = {}
# for year in os.listdir("/home/nmduy/LSC2022/LSC_Metada/OCR/text_area/"):
#     if ".json" in year:
#         ocr_scores.update(json.load(open(f"/home/nmduy/LSC2022/LSC_Metada/OCR/text_area/{year}")))

In [4]:
# Image IDs that the info_dict loop skips. The context manager closes the
# file handle as soon as the JSON is parsed instead of leaving it open.
with open("files/unhelpful_images.json") as unhelpful_file:
    unhelpful_images = json.load(unhelpful_file)
# metadata = pd.read_csv('files/processed.csv', sep=',', decimal='.')
# Per-row metadata table that every later cell reads from.
metadata = pd.read_csv('VAISL/files/final_metadata.csv', sep=',', decimal='.')

## Old timezone processing

In [5]:
import numpy as np
# Store the time-zone names with pandas' nullable string dtype, then display
# how many rows carry each zone (most frequent first).
metadata['time_zone'] = metadata['time_zone'].astype(pd.StringDtype())
metadata['time_zone'].value_counts()

Europe/Dublin          150918
Asia/Bangkok             7504
Europe/Athens            2042
Asia/Ho_Chi_Minh         1763
Europe/Madrid            1494
Europe/Istanbul          1156
Europe/London            1017
Asia/Seoul                938
Europe/Paris              588
Europe/Berlin             502
Europe/Zurich             399
Australia/Melbourne       383
Europe/Copenhagen         220
Europe/Bucharest          131
Europe/Oslo               127
America/Toronto            83
Etc/GMT-6                  67
Etc/GMT                    62
Europe/Sofia               55
Etc/GMT-8                  36
Europe/Belgrade            10
Europe/Amsterdam            9
Asia/Phnom_Penh             7
Europe/Zagreb               6
Asia/Kolkata                5
Europe/Budapest             5
Europe/Brussels             3
Europe/Ljubljana            2
Europe/Vienna               2
Etc/GMT-2                   2
Name: time_zone, dtype: Int64

In [6]:
# Keep only the time zones that label at least `threshold` rows.
#
# value_counts() sorts by count in descending order, so the zones that pass
# the threshold are exactly the leading entries. Selecting them with a boolean
# mask replaces the old `timezones[i]` loop, which relied on positional
# integer lookup on a label-indexed Series (deprecated in pandas 2.x).
threshold = 100
timezones = metadata['time_zone'].value_counts()
is_frequent = timezones >= threshold
# Number of retained zones; same value the previous index-scanning loop produced.
cutoff = int(is_frequent.sum())
okay_timezones = timezones[is_frequent].index.tolist()

In [7]:
# Blank out every time zone that is not in okay_timezones, then fill each gap
# from its neighbours: the previous row's zone first, and the next row's zone
# as a fallback for gaps at the very start of the table.
#
# `.where(mask)` keeps the values where the mask is True and sets the rest to
# missing, which replaces the former pair of `.loc` assignments (the first of
# which assigned the column to itself). `.ffill()` / `.bfill()` replace
# `fillna(method=...)`, which is deprecated in recent pandas.
tmp_condition = metadata['time_zone'].isin(okay_timezones)
metadata['time_zone'] = metadata['time_zone'].where(tmp_condition).ffill().bfill()
metadata['ImageID'] = metadata['ImageID'].astype('string')
# metadata

In [8]:
# (time-zone name, lowercase place label) pairs, in the order they are stored.
_TIME_ZONE_LABELS = (
    ("Europe/Dublin", "ireland"),
    ("Europe/Athens", "greece"),
    ("Europe/Berlin", "germany"),
    ("Asia/Bangkok", "thailand"),
    ("Asia/Ho_Chi_Minh", "vietnam"),
    ("Europe/Madrid", "spain"),
    ("Europe/Istanbul", "turkey"),
    ("Europe/London", "england"),
    ("Asia/Seoul", "korea"),
    ("Europe/Paris", "france"),
    ("Europe/Zurich", "switzerland"),
    ("Australia/Melbourne", "australia"),
    ("Europe/Copenhagen", "denmark"),
    ("Europe/Bucharest", "romania"),
    ("Europe/Oslo", "norway"),
)
time_zones = dict(_TIME_ZONE_LABELS)
# Live view over the labels above (not a copy).
all_countries = time_zones.values()

## New timezone processing

In [5]:
import geojson
# Load the countries GeoJSON (its "features" list is filtered further down).
# The context manager closes the file handle once parsing is done.
with open("../original_data/countries.geojson") as countries_file:
    country_geojson = geojson.load(countries_file)

In [6]:
# Lower-cased country names present in the metadata.
# The GeoJSON filter looks names up with `name.lower() in all_countries`, so
# the set must hold lower-case strings for that test to match; non-string
# entries (missing values) are dropped because they can never equal a name.
all_countries = {
    country.lower()
    for country in metadata["country"].unique()
    if isinstance(country, str)
}

In [7]:
# Select the GeoJSON features to ship to the UI, keyed by their ADMIN name:
# every country found in all_countries, plus the two names aliased below.
always_kept = ("United Kingdom", "South Korea")
geojson_data = {
    feature["properties"]["ADMIN"]: feature
    for feature in country_geojson["features"]
    if feature["properties"]["ADMIN"].lower() in all_countries
    or feature["properties"]["ADMIN"] in always_kept
}
# Extra keys pointing at the same feature objects.
geojson_data["Korea"] = geojson_data["South Korea"]
geojson_data["England"] = geojson_data["United Kingdom"]

# Emit the selection as a JS module exporting `worldmap`.
with open("/home/tlduyen/LSC2020/LSC2020/ui/src/worldmap.js", 'w') as f:
    f.write(f"var worldmap={json.dumps(geojson_data)};\n\nexport {{worldmap}};")

In [8]:
# Persist the selected country features for the backend. The context manager
# flushes and closes the file when the dump finishes.
with open("files/backend/countries.json", "w") as countries_out:
    json.dump(geojson_data, countries_out)

In [72]:
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))

# def create(ocr_scores):
#     tf = {}
#     idf = defaultdict(lambda: 0)
#     for image, scores in tqdm(ocr_scores.items()):
#         tf[image] = defaultdict(float)
#         word_set = set()
#         for score in scores:
#             word = score['text'].lower()
#             for subword in word.lower().split():
#                 if len(subword) > 1:
#                     splited_words = process_for_ocr(subword)
#                     for w, s in splited_words.items():
#                         if w not in stop_words:
#                             word_set.add(w)
#                             tf[image][w] += np.log(1 + score['area'] * 5000 * s)
#         for word in word_set:
#             idf[word] += 1

#     tf_idf = {}
#     print(len(tf))
#     for image in tqdm(tf):
#         tf_idf[image] = {}
#         for word in tf[image]:
#             if idf[word]:
#                 tf_idf[image][word] = tf[image][word] * np.log(len(tf) / idf[word])
#                 assert (tf_idf[image][word] >= 0), f"negative value {tf_idf[image][word]}, {tf[image][word]}, {word}, {idf[word]}, {np.log(len(tf) / idf[word])}"
#             else:
#                 tf_idf[image][word] = 0
#     return idf, tf_idf

# idf, tf_idf = create(ocr_scores)

In [9]:
# The OCR tf-idf tables are neither computed nor loaded in this run, so both
# start out empty. Earlier persistence calls, kept for reference:
# idf = dict(idf.items())
# joblib.dump((tf_idf, idf), "files/ocr_tfidf.joblib")
# tf_idf, idf = joblib.load("files/ocr_tfidf.joblib")
tf_idf = {}
idf = {}

In [10]:
# Forward-fill gaps in `new_timezone`: a row with a missing value takes the
# most recent non-missing value above it.
# NOTE(review): there is no back-fill here (the old time-zone cell used both),
# so rows before the first known zone stay missing and are passed as-is to
# tz.gettz() in the info_dict loop — confirm that cannot happen in this data.
metadata["new_timezone"] = metadata["new_timezone"].ffill()

In [11]:
from tqdm import tqdm_notebook as tqdm
# Full image key -> metadata record; populated by the loop over `metadata`.
info_dict = {}

def to_local_time(utc_time, time_zone):
    """Convert ``utc_time`` to the time zone named by ``time_zone``.

    The zone object is looked up with ``tz.gettz`` and the datetime is
    re-expressed in it via ``astimezone``.
    """
    local_zone = tz.gettz(time_zone)
    return utc_time.astimezone(local_zone)

def to_full_key(image):
    """Prefix an image name with two path segments cut from its own start.

    The first six characters and the following two become the leading
    segments, e.g. ``20190208_x.jpg`` -> ``201902/08/20190208_x.jpg``.
    """
    month_part = image[:6]
    day_part = image[6:8]
    return "/".join((month_part, day_part, image))

# Build one info_dict record per usable metadata row.
#
# Two loop invariants are hoisted out of the per-row work: the UTC tzinfo
# object, and a set of the excluded images so that the membership test is a
# hash lookup instead of a scan of whatever container the JSON file produced.
utc_zone = tz.gettz('UTC')
skipped_images = set(unhelpful_images)

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image = row['ImageID']
    # Skip rows whose ImageID is not a string (e.g. missing) and images that
    # are listed in unhelpful_images.
    if not isinstance(image, str) or image in skipped_images:
        continue

    # minute_id carries no seconds field, so "00" is appended before parsing.
    utc_time = datetime.strptime(row["minute_id"]+"00", "%Y%m%d_%H%M%S").replace(tzinfo=utc_zone)
    local_time = to_local_time(utc_time, row["new_timezone"])
    #TODO!
    image = to_full_key(image)
    info_dict[image] = {
        "image_path": image,
        "minute_id": row["minute_id"],
        "time": datetime.strftime(local_time, "%Y/%m/%d %H:%M:00%z"),
        "utc_time": datetime.strftime(utc_time, "%Y/%m/%d %H:%M:00%z"),
        "weekday": datetime.strftime(local_time, "%A").lower(),
        # Non-string cells (missing values) fall back to the placeholders below.
        "descriptions": row['Tags'].lower().split(',') if isinstance(row['Tags'], str) else "",
        "address": row["city"],
        "location": row["new_name"],
        "gps": {"lat": row["new_lat"],
                "lon": row["new_long"]},
        "region": row["city"].lower().split(', ') if isinstance(row["city"], str) else [],
        "country": row["country"].lower() if isinstance(row["country"], str) else None,
        "ocr": row["OCR"].split(', ') if isinstance(row['OCR'], str) else "",
        "timestamp": utc_time.timestamp(), #!TODO in es.py
        # Only strictly positive tf-idf scores are kept; an image with no
        # tf_idf entry gets an empty dict.
        "ocr_score": {word: score
                      for word, score in tf_idf.get(image, {}).items()
                      if score > 0},
    }

  0%|          | 0/723329 [00:00<?, ?it/s]

In [26]:
# Spot-check one record.
# NOTE(review): the saved output lists 'scene', 'group', 'before' and 'after'
# keys that the info_dict loop in this notebook does not create — presumably
# added by a step that is not part of the cells shown here; confirm.
info_dict["201902/08/20190208_172845_000.jpg"]

{'image_path': '201902/08/20190208_172845_000.jpg',
 'minute_id': '20190208_1728',
 'time': '2019/02/08 20:28:00+0300',
 'utc_time': '2019/02/08 17:28:00+0000',
 'weekday': 'friday',
 'descriptions': ['text', 'person', 'indoor', 'store'],
 'address': 'Turkey, Marmara',
 'location': 'Istanbul Ataturk Havalimani',
 'gps': {'lat': 40.984292, 'lon': 28.8156077},
 'region': ['turkey', 'marmara'],
 'country': 'turkey',
 'ocr': ['gặp,lại,ain,TRÀ,CHANH,CROS,KHUY,Chupa,Chúps,TMINT,DOUDLEMIN,ININTIN'],
 'timestamp': 1549646880.0,
 'ocr_score': {},
 'scene': 'S_16002',
 'group': 'G_705',
 'before': ['201902/08/20190208_172136_000.jpg',
  '201902/08/20190208_172208_000.jpg',
  '201902/08/20190208_172240_000.jpg',
  '201902/08/20190208_172312_000.jpg',
  '201902/08/20190208_172344_000.jpg',
  '201902/08/20190208_172416_000.jpg',
  '201902/08/20190208_172501_000.jpg'],
 'after': ['201902/08/20190208_172949_000.jpg',
  '201902/08/20190208_173021_000.jpg',
  '201902/08/20190208_173053_000.jpg',
  '201

In [13]:
# Keep the rows whose ImageID contains the literal substring '20190208_174'
# (regex=False, case-insensitive; missing IDs count as no match), then display them.
# NOTE(review): `both` is not defined in any cell shown in this notebook, and
# the saved output ("714583 714583") does not look like the display of this
# frame — presumably left over from an earlier session; confirm or remove.
both = both.loc[both['ImageID'].str.contains('20190208_174', case=False, regex=False, na=False)]
both

714583 714583


In [14]:
# Spot-check a second record (per the saved output, one whose 'ocr' field is
# the empty-string placeholder).
info_dict["201901/01/20190101_164846_000.jpg"]

{'image_path': '201901/01/20190101_164846_000.jpg',
 'minute_id': '20190101_1648',
 'time': '2019/01/01 16:48:00+0000',
 'utc_time': '2019/01/01 16:48:00+0000',
 'weekday': 'tuesday',
 'descriptions': ['person',
  'food',
  'table',
  'plate',
  'indoor',
  'eating',
  'dessert',
  'meal'],
 'address': 'Dublin, Ireland, Leinster',
 'location': "Eddie Rocket's",
 'gps': {'lat': 53.2828644, 'lon': -6.4222863},
 'region': ['dublin', 'ireland', 'leinster'],
 'country': 'ireland',
 'ocr': '',
 'timestamp': 1546361280.0,
 'ocr_score': {},
 'scene': 'S_253',
 'group': 'G_17',
 'before': ['201901/01/20190101_154407_000.jpg',
  '201901/01/20190101_154439_000.jpg',
  '201901/01/20190101_154511_000.jpg',
  '201901/01/20190101_154543_000.jpg',
  '201901/01/20190101_154615_000.jpg',
  '201901/01/20190101_154647_000.jpg',
  '201901/01/20190101_154719_000.jpg',
  '201901/01/20190101_154751_000.jpg',
  '201901/01/20190101_154823_000.jpg',
  '201901/01/20190101_154855_000.jpg'],
 'after': ['201901/01/2

In [15]:
import json
import os
from unidecode import unidecode

# Transliterate the place fields of every record with unidecode.
#   - str   -> transliterated string
#   - list  -> each element transliterated
#   - missing value -> the placeholder "NONE"
#   - anything else is printed for inspection and left untouched
fields_to_fix = ["address", "location", "region"]
for image in info_dict:
    record = info_dict[image]
    for field in fields_to_fix:
        value = record[field]
        if isinstance(value, str):
            record[field] = unidecode(value)
        elif isinstance(value, list):
            record[field] = [unidecode(s) for s in value]
        elif pd.isna(value):
            # pd.isna also accepts None and pd.NA, for which np.isnan raises
            # TypeError; float NaN is handled exactly as before.
            record[field] = "NONE"
        else:
            print(field, value)

In [16]:
# Persist the full per-image records. The context manager guarantees the file
# is flushed and closed once the dump completes.
with open("files/info_dict.json", "w") as info_out:
    json.dump(info_dict, info_out)

# PREPARE BACKEND

In [17]:
# Distinct lower-cased, stripped location names, without the "none" placeholder.
locations = set([img["location"].lower().strip() for img in info_dict.values()])
locations.discard("none")
extra = set()

# For each location, list the name itself followed by its word n-grams
# (n >= 2), longest-last before the final reversal.
location_with_extras = {}
for loc in locations:
    words = loc.split()
    # Word n-grams exist only for n <= len(words). The previous bound,
    # len(loc), counted characters and so ran many empty passes; the output is
    # unchanged. The n == len(words) pass yields the whole name, so a
    # single-spaced multi-word name still appears twice in its list, exactly
    # as in the existing file.
    variants = [" ".join(ngram)
                for lengram in range(2, len(words) + 1)
                for ngram in ngrams(words, lengram)]
    variants.append(loc)
    location_with_extras[loc] = variants[::-1]

# The context manager closes the output file once the dump completes.
with open('files/backend/locations.json', 'w') as locations_out:
    json.dump(location_with_extras, locations_out)
print(len(locations))

779


In [18]:
# Distinct lower-cased, stripped region names across all records.
regions = set([loc.lower().strip() for img in info_dict.values()
               for loc in img["region"]])
# The context manager closes the output file once the dump completes.
with open('files/backend/regions.json', 'w') as regions_out:
    json.dump(list(regions), regions_out)

In [19]:
# Emit the region list as a JS module with a default export.
with open("/home/tlduyen/LSC2020/LSC2020/ui/src/regions.js", 'w') as f:
    f.write(f"var regions={json.dumps(list(regions))};\n\nexport default regions;")

In [20]:
# Count how many records carry each description keyword.
all_keywords_counter = Counter(w for img in info_dict.values() for w in img["descriptions"])

# The context manager closes the output file once the dump completes.
with open('files/backend/all_keywords.json', 'w') as keywords_out:
    json.dump(all_keywords_counter, keywords_out)
# all_keywords_counter

In [21]:
# Keyword co-occurrence counts: overlap[a][b] is incremented once for every
# (a, b) pairing of two different, non-empty description entries within the
# same record.
overlap = defaultdict(lambda: defaultdict(int))
for img in info_dict.values():
    # Drop empty entries once instead of re-testing them in both loops.
    tags = [w for w in img["descriptions"] if w]
    for w in tags:
        for w2 in tags:
            if w2 != w:
                overlap[w2][w] += 1

# The context manager closes the output file once the dump completes.
with open('files/backend/overlap_keywords.json', 'w') as overlap_out:
    json.dump(overlap, overlap_out)

In [22]:
def filter_dict(image):
    """Return the reduced record for ``image`` that goes into ``basic_dict``.

    Only the "group", "scene", "time", "gps" and "location" entries of the
    image's ``info_dict`` record are copied; a missing key raises ``KeyError``.
    """
    record = info_dict[image]
    wanted = ("group", "scene", "time", "gps", "location")
    return dict((key, record[key]) for key in wanted)

# Reduced per-image records for the backend, written with a context manager
# so the output file is closed once the dump completes.
basic_dict = {image: filter_dict(image) for image in info_dict}
with open('files/backend/basic_dict.json', 'w') as basic_out:
    json.dump(basic_dict, basic_out)

In [23]:
# Scene / group name -> display time label; filled by the loop over `groups`.
time_info = {}
def get_hour_minute(date_string):
    """Reformat an info_dict time string as a 12-hour clock label.

    ``date_string`` must match ``%Y/%m/%d %H:%M:00%z``; the result uses
    ``%I:%M%p`` (zero-padded hour, minutes, AM/PM marker).
    """
    parsed = datetime.strptime(date_string, "%Y/%m/%d %H:%M:00%z")
    return datetime.strftime(parsed, "%I:%M%p")

def get_final_time(first_info, last_info):
    """Join two time labels into a range, collapsing equal ends to one label."""
    return first_info if first_info == last_info else f"{first_info} - {last_info}"

# Label every scene with the clock time of its first and last image, and every
# group with the span from its first scene's start to its last scene's end.
# NOTE(review): `groups` is not defined in any cell shown in this notebook;
# the loop assumes each group has a "scenes" list of (scene_name, images)
# pairs — confirm against whatever produces it.
for group_name in groups:
    group_first_info = None
    group_last_info = None
    for scene_name, images in groups[group_name]["scenes"]:
        if len(images) == 0:
            # A scene without images has no time span; indexing it would fail.
            continue
        first_info = info_dict[images[0]]["time"]
        last_info = info_dict[images[-1]]["time"]
        if not group_first_info:
            group_first_info = first_info
        group_last_info = last_info
        time_info[scene_name] = get_final_time(get_hour_minute(first_info), get_hour_minute(last_info))
    if not group_first_info:
        # No scene contributed a time, so there is nothing to label.
        continue
    time_info[group_name] = get_final_time(get_hour_minute(group_first_info), get_hour_minute(group_last_info))

# The context manager closes the output file once the dump completes.
with open("files/backend/time_info.json", "w") as time_info_out:
    json.dump(time_info, time_info_out)

In [24]:
# Spot-check one scene label. A single time (no " - " range) means the
# scene's first and last images format to the same hour and minute.
time_info["S_36"]

'11:02AM'