In [11]:
import os
import json
import errno
import zipfile

from shutil import rmtree

from zipfile import ZipFile
from tqdm import tqdm
from glob import iglob
from os.path import join as pjoin

In [3]:
def mkdir_p(path):
    """Create directory ``path`` together with any missing parents (like ``mkdir -p``).

    Calling it on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If ``path`` exists but is not a directory, or the directory
            cannot be created for any other reason.
    """
    # exist_ok=True only tolerates an existing *directory*; an existing regular
    # file at ``path`` still raises, matching the previous EEXIST/isdir check.
    os.makedirs(path, exist_ok=True)
def del_folder(path):
    """Recursively delete the directory tree at ``path`` on a best-effort basis.

    Filesystem errors (missing path, not a directory, permission problems, ...)
    are deliberately ignored so the call is safe when there is nothing to delete.

    Args:
        path: Directory to remove.

    Returns:
        None.
    """
    try:
        rmtree(path)
    except OSError:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt, SystemExit
        # and programming errors are no longer silently swallowed.
        pass

def read_json(json_path):
    """Load and return the parsed content of the JSON file at ``json_path``.

    The file is decoded explicitly as UTF-8 (the encoding ``write_json`` uses)
    instead of the platform default, so non-ASCII text such as Korean labels is
    read identically on every machine.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized Python object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(save_path, json_obj):
    """Serialize ``json_obj`` to ``save_path`` as tab-indented JSON.

    The target file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced.

    Args:
        save_path: Destination file path.
        json_obj: JSON-serializable object to write.

    Returns:
        None.
    """
    with open(save_path, mode='w', encoding='utf-8') as out_file:
        json.dump(json_obj, fp=out_file, indent="\t")

def get_extension(file_path):
    """Return the text that follows the last ``.`` in ``file_path``.

    The dot itself is not included. When ``file_path`` contains no dot at all,
    the whole string is returned unchanged.
    """
    _head, _dot, tail = file_path.rpartition(".")
    return tail


In [37]:
# Project root. NOTE(review): hard-coded absolute path; it has to be edited
# when the notebook is run from a different location.
ROOT_DIR = "/root/pc-tod"
# Dataset directory (under the project root) that the cells below operate on.
DATA_DIR = pjoin(ROOT_DIR, "data/new_sample")

In [38]:
# Sub-directories of DATA_DIR that the cells below scan for annotation
# archives/JSON files and for image archives/JPG files respectively.
ANNOT_DIR = pjoin(DATA_DIR, "annotations")
IMAGE_DIR = pjoin(DATA_DIR, "images")

In [39]:
# Maps the Korean emotion label found in each zip file name to the English
# name of the directory that the archive is extracted into.
emotion_kor_to_eng = {
    "기쁨" : "happy",  # joy
    "당황" : "embarrassing",  # embarrassment / being flustered
    "분노" : "angry",  # anger
    "불안" : "unrest",  # anxiety
    "상처" : "hurt",  # hurt
    "슬픔" : "sad",  # sadness
    "중립" : "neutral",  # neutral
}

### **Decompress zip files**

In [40]:
# Annotations
# Extract every annotation archive into ANNOT_DIR/<english emotion name>.
# recursive=True is needed for "**" to match nested directories; without it
# glob treats "**" exactly like a single "*". The matches are collected first
# so tqdm can show a total and the tree is not modified while being scanned.
annot_zip_paths = sorted(iglob(pjoin(ANNOT_DIR, "**/*.zip"), recursive=True))
for zip_p in tqdm(annot_zip_paths):
    # The archive stem must consist of exactly three "_"-separated fields, the
    # middle one being the Korean emotion label; the other two are not used.
    zip_filename = os.path.splitext(os.path.basename(zip_p))[0]
    name_parts = zip_filename.split("_")
    if len(name_parts) != 3:
        raise ValueError(f"Unexpected zip file name: {zip_p}")
    emotion = name_parts[1]

    # An unknown emotion label raises KeyError here.
    tgt_dir = pjoin(ANNOT_DIR, emotion_kor_to_eng[emotion])

    with ZipFile(zip_p, 'r') as zip_ref:
        zip_ref.extractall(tgt_dir)

0it [00:00, ?it/s]


In [None]:
# Images
# Extract every image archive into IMAGE_DIR/<english emotion name>.
# recursive=True is needed for "**" to match nested directories; without it
# glob treats "**" exactly like a single "*". The matches are collected first
# so tqdm can show a total and the tree is not modified while being scanned.
image_zip_paths = sorted(iglob(pjoin(IMAGE_DIR, "**/*.zip"), recursive=True))
for zip_p in tqdm(image_zip_paths):
    # The archive stem must consist of exactly three "_"-separated fields, the
    # middle one being the Korean emotion label; the other two are not used.
    zip_filename = os.path.splitext(os.path.basename(zip_p))[0]
    name_parts = zip_filename.split("_")
    if len(name_parts) != 3:
        raise ValueError(f"Unexpected zip file name: {zip_p}")
    emotion = name_parts[1]

    # An unknown emotion label raises KeyError here.
    tgt_dir = pjoin(IMAGE_DIR, emotion_kor_to_eng[emotion])

    with ZipFile(zip_p, 'r') as zip_ref:
        zip_ref.extractall(tgt_dir)

### **Rename files**

In [41]:
import re
def remove_special_chars(text):
    """Return ``text`` with every character outside ``[0-9a-zA-Z_.-]`` removed.

    Non-ASCII characters (for example Hangul), whitespace and punctuation such
    as ``&`` are dropped; ASCII letters, digits, underscore, dot and hyphen are
    kept in their original order.
    """
    # Raw string: the previous non-raw literal relied on invalid string escapes
    # ('\_', '\.', '\-'), which newer Python versions warn about. The set of
    # matched characters is unchanged.
    return re.sub(r'[^0-9a-zA-Z_.\-]', '', text)

def del_duplicated_underbar(text):
    """Collapse every run of consecutive underscores in ``text`` into one ``_``."""
    # Raw string without the invalid '\_' escape used before; same matches.
    return re.sub(r'_+', '_', text)


In [43]:
# Annotations
# Strip every character outside [0-9a-zA-Z_.-] from annotation file names.
# recursive=True is needed for "**" to match nested directories; without it
# glob treats "**" exactly like a single "*". The matches are collected before
# renaming so the tree is not modified while it is still being scanned, and so
# tqdm can show a total.
annot_json_paths = sorted(iglob(pjoin(ANNOT_DIR, "**/*.json"), recursive=True))
for file_p in tqdm(annot_json_paths):
    dirname, orig_filename = os.path.split(file_p)
    filename = remove_special_chars(orig_filename)

    new_file_p = pjoin(dirname, filename)
    if file_p == new_file_p:
        # Name is already clean; nothing to do.
        continue

    # Two different original names may sanitize to the same result; refuse to
    # overwrite an existing file.
    assert not os.path.exists(new_file_p), f"Already exist: {new_file_p}"
    os.rename(file_p, new_file_p)

1it [00:00, 2943.37it/s]


In [44]:
# Images
# Strip every character outside [0-9a-zA-Z_.-] from image file names and
# collapse the underscore runs left behind by the removed characters.
# recursive=True is needed for "**" to match nested directories; without it
# glob treats "**" exactly like a single "*". The matches are collected before
# renaming so the tree is not modified while it is still being scanned, and so
# tqdm can show a total.
image_jpg_paths = sorted(iglob(pjoin(IMAGE_DIR, "**/*.jpg"), recursive=True))
for file_p in tqdm(image_jpg_paths):
    dirname, orig_filename = os.path.split(file_p)
    filename = del_duplicated_underbar(remove_special_chars(orig_filename))

    new_file_p = pjoin(dirname, filename)
    if file_p == new_file_p:
        # Name is already clean; nothing to do.
        continue

    # Two different original names may sanitize to the same result; refuse to
    # overwrite an existing file.
    assert not os.path.exists(new_file_p), f"Already exist: {new_file_p}"
    os.rename(file_p, new_file_p)

290it [00:00, 20683.38it/s]


In [45]:
# Load the training annotation file stored under ANNOT_DIR/happy for inspection.
# NOTE(review): the rename cells only rename files on disk; the 'filename'
# values inside this JSON are not rewritten by any cell shown here — confirm
# they are sanitized the same way before being matched against image files.
train_happy = read_json(pjoin(ANNOT_DIR, "happy", "img_emotion_training_data.json"))

In [47]:
# Number of top-level entries in the loaded annotation JSON.
len(train_happy)

60103

In [49]:
# Show the first entry to inspect the layout of a single annotation record.
train_happy[0]

{'filename': '5f656a0f627a3ef96dec882437e3e7ada1c7a877201cf54dcd7a2c4508588ff3_여_30_기쁨_공공시설&종교&의료시설_20201204105732-001-007.jpg',
 'gender': '여',
 'age': 30,
 'isProf': '전문인',
 'faceExp_uploader': '기쁨',
 'bg_uploader': '공공시설/종교/의료시설',
 'annot_A': {'boxes': {'maxX': 1912.2253,
   'maxY': 1581.6027,
   'minX': 1187.4949,
   'minY': 579.22235},
  'faceExp': '기쁨',
  'bg': '공공시설/종교/의료'},
 'annot_B': {'boxes': {'maxX': 1912.348108621648,
   'maxY': 1572.1522585800617,
   'minX': 1206.363701502596,
   'minY': 579.1777983055337},
  'faceExp': '기쁨',
  'bg': '공공시설/종교/의료'},
 'annot_C': {'boxes': {'maxX': 1890.909447114109,
   'maxY': 1567.448627450284,
   'minX': 1183.8414475546967,
   'minY': 596.9434661684523},
  'faceExp': '기쁨',
  'bg': '공공시설/종교/의료'}}