In [39]:
import os
import re
import cv2
import time
import json
import errno
import random
import torch
import openai

import pandas as pd
import numpy as np

from tqdm import tqdm
from glob import iglob
from PIL import Image

from functools import reduce
from shutil import rmtree
from typing import Optional, Tuple, Dict, List
from os.path import join as pjoin

from torch.utils.data import Dataset


In [2]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories, like ``mkdir -p``.

    Succeeds silently when the directory already exists. Raises ``OSError``
    (``FileExistsError``) when ``path`` exists but is not a directory, and
    propagates any other ``OSError`` raised by ``os.makedirs``.
    """
    # exist_ok=True has the same "an existing directory is fine" semantics as
    # a manual errno.EEXIST + isdir() check, in a single call.
    os.makedirs(path, exist_ok=True)
def del_folder(path):
    """Recursively delete the directory ``path`` on a best-effort basis.

    Filesystem errors (missing path, permission problems, ...) are ignored so
    the call is safe when the folder does not exist. Only ``OSError`` is
    suppressed: ``KeyboardInterrupt``, ``SystemExit`` and programming errors
    such as passing a non-path object still propagate.
    """
    try:
        rmtree(path)
    except OSError:
        # Deliberately best-effort: nothing to do if the tree cannot be removed.
        pass


def read_json(json_path):
    """Load and return the JSON document stored at ``json_path``.

    The file is decoded as UTF-8 (matching ``write_json``) instead of the
    platform default encoding, so non-ASCII text such as Korean file names is
    read the same way on every machine.

    Raises ``OSError`` if the file cannot be opened and
    ``json.JSONDecodeError`` if its content is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(save_path, json_obj):
    """Serialize ``json_obj`` to ``save_path`` as tab-indented JSON.

    The file is opened in UTF-8 text mode and overwritten if it exists.
    Returns ``None``.
    """
    with open(save_path, mode='w', encoding='utf-8') as out_file:
        json.dump(obj=json_obj, fp=out_file, indent="\t")

In [7]:
# Root of the DST data. Set the DST_DATA_DIR environment variable to point the
# notebook at another location without editing this cell; when it is unset the
# original hard-coded path is used.
DATA_DIR = os.environ.get("DST_DATA_DIR", "/data/ai_hub/dst/")
# Annotation JSON files (e.g. dev.json, train.json) are read from this sub-folder.
ANNOT_DIR = pjoin(DATA_DIR, "annotations")

In [8]:
# Load the dev-split annotations and display how many records the file holds.
dev_annot = read_json(pjoin(ANNOT_DIR, "dev.json"))
len(dev_annot)

274

In [9]:
# Display the first record to inspect the annotation schema.
dev_annot[0]

{'id': 311,
 'filename': '25ba191dd4beed1d666725510ccc4a968fdcb567dac81adba024c720d82414af_남_20_중립_문화재 및 유적지_20201204233820-009-005.jpg',
 'age': 20,
 'sex': 'male',
 'emotion': 'neutral',
 'dialogue': [{'turn_id': 0,
   'system': 'How are you feeling today?',
   'user': "I'm feeling alright ",
   'belief_state': {}}]}

In [35]:
from transformers import AutoProcessor, AutoModelForCausalLM

class MultimodalDstDataset(Dataset):
    """Dataset of image-grounded dialogue-state-tracking samples.

    The annotation JSON is a list of records with ``filename``, ``sex``,
    ``age``, ``emotion`` and ``dialogue`` keys. A dialogue with ``n`` turns is
    expanded into ``n`` samples:

    * sample 0 starts from an empty context and targets the user's state
      (the record's ``emotion``) followed by the first system utterance;
    * sample ``i + 1`` starts from the history up to turn ``i`` and targets
      turn ``i``'s belief state followed by the next system utterance.

    Every sample is paired with the record's image, so ``inputs``,
    ``input_masks``, ``labels`` and ``visions`` all have one entry per sample
    and ``len(dataset)`` is the number of samples.

    Args:
        json_path: Path of the annotation JSON file.
        vision_dir: Directory that contains the image files.
        processor: Ready-made processor exposing ``.tokenizer`` and an image
            ``__call__``. Ignored when ``model_path`` is given.
        model_path: Name or path passed to ``AutoProcessor.from_pretrained``.
        max_length: Maximum token length; longer sequences are truncated.
        seed: Seed used to shuffle the records.

    Raises:
        ValueError: If neither ``processor`` nor ``model_path`` is provided.

    NOTE(review): prompts and targets are tokenized independently and are not
    padded, so ``input_ids`` and ``labels`` differ in length; confirm that the
    collate function / training loop expects this.
    """

    def __init__(self, 
                 json_path, 
                 vision_dir, 
                 processor=None, 
                 model_path=None, 
                 max_length=512, 
                 seed=19
                 ):
        # Fail fast with a clear message instead of an AttributeError on
        # ``None.tokenizer`` later in build_dataset().
        if processor is None and not model_path:
            raise ValueError("Either `processor` or `model_path` must be provided.")

        self.data = read_json(json_path)
        self.start_of_belief = "=> Belief State:"
        self.eob = "[EOB]"

        # A given model_path takes precedence over an explicitly passed processor.
        self.processor = processor if not model_path else AutoProcessor.from_pretrained(model_path)
        self.vision_dir = vision_dir
        self.max_length = max_length

        # Seeds the module-level RNG (kept as-is: later cells may rely on it).
        random.seed(seed)
        random.shuffle(self.data)

        self.build_dataset()

    def __len__(self):
        """Return the number of samples (not the number of dialogues)."""
        return len(self.inputs)

    def build_dataset(self):
        """Expand every dialogue into prompt/target/image triples and tokenize them.

        Populates ``self.inputs``, ``self.input_masks`` (tokenized prompts),
        ``self.labels`` (tokenized targets) and ``self.visions`` (image file
        names), all aligned index by index.
        """
        predicts, targets, visions = [], [], []

        for content in self.data:
            dialog = content['dialogue']
            if not dialog:
                # Nothing to predict for a record without any turn.
                continue

            # Image files are named <uid>_<sex>_<age>_<emotion>_<upload id>,
            # built from the first/last "_"-separated parts of the annotated
            # file name plus the normalized metadata fields.
            name_parts = content["filename"].split("_")
            uid, upload_id = name_parts[0], name_parts[-1]
            filename = "_".join([uid, content["sex"], str(content["age"]), content["emotion"], upload_id])

            user_state = dict(user_state=content["emotion"])
            context = "User:"
            # The opening sample also needs its image; omitting it shifts every
            # later sample onto the wrong picture.
            visions.append(filename)
            predicts.append(f"{context} {self.start_of_belief}")
            targets.append(f"{context} {self.start_of_belief} {str(user_state)} {self.eob} System: {dialog[0]['system']}")

            # The last turn has no following system utterance, so it only
            # contributes context, never a target of its own.
            for i in range(len(dialog) - 1):
                # Update context
                context += f" System: {dialog[i]['system']}" + f" User: {dialog[i]['user']}"
                belief_state = dialog[i]['belief_state']

                visions.append(filename)
                predicts.append(f"{context} {self.start_of_belief}")
                targets.append(f"{context} {self.start_of_belief} {str(belief_state)} {self.eob} System: {dialog[i + 1]['system']}")

        assert len(predicts) == len(targets) == len(visions)

        # Calling the tokenizer directly is the supported batch API
        # (``batch_encode_plus`` is deprecated).
        tokenizer = self.processor.tokenizer
        input_ = tokenizer(
            predicts, add_special_tokens=True, max_length=self.max_length, truncation=True
        )
        output_ = tokenizer(
            targets, add_special_tokens=True, max_length=self.max_length, truncation=True
        )
        self.inputs = input_["input_ids"]
        self.input_masks = input_["attention_mask"]
        self.labels = output_["input_ids"]
        self.visions = visions

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict.

        Keys: ``pixel_values`` (processor output for the RGB image, requested
        with ``return_tensors="pt"``), ``input_ids``, ``attention_mask`` and
        ``labels`` (token-id lists produced by ``build_dataset``).

        Raises:
            RuntimeError: If the image cannot be opened or processed; the
                original exception is attached as ``__cause__``.
        """
        image_path = pjoin(self.vision_dir, self.visions[index])
        try:
            # Process image
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
        except Exception as exc:
            # Surface the failing path instead of printing and then hitting an
            # UnboundLocalError on ``pixel_values`` below.
            raise RuntimeError(f"Failed to load example with image: {image_path}") from exc

        return dict(
            pixel_values=pixel_values,
            input_ids=self.inputs[index],
            attention_mask=self.input_masks[index],
            labels=self.labels[index],
        )

In [36]:
# Directory holding the image files. Set the DST_IMAGE_DIR environment variable
# to relocate it without editing this cell; when it is unset the original
# hard-coded path is used.
IMAGE_DIR = os.environ.get("DST_IMAGE_DIR", "/data/ai_hub/images/")

In [37]:
# Build the dev split. model_path is set, so the processor is loaded from it
# inside the dataset and the explicit processor=None is not used.
dev_dataset = MultimodalDstDataset(
    json_path=pjoin(ANNOT_DIR, "dev.json"),
    vision_dir=IMAGE_DIR,
    processor=None,
    model_path="microsoft/git-base",
    max_length=512,
    seed=19,
)

In [40]:
# Number of tokenized prompts built for the dev split.
len(dev_dataset.inputs)

988

In [41]:
# Build the train split. model_path is set, so the processor is loaded from it
# inside the dataset and the explicit processor=None is not used.
train_dataset = MultimodalDstDataset(
    json_path=pjoin(ANNOT_DIR, "train.json"),
    vision_dir=IMAGE_DIR,
    processor=None,
    model_path="microsoft/git-base",
    max_length=512,
    seed=19,
)

In [42]:
# Number of tokenized prompts built for the train split.
len(train_dataset.inputs)

2993