# JSON Processing

In [None]:
%sh
cd data
wget https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/4h68fmktwh-1.zip
unzip 4h68fmktwh-1.zip    

In [2]:
import json
import boto3
from pprint import pprint

s3 = boto3.client("s3")

In [3]:
s3.download_file("md-datasets-cache-zipfiles-prod", "4h68fmktwh-1.zip", "data/construction.zip")

In [7]:
!unzip data/construction.zip -d data
!unzip data/image_with_captions.zip -d data
!rm data/construction.zip

Archive:  data/image_with_captions.zip
  inflating: data/UAVdata/1.jpg      
  inflating: data/UAVdata/10.jpg     
  inflating: data/UAVdata/100.jpg    
  inflating: data/UAVdata/1000.jpg   
  inflating: data/UAVdata/1001.jpg   
  inflating: data/UAVdata/1002.jpg   
  inflating: data/UAVdata/1003.jpg   
  inflating: data/UAVdata/1004.jpg   
  inflating: data/UAVdata/1005.jpg   
  inflating: data/UAVdata/1006.jpg   
  inflating: data/UAVdata/1007.jpg   
  inflating: data/UAVdata/1008.jpg   
  inflating: data/UAVdata/1009.jpg   
  inflating: data/UAVdata/101.jpg    
  inflating: data/UAVdata/1010.jpg   
  inflating: data/UAVdata/1011.jpg   
  inflating: data/UAVdata/1012.jpg   
  inflating: data/UAVdata/1013.jpg   
  inflating: data/UAVdata/1014.jpg   
  inflating: data/UAVdata/1015.jpg   
  inflating: data/UAVdata/1016.jpg   
  inflating: data/UAVdata/1017.jpg   
  inflating: data/UAVdata/1018.jpg   
  inflating: data/UAVdata/1019.jpg   
  inflating: data/UAVdata/102.jpg    
  inflating

In [309]:
class BoundingBox(object):
    def __init__(self, x, y, width, height):
        self.x = x
        self.y = y
        self.width = width
        self.height = height
    def __repr__(self):
        return str(self.__dict__)
    
class Token(object):
    def __init__(self, TokenId, Text, BeginOffset, EndOffset, Tag, Score):
        assert Score > 0.9, 'Not enough confidence on tag'
        self.TokenId = TokenId
        self.Text = Text
        self.LowerText = Text.lower()
        self.BeginOffset = BeginOffset
        self.EndOffset = EndOffset
        self.Tag = Tag
        self.Score = Score
    def __repr__(self):
        return str(self.__dict__)

        
class ImageAnnotation(object):
    COLORS = {'blue', 'red', 'yellow', 'green', 'white', 'silver', 'gray', 'grey', 'orange', 'black', 'brown', 'pink', 'purple', 'while'}
    STOPWORDS = {'elevens', 'fivr', 'plie', 'thirteens', 'pile', 'piles'}
    
    def __init__(self, index, bounding_box, phrase, comprehend=None):
        self.bb = bounding_box
        self.index = index
        self.phrase = phrase
        if (comprehend):
            self.update_tokens(comprehend)
        else:
            self.Tokens = None
            self.obj_class = None
        
    def _gen_tokens(self, comprehend):
        return(comprehend.detect_syntax(LanguageCode='en', Text=self.phrase)['SyntaxTokens'])
    
    def update_tokens(self, comprehend):
        self.Tokens = self._gen_tokens(comprehend) if len(self.phrase) else None
        if self.Tokens and len(self.Tokens):
            self.obj_class = self.get_class()
        else:
            self.obj_class = None
        
    def get_class(self):
        assert self.Tokens, "No tokenization available for class extraction"
        def get_object(tokens):
            noun_part = []
            for token in tokens:
                if token['PartOfSpeech']['Tag'] == 'VERB':
                    break
                if token['Text'].lower() in ImageAnnotation.COLORS:
                    continue
                if token['Text'].lower() in ImageAnnotation.STOPWORDS:
                    continue
                if token['PartOfSpeech']['Tag'] in ('ADJ', 'NOUN', 'PROPN'):
                    noun_part.append(token)
            return noun_part
        result = get_object(self.Tokens)
        if len(result) > 0:
            result = result[:-1] if result[-1] == 's' else result
        return(result)
        
    def __repr__(self):
        return str(self.__dict__)


class Image(object):
    def __init__(self, filename, size, annotations):
        self.filename = filename
        self.size = size
        self.annotations = annotations
        
    def classify_image(self):
        self.obj_classes = {annotation.obj_class for annotation in self.annotations if annotation.obj_class}
    
    def gen_tokens(self, comprehend):
        for annot in self.annotations:
            annot.update_tokens(comprehend)
            
    def __repr__(self):
        return str(self.__dict__)


In [310]:
def load_images(path, comprehend=None):
    with open(path, "r") as source_manifest:
        manifest = json.load(source_manifest)
    images = []
    for _, image in manifest.items():
        try:
            try:
                annotations = []
                for (index, annotation) in image['regions'].items():
                    shape_attrs = annotation['shape_attributes']
                    shape_attrs.pop('name')
                    phrase = annotation['region_attributes']['phrase']
                    annotations.append(ImageAnnotation(index, BoundingBox(**shape_attrs), phrase, comprehend ))
            except:
                pprint(annotation['shape_attributes'])
                raise
            images.append(
                Image(image['filename'], image['size'], annotations)
            )
        except:
            pprint(image)
            raise
    return(images)

In [312]:
comprehend = boto3.client(service_name='comprehend', region_name='us-east-1')
image_objs = load_images("data/via_region_data_final.json", comprehend)

In [313]:
image_objs[0].classify_image()

TypeError: unhashable type: 'list'

In [None]:
image_objs[0]