In [9]:
import json
import numpy as np
import os
import pandas as pd
import cv2
import tqdm
import pickle
import random
import sys
import re
import glob
import shutil
import collections
import operator
%matplotlib inline
from matplotlib import pyplot as plt

In [10]:
# Folder with the annotated images; the VIA exports are read from its
# 'annotations' sub-folder.
# The VIA region attribute that carries the label must be named "CLASS".
path_all_annotated_image = os.path.join(
    'datasets',
    'KBF',
    'ANNOTATED_IMAGES',
)

# Load the annotations produced with VIA (VGG Image Annotator)

In [11]:
# Old exports are loaded as well, so there are more annotations here than will
# remain after preprocessing.
path_new_annotations = os.path.join(path_all_annotated_image, 'annotations', "*via_region_data*.json")

# Read every VIA export into its own frame, keyed by the export's file name.
dico_name_df = {}
for p in glob.glob(path_new_annotations):
    # basename follows the platform's path separator, so the key is the bare
    # file name on every OS (splitting on '\\' only worked on Windows).
    annotation_name = os.path.basename(p)
    # The context manager closes the file as soon as it has been parsed.
    with open(p) as f:
        dico = json.load(f)
    # Transpose so that each top-level JSON entry becomes one row.
    df_ = pd.DataFrame(dico).transpose()
    dico_name_df[annotation_name] = df_

# Tag each frame with the export it came from ...
for name,df_ in dico_name_df.items():
    df_['annotations_name'] = name

# ... then aggregate all of them with a single concat (no repeated copying in a
# loop). With no export found, fall back to an empty frame as before.
if dico_name_df:
    df = pd.concat(list(dico_name_df.values()), ignore_index=True)
else:
    df = pd.DataFrame()

print(df.shape)
df.head(3)

(372, 5)


Unnamed: 0,filename,size,regions,file_attributes,annotations_name
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json


# Preprocessing - remove bad annotations

In [12]:
# List the annotation exports that contributed at least one row.
df.loc[:, 'annotations_name'].unique()

array(['via_region_data (14).json'], dtype=object)

### remove annotations without masks

In [13]:
# Number of regions attached to each row, followed by how often each count occurs.
df['nbr_regions'] = df['regions'].map(len)
df['nbr_regions'].value_counts()

0    195
1    120
2     43
3     13
4      1
Name: nbr_regions, dtype: int64

In [14]:
# Drop the rows that have no region at all (an image may have two rows, one
# with regions and one without).
# .copy() makes the result an independent frame, so columns added to it in the
# following cells do not raise SettingWithCopyWarning.
df = df[df['nbr_regions'] != 0].copy()
print(df.shape)
df.head(3)

(177, 6)


Unnamed: 0,filename,size,regions,file_attributes,annotations_name,nbr_regions
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,2
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1


### remove annotations associated with images without any class

In [16]:
# Look at the regions stored for the first remaining row.
df['regions'].iat[0]

[{'shape_attributes': {'name': 'polygon',
   'all_points_x': [274, 305, 348, 336, 279],
   'all_points_y': [625, 632, 655, 682, 659]},
  'region_attributes': {'CLASS': 'F'}},
 {'shape_attributes': {'name': 'polygon',
   'all_points_x': [452, 473, 490, 495, 512, 500, 469, 450],
   'all_points_y': [1143, 1147, 1166, 1201, 1216, 1223, 1210, 1158]},
  'region_attributes': {'CLASS': 'G'}}]

In [17]:
# Flag every row in which at least one region has no 'CLASS' attribute
# (the flagged rows are dropped in a later step).
df['no_type'] = df['regions'].map(
    lambda regions: any(['CLASS' not in region['region_attributes'] for region in regions])
)
df.head(3)

Unnamed: 0,filename,size,regions,file_attributes,annotations_name,nbr_regions,no_type
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,2,False
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False


In [18]:
# Drop the rows flagged as no_type (per the original note, these stem from
# 'old' annotations) and report how many rows were lost.
x0 = df.shape[0]
# .copy() makes the result an independent frame, so columns assigned to it
# afterwards do not raise SettingWithCopyWarning.
df = df[~df['no_type']].copy()
x1 = df.shape[0]
print('We lost %d annotated images by removing no-type image'%(x0-x1))

We lost 0 annotated images by removing no-type image


### fix misspelled class labels in the regions

In [23]:
def mod_list_reg(li):
    """Upper-case the 'CLASS' label of every region in ``li``.

    The region dicts are modified in place and the same list object is
    returned.

    Parameters
    ----------
    li : list of dict
        Regions; each one holds a 'region_attributes' dict with a string
        'CLASS' entry.

    Returns
    -------
    list of dict
        ``li`` itself, with upper-cased labels.

    Raises
    ------
    KeyError
        If a region has no 'region_attributes' or no 'CLASS' entry.
    """
    for r in li:
        attributes = r['region_attributes']
        # upper() is sufficient: the former .replace('g','G') ran after it and
        # could never match, since no lowercase 'g' is left at that point.
        attributes['CLASS'] = attributes['CLASS'].upper()
    return li
# Normalise the class labels of every row's regions.
df['regions'] = df['regions'].map(mod_list_reg)

In [24]:
# Check that all misspellings are covered: gather the class label of every
# region and show the distinct values that remain.
li_reg = df['regions'].tolist()
li_possible_type = []
for reg in li_reg:
    try:
        # a failure on one region abandons the rest of this row's regions
        for r in reg:
            li_possible_type.append(
                r['region_attributes']['CLASS']
            )
    except Exception as e:
        # report the offending row and carry on with the next one
        print(reg, e)
print('There will be %d different class' % len(set(li_possible_type)))
print(set(li_possible_type))

There will be 5 different class
{'', 'R', 'GF', 'F', 'G'}


### remove multiple annotations for one image

# Count the rows per image file name and list the (name, count) pairs with the
# highest counts first.
li_filename = df['filename'].tolist()
ctest = sorted(
    collections.Counter(li_filename).items(),
    key=operator.itemgetter(1),
)[::-1]
ctest

In [10]:
# Uncomment to inspect the rows of this image before removing anything:
#df[df['filename']=='VERB01_CAM04B04_26.05.17_211203_CAA_C_544_2996_fish-cam04-2017-05-26_21_40403.jpg']

# Duplicates are removed one at a time, to be sure of (and understand) each removal.
print(df.shape)
# Drop the row of this image that comes from the named annotation export.
# .copy() makes the result an independent frame, so columns added to it
# afterwards do not raise SettingWithCopyWarning.
df = df[~((df['filename']=='VERB01_CAM04B04_26.05.17_211203_CAA_C_544_2996_fish-cam04-2017-05-26_21_40403.jpg') &\
  (df['annotations_name']=='special_situation_0_FINI_via_region_data (36).json'))].copy()
print(df.shape)

### add the image path to each row

In [25]:
# Build each image's path from the annotated-images folder and its file name.
df['path'] = df['filename'].map(
    lambda filename: os.path.join(path_all_annotated_image, filename)
)
df.head(3)

Unnamed: 0,filename,size,regions,file_attributes,annotations_name,nbr_regions,no_type,path
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,2,False,datasets\KBF\ANNOTATED_IMAGES\0.jpg
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\1.jpg
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\2.jpg


# Save

In [26]:
# Write the cleaned table into the annotated-images folder as a ';'-separated
# CSV, without the index column.
df.to_csv(
    os.path.join(path_all_annotated_image, 'image_info.csv'),
    sep=';',
    index=False,
)