In [1]:
import ast
import collections
import glob
import json
import operator
import os
import pickle
import random
import re
import shutil
import sys
from collections import Counter

import cv2
import numpy as np
import pandas as pd
import tqdm

%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Folder (relative to the working directory) holding the annotated images and image_info.csv.
_annotated_dir_parts = ('datasets', 'KBF', 'ANNOTATED_IMAGES')
path_all_annotated_image = os.path.join(*_annotated_dir_parts)

In [12]:
PACKAGE_PARENT = '..'
# Note: '__file__' below is a string literal, not the __file__ variable, so SCRIPT_DIR is the
# directory part of "<cwd>/__file__", i.e. the resolved current working directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
_package_root = os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))
# Register the parent folder only once so that re-running this cell does not keep growing sys.path.
if _package_root not in sys.path:
    sys.path.append(_package_root)

# Project helpers; they are importable only after the parent folder is on sys.path.
from UTILS.utils import from_vggbbox_get_vggpolygon, lists_remove_in1

In [13]:
#Each time new images are annotated, copy all images from the folder (i.e. even the ones which were not annotated) and put
#the annotation files in the annotations folder. Here we remove (i.e. move into the no_regions folder) all images which do not
#have one or more regions.

# Download annotation info

In [14]:
# Load the per-image annotation table (semicolon-separated CSV).
df = pd.read_csv(os.path.join(path_all_annotated_image,'image_info.csv'), index_col=False, sep=';')
# The 'regions' column holds the text form of a Python list of dicts. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would execute
# any expression found in the file.
df['regions'] = df['regions'].map(ast.literal_eval)
print(df.shape)
df.head(3)

(177, 8)


Unnamed: 0,filename,size,regions,file_attributes,annotations_name,nbr_regions,no_type,path
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,2,False,datasets\KBF\ANNOTATED_IMAGES\0.jpg
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\1.jpg
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\2.jpg


# Remove some classes

In [15]:
def trier(x,li_keep):
    """Filter a list of regions on their 'CLASS' region attribute.

    x: iterable of region dicts, each exposing ['region_attributes']['CLASS'].
    li_keep: container of the CLASS values to keep.
    Returns a new list with the kept regions, in their original order.
    """
    return [region for region in x if region['region_attributes']['CLASS'] in li_keep]
#small test
#trier(df['regions'].iloc[0], ['DO'])

In [16]:
# Keep, for every image, only the regions whose CLASS is one of the listed values.
classes_to_keep = ['R', 'GF', 'F', 'G']
df['regions'] = df['regions'].map(lambda regions: trier(regions, classes_to_keep))

# Info

In [17]:
# Collect the 'type' attribute of every region; regions with an empty type are only reported.
li_species = []
for reg in df['regions'].tolist():
    for region in reg:
        s = region['region_attributes']['type']
        if s != '':
            li_species.append(s)
        else:
            print('no type')
print('There is %d annotated images'%df.shape[0])
print('There is %d masks in total'%len(li_species))
# c maps each type to the number of regions carrying it.
c = dict(collections.Counter(li_species))
if 'EY' in c and 'EA' in c:
    print('There is %d eyes and %d ears'%(c['EY'],c['EA']))

There is 177 annotated images
There is 247 masks in total


In [18]:
# Report how many distinct classes were counted and list their names on the next line.
class_names = list(c.keys())
print('There is %d classes: %s'%(len(class_names),'\n'+' \t'.join(class_names)))

There is 4 classes: 
F 	G 	GF 	R


In [19]:
# Display the per-class counts (region 'type' -> number of regions with that type).
c

{'F': 26, 'G': 191, 'GF': 8, 'R': 22}

### species that exist

In [22]:
# This is the dictionary used in the config file: if a species is not listed here, add it both to
# this dictionary and to the config.
p = os.path.join(path_all_annotated_image, 'algo_file')
# Create the output folder if needed; exist_ok avoids the check-then-create race.
os.makedirs(p, exist_ok=True)

# Class name -> numeric class id.
dico_classid_id = {"F":1, "G":2, "GF":3, 'R':4}
# Use a context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(p,'dico_classid_id.pkl'), 'wb') as f:
    pickle.dump(dico_classid_id, f)
#note that if not all species are used there will have a higher number of class_id than their truely is

In [23]:
# List every counted class that has no entry in dico_classid_id.
print('Species that exist but wont be taken into account for model: ')
for k in c:
    if k in dico_classid_id:
        continue
    print('\t', k)

Species that exist but wont be taken into account for model: 


# Add info

In [24]:
# Read every image once and record its size (img.shape is (height, width, ...)).
li_width = []
li_height = []
for img_p in df['path'].tolist():
    img = cv2.imread(img_p)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns None;
        # fail here with the offending path instead of an opaque AttributeError.
        raise FileNotFoundError('could not read image: %s' % img_p)
    li_height.append(img.shape[0])
    li_width.append(img.shape[1])

# Assign the whole columns at once instead of filtering the frame for every image.
df['width'] = li_width
df['height'] = li_height
df['width'] = df['width'].astype(int)
df['height'] = df['height'].astype(int)

# Split in train and val

In [25]:
# Split the images randomly into a training and a validation set.
# Note that for now we do not split by date: it would not work for species for which we have a
# single date, and there is a lot of variation within a single day (some particular situations
# occur on one day only and we still want to validate on them). With more data we could split on
# an independent variable such as the date.
VAL_FRACTION = 0.15  # share of the images put in the validation set
SPLIT_SEED = 42      # fixed seed so that the split is identical on every run

li_id = df['filename'].tolist()
nbr_val_image = int(len(li_id)*VAL_FRACTION)
# Dedicated generator: reproducible without touching the global random state.
val_id = random.Random(SPLIT_SEED).sample(li_id, nbr_val_image)
# Set lookup keeps the train/val complement linear in the number of images.
val_lookup = set(val_id)
train_id = [x for x in li_id if x not in val_lookup]
print('There is %d images in the training set and %d in the validation set'%(len(train_id),len(val_id)))

There is 151 images in the training set and 26 in the validation set


# Save annotations for model

In [26]:
#moves each of these train images to the train folder
# Map each split name to the list of image filenames assigned to it.
dico_t_id = dict(train=train_id, val=val_id)

In [27]:
# Preview the first rows of the table, which now includes the width and height columns.
df.head(3)

Unnamed: 0,filename,size,regions,file_attributes,annotations_name,nbr_regions,no_type,path,width,height
0,0.jpg,70631,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,2,False,datasets\KBF\ANNOTATED_IMAGES\0.jpg,794,1437
1,1.jpg,65599,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\1.jpg,810,1437
2,2.jpg,66715,"[{'shape_attributes': {'name': 'polygon', 'all...",{},via_region_data (14).json,1,False,datasets\KBF\ANNOTATED_IMAGES\2.jpg,794,1437


In [28]:
# Gather the 'type' attribute of every region of every image.
li_reg = df['regions'].tolist()
li_possible_type = []
for reg in li_reg:
    try:
        for r in reg:
            li_possible_type.append(r['region_attributes']['type'])
    except Exception as e:
        # Best effort: report the entry that could not be read and keep scanning the others.
        print(reg, e)
print('There will be %d different class'%len(set(li_possible_type)))
print(set(li_possible_type))
# Types without a class id; 'AL' and 'N' are not reported because the annotation-building
# cell strips them from the annotations.
L2add = [x for x in set(li_possible_type) if x not in dico_classid_id.keys()]
L2add = [x for x in L2add if x not in ['AL', 'N']]
if len(L2add)>0:
    # Adjacent literals instead of a backslash continuation inside the string, which used to
    # embed the indentation of the second source line in the printed message.
    print('you should add these species to the config file, modify parameter NUM_CLASSES in the config file, '
          'and add them to the above dico dico_classid_id:')
    print(L2add)
    sys.exit()

There will be 4 different class
{'GF', 'G', 'F', 'R'}


### Create annotation

In [29]:
#update class_id_ when we have species, also update in utils_data_config
dico_t_classes = {}
for t in ['train', 'val']:
    
    li_allclasses = []
    #create annotation file for training/validation 
    path_annotation_for_model = os.path.join(path_all_annotated_image, 'annotation_'+t+'.pkl')

    #gather annotations
    li = []
    for filename in dico_t_id[t]:
        reg = df[df['filename']==filename]['regions'].values[0]
        try:
            li_classids = [reg[k]['region_attributes']['type'] for k in range(len(reg))]
            li_poly = [reg[k]['shape_attributes'] for k in range(len(reg))]

            #remove all the unwanted mask which could not be removed during annotations (type=N)
            #note that we keep the images, we jsut remove the annotation
            li_classids, li_poly = lists_remove_in1(li_classids, li_poly, 'N')

            #remove all AL on images (correct as it does not exist image with only one AL)
            li_classids, li_poly = lists_remove_in1(li_classids, li_poly, 'AL')

            #add info
            li.append({'height':df[df['filename']==filename]['height'].values[0], #482
                       'width':df[df['filename']==filename]['width'].values[0], #608
                       'filename':filename,
                       'class_id_':[dico_classid_id[x] for x in li_classids],
                       'polygons':li_poly})
            li_allclasses.extend(li_classids)
        except Exception as e:
            print(e, filename)

    #verify duplicates
    li_filename = [x['filename'] for x in li]
    if len(li_filename)!=len(set(li_filename)):
        print('ERROR: you have duplicates, verify which image has been annotated twice')
        sys.exit()

    #save
    pickle.dump(li, open(path_annotation_for_model, 'wb'))
    dico_t_classes[t] = list(set(li_allclasses))

print(len(li_filename),len(set(li_filename)))
from collections import Counter
ltest = [item for item, count in Counter(li_filename).items() if count > 1]
print(len(ltest))
df[df['filename']=='augmented_0_CYPY01_CAM02B02_02.06.16_183551_TRL_C.jpg']

### summaries

In [30]:
# Classes found in each split.
li_t, li_v = dico_t_classes['train'], dico_t_classes['val']
all_classes = set(li_v+li_t)
print('the following class are available: %s'% ', '.join(all_classes))
print('the following class are in validation: %s'% ', '.join(li_v))

# Stop if a class shows up in the validation set without any example in the training set.
missing_in_train = [cls for cls in li_v if cls not in li_t]
if missing_in_train:
    print('ERROR: some class are in the validation but not in the training set')
    sys.exit()

the following class are available: R, G, F, GF
the following class are in validation: GF, F, G, R
