In [1]:
# import some packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [2]:
# Path to images
im_pth = "../data/images/"
label_pth = "../data/labels/AnnotationsCombined.csv"
test_label_pth = "../data/labels/ucf_train_test/test_images.csv"

annotations = pd.read_csv(label_pth)
test_files = pd.read_csv(test_label_pth)
#annotations.head()

#Make a list of files in the test set
test_file_list = test_files.imagename.tolist()

In [3]:
#Some utility functions

def get_bounding_box(rsa_dict):
    
    #Get a bounding box for each segmentation
    
    if 'name' in rsa_dict.keys():
    
        if rsa_dict['name'] == 'rect':
        # Coordinates for rectangle
            xl = rsa_dict['x']
            xr = rsa_dict['x'] + rsa_dict['width'] 
            yb = rsa_dict['y'] 
            yt = rsa_dict['y'] + rsa_dict['height'] 
            shape = rsa_dict['name']

        elif rsa_dict['name'] == 'polygon':
        #Coordinates for a polygon
            xl = min(rsa_dict['all_points_x'])
            xr = max(rsa_dict['all_points_x'])
            yb = min(rsa_dict['all_points_y'])
            yt = max(rsa_dict['all_points_y'])
            shape = rsa_dict['name']
            
        elif rsa_dict['name'] == 'circle':
        #coordinates for a circle
            xl = rsa_dict['cx'] - rsa_dict['r']
            xr = rsa_dict['cx'] + rsa_dict['r']
            yb = rsa_dict['cy'] - rsa_dict['r']
            yt = rsa_dict['cy'] + rsa_dict['r']
            shape = rsa_dict['name']
            
        elif rsa_dict['name'] == 'ellipse':
        # Coordinates for an ellipse
            xl = rsa_dict['cx'] - rsa_dict['rx']
            xr = rsa_dict['cx'] + rsa_dict['rx']
            yb = rsa_dict['cy'] - rsa_dict['ry']
            yt = rsa_dict['cy'] + rsa_dict['ry']
            shape = rsa_dict['name']
        
        #Write to tuple
        bbox = ((xl, yb), (xl, yt), (xr, yb), (xr, yt))
        
        
    else:
        # Some images have no annotation, return 4 pixels.
        bbox = ((None,None), (None, None), (None, None), (None, None))
        shape = 'none'
    
    info = {"shape": shape, 'bbox':bbox}
    return info


def get_label(ra_dict):
    
    #Get the type of defect it is
    
    if 'Defect_Class' in ra_dict.keys():
    #Defects are annotated as <parent>_<defect> or <unknown>
        split_string = ra_dict['Defect_Class'].split('_')

        if len(split_string) < 2:
        #Captures the 'unkown'
            parent = split_string[0]
            defect = split_string[0]
        
        else:
        #Capture the true values we are interested.
            parent = split_string[0]
            defect = split_string[1]
        
    else:
    # No defect found
        parent = 'None'
        defect = 'None'
    
    return (parent, defect)

In [4]:
#Parse the region_shape_attribute column to get the coordinates of a bounding box
rsa = [get_bounding_box(x) for x in annotations.region_shape_attributes.apply(lambda x: json.loads(x))]

#Parse the region_attribute column to get the coordinates of the label name
ra = [get_label(x) for x in annotations.region_attributes.apply(lambda x: json.loads(x))]

In [5]:
# Check the dimensions are still OK (then we can presume still in order...)
if len(annotations) == len(ra) == len(rsa):

    #Create a new dataframe with parsed info:
    parsed_info = pd.DataFrame(
        {
            "annotation_shape": [x['shape'] for x in rsa],
            "bounding_box_coords": [x['bbox'] for x in rsa],
            "parent_label": [x[0] for x in ra],
            "defect_class": [x[1] for x in ra]
        }
    )

parsed_info.head()

Unnamed: 0,annotation_shape,bounding_box_coords,parent_label,defect_class
0,rect,"((120, 201), (120, 218), (126, 201), (126, 218))",Contact,NearSolderPad
1,rect,"((297, 201), (297, 226), (300, 201), (300, 226))",Contact,NearSolderPad
2,rect,"((325, 200), (325, 219), (329, 200), (329, 219))",Contact,NearSolderPad
3,rect,"((332, 199), (332, 218), (337, 199), (337, 218))",Contact,NearSolderPad
4,rect,"((288, 296), (288, 316), (292, 296), (292, 316))",Contact,FrontGridInterruption


In [6]:
# Join together on df index - note the data must be unshuffled!
combined_df = annotations.join(parsed_info)
combined_df.head()

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes,annotation_shape,bounding_box_coords,parent_label,defect_class
0,Bent Cure 0Pa TestName_20201222_165250.586_C00...,37387,{},7,0,"{""name"":""rect"",""x"":120,""y"":201,""width"":6,""heig...","{""Defect_Class"":""Contact_NearSolderPad""}",rect,"((120, 201), (120, 218), (126, 201), (126, 218))",Contact,NearSolderPad
1,Bent Cure 0Pa TestName_20201222_165250.586_C00...,37387,{},7,1,"{""name"":""rect"",""x"":297,""y"":201,""width"":3,""heig...","{""Defect_Class"":""Contact_NearSolderPad""}",rect,"((297, 201), (297, 226), (300, 201), (300, 226))",Contact,NearSolderPad
2,Bent Cure 0Pa TestName_20201222_165250.586_C00...,37387,{},7,2,"{""name"":""rect"",""x"":325,""y"":200,""width"":4,""heig...","{""Defect_Class"":""Contact_NearSolderPad""}",rect,"((325, 200), (325, 219), (329, 200), (329, 219))",Contact,NearSolderPad
3,Bent Cure 0Pa TestName_20201222_165250.586_C00...,37387,{},7,3,"{""name"":""rect"",""x"":332,""y"":199,""width"":5,""heig...","{""Defect_Class"":""Contact_NearSolderPad""}",rect,"((332, 199), (332, 218), (337, 199), (337, 218))",Contact,NearSolderPad
4,Bent Cure 0Pa TestName_20201222_165250.586_C00...,37387,{},7,4,"{""name"":""rect"",""x"":288,""y"":296,""width"":4,""heig...","{""Defect_Class"":""Contact_FrontGridInterruption""}",rect,"((288, 296), (288, 316), (292, 296), (292, 316))",Contact,FrontGridInterruption


In [7]:
combined_df[["filename","annotation_shape","bounding_box_coords","parent_label","defect_class"]].head()

Unnamed: 0,filename,annotation_shape,bounding_box_coords,parent_label,defect_class
0,Bent Cure 0Pa TestName_20201222_165250.586_C00...,rect,"((120, 201), (120, 218), (126, 201), (126, 218))",Contact,NearSolderPad
1,Bent Cure 0Pa TestName_20201222_165250.586_C00...,rect,"((297, 201), (297, 226), (300, 201), (300, 226))",Contact,NearSolderPad
2,Bent Cure 0Pa TestName_20201222_165250.586_C00...,rect,"((325, 200), (325, 219), (329, 200), (329, 219))",Contact,NearSolderPad
3,Bent Cure 0Pa TestName_20201222_165250.586_C00...,rect,"((332, 199), (332, 218), (337, 199), (337, 218))",Contact,NearSolderPad
4,Bent Cure 0Pa TestName_20201222_165250.586_C00...,rect,"((288, 296), (288, 316), (292, 296), (292, 316))",Contact,FrontGridInterruption


In [10]:
# I want to know how many files gives how many instances by class

#Make a summary of the complete dataset
summary_1 = combined_df[['defect_class','filename']]\
    .groupby('defect_class')\
    .agg({"filename":['count', 'nunique']})\
    .reset_index()
summary_1.columns = ['defect_class','n_instances_all', 'n_files_all']
#summary_1

#Make a summary of the test dataset
test_data_summary = combined_df[combined_df['filename'].isin(test_file_list)]\
    [['defect_class','filename']]\
    .groupby('defect_class')\
    .agg({"filename":['count', 'nunique']})\
    .reset_index()
test_data_summary.columns = ['defect_class','n_instances_test', 'n_files_test']
#test_data_summary

#join and compare. 
tt_df = summary_1.merge(test_data_summary, left_on='defect_class', right_on = 'defect_class', how = 'left')\
            .fillna(0)
tt_df['pct_test'] = round(tt_df['n_instances_test'] / tt_df['n_instances_all'] * 100, 2)
tt_df[['defect_class', 'n_instances_all', 'n_instances_test', 'pct_test']]

Unnamed: 0,defect_class,n_instances_all,n_instances_test,pct_test
0,BeltMarks,16,0.0,0.0
1,BrightSpot,675,124.0,18.37
2,Closed,8464,1615.0,19.08
3,Corrosion,145,25.0,17.24
4,Disconnected,361,0.0,0.0
5,FrontGridInterruption,40511,7723.0,19.06
6,HighlyResistive,895,159.0,17.77
7,Isolated,4453,838.0,18.82
8,NearSolderPad,20359,4026.0,19.78
9,,5236,0.0,0.0


In [11]:
## I want to see which shape of segmentation annotation corresponds to each defect, 
#filter count sort - summary of annotations
combined_df[['parent_label','defect_class','annotation_shape']]\
    .value_counts()\
    .reset_index()\
    .sort_values(['parent_label','defect_class'])

Unnamed: 0,parent_label,defect_class,annotation_shape,0
20,Contact,BeltMarks,rect,12
23,Contact,BeltMarks,polygon,4
11,Contact,Corrosion,polygon,138
22,Contact,Corrosion,rect,7
0,Contact,FrontGridInterruption,rect,38621
6,Contact,FrontGridInterruption,polygon,1890
1,Contact,NearSolderPad,rect,19153
7,Contact,NearSolderPad,polygon,1206
2,Crack,Closed,polygon,8376
13,Crack,Closed,rect,85


In [12]:
#Now plot...