In [1]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import shutil

def xml_to_csv(path):
    xml_list = []
    for xml_file in glob.glob(path + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for member in root.findall('object'):
            value = (root.find('filename').text,
                     int(root.find('size')[0].text),
                     int(root.find('size')[1].text),
                     member[0].text,
                     int(member[4][0].text),
                     int(member[4][1].text),
                     int(member[4][2].text),
                     int(member[4][3].text)
                     )
            xml_list.append(value)
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

#sanitize labels
#7-eleven -> 7_eleven
#family mart -> family_mart
replace_dict = {"7-eleven":"7_eleven",
                "family mart":"family_mart"}

Move old data and separate XMLs and PNGs into folders

In [2]:
# path = "data/7_eleven_old/"
# xml_save_path = "data/xml_old/"
# png_save_path = "data/img/"

# for f in os.listdir(path):
#     if os.path.splitext(f)[1] == '.xml':
#         count += 1
#         xml_base_name = os.path.splitext(f)[0]+'.xml'
#         png_base_name = os.path.splitext(f)[0]+'.png'
#         xml_src_path = path+xml_base_name
#         png_src_path = path+png_base_name
#         shutil.copy2(xml_src_path, xml_save_path)
#         shutil.copy2(png_src_path, png_save_path)
        

In [3]:
def sanitize_label(df, replace_dict):
    """
    sanitize labels from dataframe
    Args:
        df: pandas dataframe
        replace_dict (str): dictionary containing sanitizing mapping
    """
    return df.replace(replace_dict)

This is an old data which might have overlapping filenames and only contain 1 class: `7-eleven`

In [4]:
old_data_path = "data/xml_old"
old_df = xml_to_csv(old_data_path)
old_df = sanitize_label(old_df, replace_dict)
old_df.filename = old_df.filename.apply(lambda x:"v01_"+x)
old_df.width = 640
old_df.height = 640
print(old_df['class'].unique(), old_df['class'].nunique())
print(old_df.head())
print(old_df.filename.nunique())

['7_eleven'] 1
                                            filename  width  height     class  \
0   v01_7eleven_35422222600000005_1396649526_120.png    640     640  7_eleven   
1   v01_7eleven_35422222600000005_1396649526_120.png    640     640  7_eleven   
2  v01_7eleven_3543121970000001_13965794530000002...    640     640  7_eleven   
3             v01_7eleven_354588697_1396072656_0.png    640     640  7_eleven   
4    v01_7eleven_355101339_14010322340000002_120.png    640     640  7_eleven   

   xmin  ymin  xmax  ymax  
0   217   294   348   333  
1   315   188   335   218  
2   571   271   638   311  
3     1   258    85   303  
4   498   288   607   312  
331


This is new data

In [5]:
data_path = "data/xml"
df = xml_to_csv(data_path)
df = sanitize_label(df, replace_dict)
df.width = 640
df.height = 640
print(df['class'].unique(), df['class'].nunique())
print(df.filename.nunique())
print(df.head())

['lawson' '7_eleven' 'family_mart'] 3
1346
                                 filename  width  height     class  xmin  \
0     lawson_35.744331_139.925114_120.png    640     640    lawson   383   
1  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven     1   
2  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven    96   
3  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven   556   
4   lawson_35.7040348_139.6135523_300.png    640     640    lawson     1   

   ymin  xmax  ymax  
0   272   640   328  
1   162   277   298  
2   329   128   365  
3   110   616   176  
4   255   123   313  


In [6]:
df = df.append(old_df, ignore_index=True)
print(df['class'].unique(), df['class'].nunique())
print(df.filename.nunique())
print(df.head())

['lawson' '7_eleven' 'family_mart'] 3
1677
                                 filename  width  height     class  xmin  \
0     lawson_35.744331_139.925114_120.png    640     640    lawson   383   
1  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven     1   
2  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven    96   
3  7-eleven_35.7040431_139.6634282_60.png    640     640  7_eleven   556   
4   lawson_35.7040348_139.6135523_300.png    640     640    lawson     1   

   ymin  xmax  ymax  
0   272   640   328  
1   162   277   298  
2   329   128   365  
3   110   616   176  
4   255   123   313  


In [7]:
pd.crosstab(df['class'], df['class'])

class,7_eleven,family_mart,lawson
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7_eleven,779,0,0
family_mart,0,918,0
lawson,0,0,969


In [8]:
df.to_csv("data/csv/all_labels.csv")