In [24]:
import os
from glob import glob
import pandas as pd
pd.options.mode.chained_assignment = None
from functools import reduce
from xml.etree import ElementTree as et

In [25]:
# Load XML files and store in list
xml_list = glob('./data_images/*.xml')

# replace \\ with //
xml_list = list(map(lambda x: x.replace('\\','/'), xml_list))

In [26]:
xml_list

['./data_images/001.xml',
 './data_images/003.xml',
 './data_images/004.xml',
 './data_images/005.xml',
 './data_images/006.xml',
 './data_images/007.xml',
 './data_images/008.xml',
 './data_images/009.xml',
 './data_images/010.xml',
 './data_images/011.xml',
 './data_images/012.xml',
 './data_images/013.xml',
 './data_images/014.xml',
 './data_images/015.xml',
 './data_images/016.xml',
 './data_images/019.xml',
 './data_images/020.xml',
 './data_images/021.xml',
 './data_images/022.xml',
 './data_images/023.xml',
 './data_images/024.xml',
 './data_images/025.xml',
 './data_images/026.xml',
 './data_images/027.xml',
 './data_images/028.xml',
 './data_images/029.xml',
 './data_images/030.xml',
 './data_images/031.xml',
 './data_images/032.xml',
 './data_images/033.xml',
 './data_images/034.xml',
 './data_images/035.xml',
 './data_images/038.xml',
 './data_images/039.xml',
 './data_images/040.xml',
 './data_images/041.xml',
 './data_images/042.xml',
 './data_images/043.xml',
 './data_ima

In [27]:
# read and extract file info: name, size(w,h), and
# object(name, xmin, xmax, ymin, ymax) -- have to parse

def extract_data(filename):
    tree = et.parse(filename)
    root = tree.getroot();

# extract name of file
    img_name = root.find('filename').text
    
    # dimensions of image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    
    # coordinates of object, loop for multiple objects in a file
    objs = root.findall('object')
    parse = []
    for obj in objs:
        name = obj.find('name').text
        boundbox = obj.find('bndbox')
        xmin = boundbox.find('xmin').text
        xmax = boundbox.find('xmax').text
        ymin = boundbox.find('ymin').text
        ymax = boundbox.find('ymax').text
        parse.append([img_name, width, height, name, xmin, xmax, ymin, ymax])
        #print(parse)
    return parse

In [28]:
parse_all = list(map(extract_data, xml_list))

In [29]:
data = reduce(lambda x, y: x+y, parse_all)

In [30]:
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [31]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,001.png,996,553,train,540,812,199,472
1,003.png,988,666,train,402,771,270,566
2,004.png,1132,892,train,438,1053,158,703
3,005.png,983,1287,train,1,417,434,863
4,006.png,996,1264,train,422,910,303,1090


In [32]:
df.shape

(51, 8)

In [33]:
df['name'].value_counts()

name
train    51
Name: count, dtype: int64

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  51 non-null     object
 1   width     51 non-null     object
 2   height    51 non-null     object
 3   name      51 non-null     object
 4   xmin      51 non-null     object
 5   xmax      51 non-null     object
 6   ymin      51 non-null     object
 7   ymax      51 non-null     object
dtypes: object(8)
memory usage: 3.3+ KB


In [35]:
# data type conversion - coords to integer
col = ['width','height','xmin','xmax','ymin','ymax']
df[col] = df[col].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  51 non-null     object
 1   width     51 non-null     int64 
 2   height    51 non-null     int64 
 3   name      51 non-null     object
 4   xmin      51 non-null     int64 
 5   xmax      51 non-null     int64 
 6   ymin      51 non-null     int64 
 7   ymax      51 non-null     int64 
dtypes: int64(6), object(2)
memory usage: 3.3+ KB


In [36]:
# YOLO labelling - center x & center y - width and height
# center is / 2 to get middle
# width and height will use values of bounding box
df['center_x'] = ((df['xmax'] + df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax'] + df['ymin'])/2)/df['height']
df['w'] = (df['xmax'] - df['xmin'])/df['width']
df['h'] = (df['ymax'] - df['ymin'])/df['height']

In [37]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,001.png,996,553,train,540,812,199,472,0.678715,0.606691,0.273092,0.493671
1,003.png,988,666,train,402,771,270,566,0.593623,0.627628,0.373482,0.444444
2,004.png,1132,892,train,438,1053,158,703,0.658569,0.482623,0.543286,0.610987
3,005.png,983,1287,train,1,417,434,863,0.212614,0.503885,0.423194,0.333333
4,006.png,996,1264,train,422,910,303,1090,0.668675,0.551028,0.48996,0.622627


In [38]:
imgs = df ['filename'].unique()
len(imgs)

50

In [39]:
imgs_df = pd.DataFrame(imgs, columns = ['filename'])
# select at random 80% of images to train..."train"...
imgs_train = tuple(imgs_df.sample(frac = 0.8)['filename'])

In [40]:
# remainder, .2
imgs_test = tuple(imgs_df.query(f' filename not in {imgs_train}')['filename'])

In [41]:
train_df = df.query(f' filename in {imgs_train}')
test_df = df.query(f' filename in {imgs_test}')

In [42]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,001.png,996,553,train,540,812,199,472,0.678715,0.606691,0.273092,0.493671
1,003.png,988,666,train,402,771,270,566,0.593623,0.627628,0.373482,0.444444
2,004.png,1132,892,train,438,1053,158,703,0.658569,0.482623,0.543286,0.610987
4,006.png,996,1264,train,422,910,303,1090,0.668675,0.551028,0.48996,0.622627
5,007.png,886,1158,train,82,368,264,542,0.25395,0.348014,0.322799,0.240069


In [43]:
# Assigning ID number to object names
# Just train
def label_encoding(x):
    label = {'train':1}
    return label[x]

In [44]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [45]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,001.png,996,553,train,540,812,199,472,0.678715,0.606691,0.273092,0.493671,1
1,003.png,988,666,train,402,771,270,566,0.593623,0.627628,0.373482,0.444444,1
2,004.png,1132,892,train,438,1053,158,703,0.658569,0.482623,0.543286,0.610987,1
4,006.png,996,1264,train,422,910,303,1090,0.668675,0.551028,0.48996,0.622627,1
5,007.png,886,1158,train,82,368,264,542,0.25395,0.348014,0.322799,0.240069,1
6,008.png,883,1221,train,183,841,295,998,0.579841,0.529484,0.745187,0.575758,1
7,009.png,1051,652,train,490,792,161,481,0.609895,0.492331,0.287345,0.490798,1
8,010.png,869,614,train,265,673,94,419,0.539701,0.417752,0.469505,0.529316,1
9,011.png,902,524,train,1,248,138,311,0.138027,0.428435,0.273836,0.330153,1
10,011.png,902,524,train,257,584,106,401,0.466186,0.483779,0.362528,0.562977,1


In [49]:
# saving images with labels
import os 
from shutil import move

In [50]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [51]:
cols = ['filename','id','center_x','center_y','w','h']
groupby_objtrain = train_df[cols].groupby('filename')
groupby_objtest = test_df[cols].groupby('filename')

In [72]:
# save image in folders with labels
def save_data(filename, folder_pth, group_obj):
    src = os.path.join('data_images', filename)
    dest = os.path.join(folder_pth, filename)
    # move images to folder
    move(src, dest)
    # save with labels as txt
    txt_filename = os.path.join(folder_pth, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(txt_filename, sep = ' ', index = False, header = False)

In [73]:
filename_series = pd.Series(groupby_objtrain.groups.keys())

In [75]:
filename_series.apply(save_data,args=(train_folder, groupby_objtrain))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
dtype: object

In [76]:
filename_seriesTest = pd.Series(groupby_objtest.groups.keys())
filename_seriesTest.apply(save_data,args=(test_folder, groupby_objtest))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object