In [2]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# step-1: get path of each xml file
xmlfiles = glob('./data_images/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [5]:
xmlfiles

['./data_images/Img1.xml',
 './data_images/Img10.xml',
 './data_images/Img11.xml',
 './data_images/Img12.xml',
 './data_images/Img13.xml',
 './data_images/Img14.xml',
 './data_images/Img15.xml',
 './data_images/Img16.xml',
 './data_images/img17.xml',
 './data_images/Img18.xml',
 './data_images/Img19.xml',
 './data_images/Img2.xml',
 './data_images/Img20.xml',
 './data_images/Img21.xml',
 './data_images/Img22.xml',
 './data_images/Img23.xml',
 './data_images/Img24.xml',
 './data_images/Img25.xml',
 './data_images/Img26.xml',
 './data_images/Img27.xml',
 './data_images/Img28.xml',
 './data_images/Img29.xml',
 './data_images/Img3.xml',
 './data_images/Img30.xml',
 './data_images/Img31.xml',
 './data_images/Img32.xml',
 './data_images/Img33.xml',
 './data_images/Img34.xml',
 './data_images/Img35.xml',
 './data_images/Img36.xml',
 './data_images/Img37.xml',
 './data_images/Img38.xml',
 './data_images/Img39.xml',
 './data_images/Img4.xml',
 './data_images/Img40.xml',
 './data_images/Img41.xm

In [6]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [7]:
parser_all = list(map(extract_text,xmlfiles))

In [8]:
data = reduce(lambda x, y : x+y,parser_all)

In [9]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [12]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,Img1.png,999,699,Africanized Honey Bee,2,943,117,699
1,Img10.jpeg,5494,3662,Aphids,1143,3927,675,3039
2,Img11.jpg,1920,1810,Aphids,59,1856,38,1688
3,Img12.jpg,1254,836,Aphids,155,1167,6,808
4,Img13.jpg,400,400,Aphids,47,362,51,384


In [10]:
df.shape

(48, 8)

In [11]:
df['name'].value_counts()

name
Colorado Potato Beetle        11
Africanized Honey Bee          9
Brown Marmorated Stink Bug     9
Armyworms                      8
Fruit Flies                    7
Aphids                         4
Name: count, dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  48 non-null     object
 1   width     48 non-null     object
 2   height    48 non-null     object
 3   name      48 non-null     object
 4   xmin      48 non-null     object
 5   xmax      48 non-null     object
 6   ymin      48 non-null     object
 7   ymax      48 non-null     object
dtypes: object(8)
memory usage: 3.1+ KB


In [14]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  48 non-null     object
 1   width     48 non-null     int32 
 2   height    48 non-null     int32 
 3   name      48 non-null     object
 4   xmin      48 non-null     int32 
 5   xmax      48 non-null     int32 
 6   ymin      48 non-null     int32 
 7   ymax      48 non-null     int32 
dtypes: int32(6), object(2)
memory usage: 2.0+ KB


In [15]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [16]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Img1.png,999,699,Africanized Honey Bee,2,943,117,699,0.472973,0.583691,0.941942,0.832618
1,Img10.jpeg,5494,3662,Aphids,1143,3927,675,3039,0.461412,0.5071,0.506735,0.645549
2,Img11.jpg,1920,1810,Aphids,59,1856,38,1688,0.498698,0.476796,0.935937,0.911602
3,Img12.jpg,1254,836,Aphids,155,1167,6,808,0.527113,0.486842,0.807018,0.95933
4,Img13.jpg,400,400,Aphids,47,362,51,384,0.51125,0.54375,0.7875,0.8325


In [17]:
images = df['filename'].unique()

In [18]:
len(images)

43

In [19]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [20]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [21]:
len(img_train), len(img_test)

(34, 9)

In [22]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [23]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Img1.png,999,699,Africanized Honey Bee,2,943,117,699,0.472973,0.583691,0.941942,0.832618
4,Img13.jpg,400,400,Aphids,47,362,51,384,0.51125,0.54375,0.7875,0.8325
6,Img15.jpg,2800,1953,Armyworms,375,1610,717,1848,0.354464,0.656682,0.441071,0.579109
7,Img16.jpg,2592,1944,Armyworms,880,1989,456,1366,0.553434,0.468621,0.427855,0.468107
8,img17.jpg,1280,720,Armyworms,121,1170,201,720,0.504297,0.639583,0.819531,0.720833


In [24]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,Img10.jpeg,5494,3662,Aphids,1143,3927,675,3039,0.461412,0.5071,0.506735,0.645549
2,Img11.jpg,1920,1810,Aphids,59,1856,38,1688,0.498698,0.476796,0.935937,0.911602
3,Img12.jpg,1254,836,Aphids,155,1167,6,808,0.527113,0.486842,0.807018,0.95933
5,Img14.jpg,1440,1920,Armyworms,364,1084,289,1624,0.502778,0.498177,0.5,0.695312
17,Img24.jpg,800,550,Brown Marmorated Stink Bug,7,792,28,502,0.499375,0.481818,0.98125,0.861818


In [26]:
# label encoding
def label_encoding(x):
    labels = {'Colorado Potato Beetle':0, 'Africanized Honey Bee':1, 'Brown Marmorated Stink Bug':2, 'Armyworms':3, 'Fruit Flies':4, 'Aphids':5}
    return labels[x]

In [27]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [28]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,Img1.png,999,699,Africanized Honey Bee,2,943,117,699,0.472973,0.583691,0.941942,0.832618,1
4,Img13.jpg,400,400,Aphids,47,362,51,384,0.51125,0.54375,0.7875,0.8325,5
6,Img15.jpg,2800,1953,Armyworms,375,1610,717,1848,0.354464,0.656682,0.441071,0.579109,3
7,Img16.jpg,2592,1944,Armyworms,880,1989,456,1366,0.553434,0.468621,0.427855,0.468107,3
8,img17.jpg,1280,720,Armyworms,121,1170,201,720,0.504297,0.639583,0.819531,0.720833,3
9,Img18.jpg,640,840,Armyworms,13,620,106,610,0.494531,0.42619,0.948438,0.6,3
10,Img19.jpg,1200,882,Armyworms,488,1098,198,457,0.660833,0.371315,0.508333,0.293651,3
11,Img19.jpg,1200,882,Armyworms,88,1016,497,810,0.46,0.74093,0.773333,0.354875,3
12,Img2.jpg,680,350,Africanized Honey Bee,146,607,33,305,0.553676,0.482857,0.677941,0.777143,1
13,Img20.jpg,1600,795,Armyworms,117,1444,286,638,0.487812,0.581132,0.829375,0.442767,3


In [29]:
import os
from shutil import move

In [30]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [31]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [43]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [44]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [45]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [46]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

FileNotFoundError: [WinError 2] The system cannot find the file specified