In [180]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [181]:
#step-1: get path of each xmlfile
xmlfiles = glob('./data_images/*.xml')
#replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [182]:
xmlfiles

['./data_images/D (1).xml',
 './data_images/D (10).xml',
 './data_images/D (100).xml',
 './data_images/D (11).xml',
 './data_images/D (12).xml',
 './data_images/D (13).xml',
 './data_images/D (14).xml',
 './data_images/D (15).xml',
 './data_images/D (16).xml',
 './data_images/D (17).xml',
 './data_images/D (18).xml',
 './data_images/D (19).xml',
 './data_images/D (2).xml',
 './data_images/D (20).xml',
 './data_images/D (21).xml',
 './data_images/D (22).xml',
 './data_images/D (23).xml',
 './data_images/D (24).xml',
 './data_images/D (25).xml',
 './data_images/D (26).xml',
 './data_images/D (27).xml',
 './data_images/D (28).xml',
 './data_images/D (29).xml',
 './data_images/D (3).xml',
 './data_images/D (30).xml',
 './data_images/D (31).xml',
 './data_images/D (32).xml',
 './data_images/D (33).xml',
 './data_images/D (34).xml',
 './data_images/D (35).xml',
 './data_images/D (36).xml',
 './data_images/D (37).xml',
 './data_images/D (38).xml',
 './data_images/D (39).xml',
 './data_images/

In [183]:
#step-2: read xml files
#from each xml file we need to extract
#filename, size(wicth, height), object(nam,xmin,xmax,ymin,ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    #extract file name
    image_name = root.find('filename').text
    #width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
    return parser

In [184]:
parser_all = list(map(extract_text,xmlfiles))

In [185]:
data = reduce(lambda x, y : x+y,parser_all)

In [186]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [187]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,D (1).jpg,227,227,Retak Diagonal,1,227,1,175
1,D (10).jpg,227,227,Retak Diagonal,40,227,103,227
2,D (100).jpg,227,227,Retak Diagonal,1,227,72,227
3,D (11).jpg,227,227,Retak Diagonal,1,227,56,227
4,D (12).jpg,227,227,Retak Diagonal,1,227,1,227


In [188]:
df.shape

(318, 8)

In [189]:
df['name'].value_counts()

name
Retak Diagonal      138
Retak Vertikal       90
Retak Horizontal     90
Name: count, dtype: int64

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  318 non-null    object
 1   width     318 non-null    object
 2   height    318 non-null    object
 3   name      318 non-null    object
 4   xmin      318 non-null    object
 5   xmax      318 non-null    object
 6   ymin      318 non-null    object
 7   ymax      318 non-null    object
dtypes: object(8)
memory usage: 20.0+ KB


In [191]:
#type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  318 non-null    object
 1   width     318 non-null    int32 
 2   height    318 non-null    int32 
 3   name      318 non-null    object
 4   xmin      318 non-null    int32 
 5   xmax      318 non-null    int32 
 6   ymin      318 non-null    int32 
 7   ymax      318 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 12.5+ KB


In [192]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']

# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [193]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,D (1).jpg,227,227,Retak Diagonal,1,227,1,175,0.502203,0.387665,0.995595,0.76652
1,D (10).jpg,227,227,Retak Diagonal,40,227,103,227,0.588106,0.726872,0.823789,0.546256
2,D (100).jpg,227,227,Retak Diagonal,1,227,72,227,0.502203,0.65859,0.995595,0.682819
3,D (11).jpg,227,227,Retak Diagonal,1,227,56,227,0.502203,0.623348,0.995595,0.753304
4,D (12).jpg,227,227,Retak Diagonal,1,227,1,227,0.502203,0.502203,0.995595,0.995595


In [194]:
images = df['filename'].unique()

In [195]:
len(images)

298

In [196]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images


In [197]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [198]:
len(img_train), len(img_test)

(238, 60)

In [199]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [200]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,D (10).jpg,227,227,Retak Diagonal,40,227,103,227,0.588106,0.726872,0.823789,0.546256
4,D (12).jpg,227,227,Retak Diagonal,1,227,1,227,0.502203,0.502203,0.995595,0.995595
5,D (13).jpg,227,227,Retak Diagonal,1,227,71,227,0.502203,0.656388,0.995595,0.687225
6,D (14).jpg,227,227,Retak Diagonal,7,158,1,227,0.363436,0.502203,0.665198,0.995595
8,D (16).jpg,227,227,Retak Diagonal,34,227,16,227,0.57489,0.535242,0.85022,0.929515


In [206]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,D (1).jpg,227,227,Retak Diagonal,1,227,1,175,0.502203,0.387665,0.995595,0.76652,0
2,D (100).jpg,227,227,Retak Diagonal,1,227,72,227,0.502203,0.65859,0.995595,0.682819,0
3,D (11).jpg,227,227,Retak Diagonal,1,227,56,227,0.502203,0.623348,0.995595,0.753304,0
7,D (15).jpg,227,227,Retak Diagonal,1,227,30,130,0.502203,0.352423,0.995595,0.440529,0
11,D (19).jpg,227,227,Retak Vertikal,28,151,1,227,0.394273,0.502203,0.54185,0.995595,1


In [207]:
# Label encoding
def label_encoding(x):
    labels = {'Retak Diagonal':0,'Retak Vertikal':1,'Retak Horizontal':2}
    return labels[x]

In [213]:
train_df.loc[:, ['id', 'name']] = train_df[['id', 'name']]
test_df.loc[:, ['id', 'name']] = test_df[['id', 'name']]

In [214]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,D (10).jpg,227,227,Retak Diagonal,40,227,103,227,0.588106,0.726872,0.823789,0.546256,0
4,D (12).jpg,227,227,Retak Diagonal,1,227,1,227,0.502203,0.502203,0.995595,0.995595,0
5,D (13).jpg,227,227,Retak Diagonal,1,227,71,227,0.502203,0.656388,0.995595,0.687225,0
6,D (14).jpg,227,227,Retak Diagonal,7,158,1,227,0.363436,0.502203,0.665198,0.995595,0
8,D (16).jpg,227,227,Retak Diagonal,34,227,16,227,0.57489,0.535242,0.85022,0.929515,0
9,D (17).jpg,227,227,Retak Diagonal,15,226,1,156,0.530837,0.345815,0.929515,0.682819,0
10,D (18).jpg,227,227,Retak Vertikal,23,125,1,227,0.325991,0.502203,0.449339,0.995595,1
12,D (2).jpg,227,227,Retak Diagonal,1,194,10,227,0.429515,0.522026,0.85022,0.955947,0
13,D (20).jpg,227,227,Retak Diagonal,43,127,1,227,0.374449,0.502203,0.370044,0.995595,0
15,D (22).jpg,227,227,Retak Diagonal,1,222,115,227,0.491189,0.753304,0.973568,0.493392,0


In [215]:
import os
from shutil import move

In [216]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'
    
os.mkdir(train_folder)
os.mkdir(test_folder)

In [217]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [223]:
#groupby_obj_train.get_group('D (10).jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
#save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    #move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst)
    
    #save the labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [227]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [230]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
233    None
234    None
235    None
236    None
237    None
Length: 238, dtype: object

In [232]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

FileNotFoundError: [Errno 2] No such file or directory: 'data_images\\D (1).jpg'