In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

### step-1: get path of each xml file

In [37]:
xmlfiles = glob('./dataset/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [38]:
xmlfiles

['./dataset/001.xml',
 './dataset/002.xml',
 './dataset/003.xml',
 './dataset/004.xml',
 './dataset/005.xml',
 './dataset/006.xml',
 './dataset/007.xml',
 './dataset/008.xml',
 './dataset/009.xml',
 './dataset/010.xml',
 './dataset/011.xml',
 './dataset/012.xml',
 './dataset/013.xml',
 './dataset/014.xml',
 './dataset/015.xml',
 './dataset/016.xml',
 './dataset/017.xml',
 './dataset/018.xml',
 './dataset/019.xml',
 './dataset/020.xml',
 './dataset/021.xml',
 './dataset/022.xml',
 './dataset/023.xml',
 './dataset/024.xml',
 './dataset/025.xml',
 './dataset/026.xml',
 './dataset/027.xml',
 './dataset/028.xml',
 './dataset/029.xml',
 './dataset/030.xml',
 './dataset/031.xml',
 './dataset/032.xml',
 './dataset/033.xml',
 './dataset/034.xml',
 './dataset/035.xml',
 './dataset/036.xml',
 './dataset/037.xml',
 './dataset/038.xml',
 './dataset/039.xml',
 './dataset/040.xml',
 './dataset/041.xml',
 './dataset/042.xml',
 './dataset/043.xml',
 './dataset/044.xml',
 './dataset/045.xml',
 './datase

In [39]:
print(len(xmlfiles))

555


### step-2: read xml files
##### * from each xml file we need to extract
##### * filename, size(width, height), object(name, xmin, xmax, ymin, ymax)

In [9]:
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [10]:
parser_all = list(map(extract_text,xmlfiles))

In [11]:
data = reduce(lambda x, y : x+y,parser_all)

In [12]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [13]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,001.jpg,2048,1536,gun,244,1797,48,1465
1,002.jpg,1459,1094,gun,76,1432,167,1016
2,003.jpg,800,603,gun,17,786,76,593
3,004.jpg,1014,722,gun,101,950,124,656
4,005.jpg,1000,669,gun,165,724,68,493


In [18]:
df.shape

(761, 8)

In [19]:
df['name'].value_counts()

name
person    333
knife     216
gun       212
Name: count, dtype: int64

## Conversion
![conversionIMG.png](conversionIMG.png)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  761 non-null    object
 1   width     761 non-null    object
 2   height    761 non-null    object
 3   name      761 non-null    object
 4   xmin      761 non-null    object
 5   xmax      761 non-null    object
 6   ymin      761 non-null    object
 7   ymax      761 non-null    object
dtypes: object(8)
memory usage: 47.7+ KB


In [21]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  761 non-null    object
 1   width     761 non-null    int64 
 2   height    761 non-null    int64 
 3   name      761 non-null    object
 4   xmin      761 non-null    int64 
 5   xmax      761 non-null    int64 
 6   ymin      761 non-null    int64 
 7   ymax      761 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 47.7+ KB


In [22]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [23]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,001.jpg,2048,1536,gun,244,1797,48,1465,0.498291,0.492513,0.758301,0.922526
1,002.jpg,1459,1094,gun,76,1432,167,1016,0.516792,0.540676,0.929404,0.776051
2,003.jpg,800,603,gun,17,786,76,593,0.501875,0.554726,0.96125,0.85738
3,004.jpg,1014,722,gun,101,950,124,656,0.518245,0.540166,0.837278,0.736842
4,005.jpg,1000,669,gun,165,724,68,493,0.4445,0.419283,0.559,0.635277


### split data into train and test

In [25]:
images = df['filename'].unique()

In [26]:
len(images)

555

In [27]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [28]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [29]:
len(img_train), len(img_test)

(444, 111)

In [30]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [31]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,001.jpg,2048,1536,gun,244,1797,48,1465,0.498291,0.492513,0.758301,0.922526
1,002.jpg,1459,1094,gun,76,1432,167,1016,0.516792,0.540676,0.929404,0.776051
3,004.jpg,1014,722,gun,101,950,124,656,0.518245,0.540166,0.837278,0.736842
4,005.jpg,1000,669,gun,165,724,68,493,0.4445,0.419283,0.559,0.635277
5,005.jpg,1000,669,person,346,986,119,669,0.666,0.588939,0.64,0.822123


In [32]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
2,003.jpg,800,603,gun,17,786,76,593,0.501875,0.554726,0.96125,0.85738
13,012.jpg,2048,1362,gun,6,2025,81,1081,0.49585,0.426579,0.98584,0.734214
16,014.jpg,1024,768,gun,29,1001,86,660,0.50293,0.485677,0.949219,0.747396
23,019.jpg,2047,1280,gun,17,1984,72,1215,0.488764,0.502734,0.960918,0.892969
29,025.jpg,2048,1365,gun,64,1900,132,1360,0.479492,0.54652,0.896484,0.899634


### Assign id number to object names

In [33]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'knife':1, 'gun':2}
    return labels[x]

In [34]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [35]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,001.jpg,2048,1536,gun,244,1797,48,1465,0.498291,0.492513,0.758301,0.922526,2
1,002.jpg,1459,1094,gun,76,1432,167,1016,0.516792,0.540676,0.929404,0.776051,2
3,004.jpg,1014,722,gun,101,950,124,656,0.518245,0.540166,0.837278,0.736842,2
4,005.jpg,1000,669,gun,165,724,68,493,0.4445,0.419283,0.559,0.635277,2
5,005.jpg,1000,669,person,346,986,119,669,0.666,0.588939,0.64,0.822123,0
6,006.jpg,800,600,gun,29,768,156,521,0.498125,0.564167,0.92375,0.608333,2
7,007.jpg,1600,1200,gun,4,1575,161,1063,0.493437,0.51,0.981875,0.751667,2
8,008.jpg,2048,1536,gun,141,1716,99,1333,0.453369,0.466146,0.769043,0.803385,2
9,009.jpg,800,600,gun,115,745,148,401,0.5375,0.4575,0.7875,0.421667,2
10,009.jpg,800,600,person,14,488,8,600,0.31375,0.506667,0.5925,0.986667,0


In [36]:
test_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
2,003.jpg,800,603,gun,17,786,76,593,0.501875,0.554726,0.96125,0.85738,2
13,012.jpg,2048,1362,gun,6,2025,81,1081,0.49585,0.426579,0.98584,0.734214,2
16,014.jpg,1024,768,gun,29,1001,86,660,0.50293,0.485677,0.949219,0.747396,2
23,019.jpg,2047,1280,gun,17,1984,72,1215,0.488764,0.502734,0.960918,0.892969,2
29,025.jpg,2048,1365,gun,64,1900,132,1360,0.479492,0.54652,0.896484,0.899634,2
30,026.jpg,479,313,gun,21,474,24,270,0.516701,0.469649,0.94572,0.785942,2
54,040.jpg,1024,768,gun,107,925,87,686,0.503906,0.503255,0.798828,0.779948,2
56,042.jpg,1000,750,gun,46,960,66,659,0.503,0.483333,0.914,0.790667,2
67,051.jpg,2048,1364,gun,20,1992,127,1073,0.491211,0.439883,0.962891,0.693548,2
71,053.jpg,2048,1373,gun,54,2001,18,1318,0.501709,0.486526,0.950684,0.946832,2


### Save Image and Labels in text

In [40]:
import os
from shutil import move

In [41]:
train_folder = 'dataset/train'
test_folder = 'dataset/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [42]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

##### save each image in train/test folder and repective labels in .txt

In [44]:
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('dataset',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)

In [45]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [46]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
439    None
440    None
441    None
442    None
443    None
Length: 444, dtype: object

In [47]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [48]:
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
106    None
107    None
108    None
109    None
110    None
Length: 111, dtype: object