In [2]:
import os
import pandas as pd
from glob import glob
# to use the reduce function for the img processing 
from functools import reduce
# To read and perform operation on xml file
from xml.etree import ElementTree as et

In [3]:
#Load all xml file and store it in a list
xml_list=glob('./data_images/*.xml')

In [4]:
xml_list

['./data_images\\01.xml',
 './data_images\\02.xml',
 './data_images\\03.xml',
 './data_images\\04.xml',
 './data_images\\05.xml']

In [5]:
# Data Cleaning Part
# 1. Replace \\ with /
xmlfiles=list(map(lambda x: x.replace('\\','/'),xml_list))

In [6]:
xmlfiles

['./data_images/01.xml',
 './data_images/02.xml',
 './data_images/03.xml',
 './data_images/04.xml',
 './data_images/05.xml']

In [7]:
# Step 2: Read XML file to extract 
# 1. filename
# 2. size(width,height)
# 3. object(name,xmin,xmax,ymin,ymax)
# Try for the single image
def extract_text(filename):
    tree=et.parse(filename)
    root=tree.getroot()
    
    #Extract filename
    image_name=root.find('filename').text
    # width and height of the image
    width=root.find('size').find('width').text
    height=root.find('size').find('height').text
    objs = root.findall('object')
    parser=[]
    for obj in objs:
        name=obj.find('name').text
        bndbox=obj.find('bndbox')
        xmin=bndbox.find('xmin').text
        xmax=bndbox.find('xmax').text
        ymin=bndbox.find('ymin').text
        ymax=bndbox.find('ymax').text
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
    
    
    return parser


In [8]:
parser_all=list(map(extract_text,xmlfiles))

In [9]:
parser_all

[[['01.jpg', '1024', '684', 'man', '98', '350', '150', '530'],
  ['01.jpg', '1024', '684', 'man', '321', '658', '94', '684'],
  ['01.jpg', '1024', '684', 'women', '666', '896', '116', '663'],
  ['01.jpg', '1024', '684', 'women', '904', '1024', '56', '539']],
 [['02.jpg', '2048', '1661', 'car', '110', '1892', '171', '1576'],
  ['02.jpg', '2048', '1661', 'man', '1703', '1808', '325', '540'],
  ['02.jpg', '2048', '1661', 'car', '1844', '1972', '350', '530']],
 [['03.jpg', '640', '480', 'car', '21', '635', '103', '392']],
 [['04.jpg', '2048', '1536', 'car', '21', '2033', '898', '1465']],
 [['05.jpg', '2048', '1301', 'car', '299', '1703', '591', '1137']]]

In [10]:
# we had got the result in multidimentional format so to convert it  into plane form we do following operation
data=reduce(lambda x,y:x+y,parser_all)

In [11]:
data

[['01.jpg', '1024', '684', 'man', '98', '350', '150', '530'],
 ['01.jpg', '1024', '684', 'man', '321', '658', '94', '684'],
 ['01.jpg', '1024', '684', 'women', '666', '896', '116', '663'],
 ['01.jpg', '1024', '684', 'women', '904', '1024', '56', '539'],
 ['02.jpg', '2048', '1661', 'car', '110', '1892', '171', '1576'],
 ['02.jpg', '2048', '1661', 'man', '1703', '1808', '325', '540'],
 ['02.jpg', '2048', '1661', 'car', '1844', '1972', '350', '530'],
 ['03.jpg', '640', '480', 'car', '21', '635', '103', '392'],
 ['04.jpg', '2048', '1536', 'car', '21', '2033', '898', '1465'],
 ['05.jpg', '2048', '1301', 'car', '299', '1703', '591', '1137']]

In [12]:
# Convert this into a dataframe
df=pd.DataFrame(data,columns=['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [13]:
df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,01.jpg,1024,684,man,98,350,150,530
1,01.jpg,1024,684,man,321,658,94,684
2,01.jpg,1024,684,women,666,896,116,663
3,01.jpg,1024,684,women,904,1024,56,539
4,02.jpg,2048,1661,car,110,1892,171,1576
5,02.jpg,2048,1661,man,1703,1808,325,540
6,02.jpg,2048,1661,car,1844,1972,350,530
7,03.jpg,640,480,car,21,635,103,392
8,04.jpg,2048,1536,car,21,2033,898,1465
9,05.jpg,2048,1301,car,299,1703,591,1137


In [14]:
df=df.drop_duplicates()

In [15]:
df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,01.jpg,1024,684,man,98,350,150,530
1,01.jpg,1024,684,man,321,658,94,684
2,01.jpg,1024,684,women,666,896,116,663
3,01.jpg,1024,684,women,904,1024,56,539
4,02.jpg,2048,1661,car,110,1892,171,1576
5,02.jpg,2048,1661,man,1703,1808,325,540
6,02.jpg,2048,1661,car,1844,1972,350,530
7,03.jpg,640,480,car,21,635,103,392
8,04.jpg,2048,1536,car,21,2033,898,1465
9,05.jpg,2048,1301,car,299,1703,591,1137


In [16]:
df['name'].value_counts()

name
car      5
man      3
women    2
Name: count, dtype: int64

# Prepare Labels for YOLO model

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  10 non-null     object
 1   width     10 non-null     object
 2   height    10 non-null     object
 3   name      10 non-null     object
 4   xmin      10 non-null     object
 5   xmax      10 non-null     object
 6   ymin      10 non-null     object
 7   ymax      10 non-null     object
dtypes: object(8)
memory usage: 768.0+ bytes


In [18]:
# As we requiire the width,height,xmin,ymin,xmax,ymax as a either integer or float so we will convert it 
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  10 non-null     object
 1   width     10 non-null     int32 
 2   height    10 non-null     int32 
 3   name      10 non-null     object
 4   xmin      10 non-null     int32 
 5   xmax      10 non-null     int32 
 6   ymin      10 non-null     int32 
 7   ymax      10 non-null     int32 
dtypes: int32(6), object(2)
memory usage: 528.0+ bytes


In [19]:
# So next we will find out the require parameters 
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [20]:
df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1024,684,man,98,350,150,530,0.21875,0.497076,0.246094,0.555556
1,01.jpg,1024,684,man,321,658,94,684,0.478027,0.568713,0.329102,0.862573
2,01.jpg,1024,684,women,666,896,116,663,0.762695,0.569444,0.224609,0.799708
3,01.jpg,1024,684,women,904,1024,56,539,0.941406,0.434942,0.117188,0.70614
4,02.jpg,2048,1661,car,110,1892,171,1576,0.48877,0.525888,0.870117,0.845876
5,02.jpg,2048,1661,man,1703,1808,325,540,0.857178,0.260385,0.05127,0.12944
6,02.jpg,2048,1661,car,1844,1972,350,530,0.931641,0.264901,0.0625,0.108368
7,03.jpg,640,480,car,21,635,103,392,0.5125,0.515625,0.959375,0.602083
8,04.jpg,2048,1536,car,21,2033,898,1465,0.501465,0.769206,0.982422,0.369141
9,05.jpg,2048,1301,car,299,1703,591,1137,0.48877,0.664105,0.685547,0.419677


# Split the data into train and test data

In [21]:
images = df['filename'].unique()

In [22]:
len(images)

5

In [23]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [24]:
img_train

('05.jpg', '03.jpg', '04.jpg', '01.jpg')

In [25]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [26]:
img_test

('02.jpg',)

In [27]:
len(img_train), len(img_test)

(4, 1)

In [29]:
train_df=df.query(f'filename in {img_train}')
test_df=df.query(f'filename in {img_test}')

In [30]:
train_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1024,684,man,98,350,150,530,0.21875,0.497076,0.246094,0.555556
1,01.jpg,1024,684,man,321,658,94,684,0.478027,0.568713,0.329102,0.862573
2,01.jpg,1024,684,women,666,896,116,663,0.762695,0.569444,0.224609,0.799708
3,01.jpg,1024,684,women,904,1024,56,539,0.941406,0.434942,0.117188,0.70614
7,03.jpg,640,480,car,21,635,103,392,0.5125,0.515625,0.959375,0.602083
8,04.jpg,2048,1536,car,21,2033,898,1465,0.501465,0.769206,0.982422,0.369141
9,05.jpg,2048,1301,car,299,1703,591,1137,0.48877,0.664105,0.685547,0.419677


In [31]:
test_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
4,02.jpg,2048,1661,car,110,1892,171,1576,0.48877,0.525888,0.870117,0.845876
5,02.jpg,2048,1661,man,1703,1808,325,540,0.857178,0.260385,0.05127,0.12944
6,02.jpg,2048,1661,car,1844,1972,350,530,0.931641,0.264901,0.0625,0.108368


# Assign an id number to the name column

In [32]:
# we assign the id number because YOLO is a deep learning model and to process the data the model need a numeric data as a 
# Machine Learning model doesn't understand the text .So to convert the text into numeric form this process is called label encoding 



In [40]:
# label encoding
def label_encoding(x):
    labels = {'man':0, 'car':1, 'women':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
       'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
       'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [41]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [42]:
train_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,01.jpg,1024,684,man,98,350,150,530,0.21875,0.497076,0.246094,0.555556,0
1,01.jpg,1024,684,man,321,658,94,684,0.478027,0.568713,0.329102,0.862573,0
2,01.jpg,1024,684,women,666,896,116,663,0.762695,0.569444,0.224609,0.799708,2
3,01.jpg,1024,684,women,904,1024,56,539,0.941406,0.434942,0.117188,0.70614,2
7,03.jpg,640,480,car,21,635,103,392,0.5125,0.515625,0.959375,0.602083,1
8,04.jpg,2048,1536,car,21,2033,898,1465,0.501465,0.769206,0.982422,0.369141,1
9,05.jpg,2048,1301,car,299,1703,591,1137,0.48877,0.664105,0.685547,0.419677,1


In [43]:
test_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
4,02.jpg,2048,1661,car,110,1892,171,1576,0.48877,0.525888,0.870117,0.845876,1
5,02.jpg,2048,1661,man,1703,1808,325,540,0.857178,0.260385,0.05127,0.12944,0
6,02.jpg,2048,1661,car,1844,1972,350,530,0.931641,0.264901,0.0625,0.108368,1


# Save Images and labels in a text

In [44]:
import os
from shutil import move # It is used to handle the xml files

In [45]:
# To save the train and test data we are going to create two separate folders that contain the test_df and train_df images
train_folder = 'data_images/train'
test_folder = 'data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [46]:
# From the dataframe we require only few parameters so we group them and store in a separate variable 
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [47]:
# Next we are going to save the information of same image in a separate txt file
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [48]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [49]:
filename_series

0    01.jpg
1    03.jpg
2    04.jpg
3    05.jpg
dtype: object

In [50]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0    None
1    None
2    None
3    None
dtype: object

In [51]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0    None
dtype: object