In [115]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et
from shutil import move

In [26]:
# Load all the XML files and store in a list
xml_list = glob('Annotations\*.xml')
xml_list

In [45]:
# Data cleaning, replace \\ with /
xml_list = list(map(lambda x: x.replace('\\','/'), xml_list))
xml_list

['Annotations/2007_000027.xml',
 'Annotations/2007_000032.xml',
 'Annotations/2007_000033.xml',
 'Annotations/2007_000039.xml',
 'Annotations/2007_000042.xml',
 'Annotations/2007_000061.xml',
 'Annotations/2007_000063.xml',
 'Annotations/2007_000068.xml',
 'Annotations/2007_000121.xml',
 'Annotations/2007_000123.xml',
 'Annotations/2007_000129.xml',
 'Annotations/2007_000170.xml',
 'Annotations/2007_000175.xml',
 'Annotations/2007_000187.xml',
 'Annotations/2007_000241.xml',
 'Annotations/2007_000243.xml',
 'Annotations/2007_000250.xml',
 'Annotations/2007_000256.xml',
 'Annotations/2007_000272.xml',
 'Annotations/2007_000323.xml',
 'Annotations/2007_000332.xml',
 'Annotations/2007_000333.xml',
 'Annotations/2007_000346.xml',
 'Annotations/2007_000363.xml',
 'Annotations/2007_000364.xml',
 'Annotations/2007_000392.xml',
 'Annotations/2007_000423.xml',
 'Annotations/2007_000452.xml',
 'Annotations/2007_000464.xml',
 'Annotations/2007_000480.xml',
 'Annotations/2007_000491.xml',
 'Annota

In [42]:
# Step - 2
# Read file names
# from each xml file we need to extract
# filename, size(width, height), object(xmin, xmax, ymin, ymax, name)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    # print(root)

    parser = []
    #extract filename
    image_name = root.find('filename').text

    #extract width and height
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text

    #extract xmin, xmax, ymin, ymax
    objects = root.findall('object')
    for obj in objects:
        object_name = obj.find('name').text
        bounding_box = obj.find('bndbox')
        xmin = bounding_box.find('xmin').text
        xmax = bounding_box.find('xmax').text
        ymin = bounding_box.find('ymin').text
        ymax = bounding_box.find('ymax').text
        parser.append([image_name, width, height, object_name, xmin, xmax, ymin, ymax])
    return parser



In [46]:
parser_all = list(map(extract_text, xml_list))

In [47]:
parser_all

[[['2007_000027.jpg', '486', '500', 'person', '174', '349', '101', '351']],
 [['2007_000032.jpg', '500', '281', 'aeroplane', '104', '375', '78', '183'],
  ['2007_000032.jpg', '500', '281', 'aeroplane', '133', '197', '88', '123'],
  ['2007_000032.jpg', '500', '281', 'person', '195', '213', '180', '229'],
  ['2007_000032.jpg', '500', '281', 'person', '26', '44', '189', '238']],
 [['2007_000033.jpg', '500', '366', 'aeroplane', '9', '499', '107', '263'],
  ['2007_000033.jpg', '500', '366', 'aeroplane', '421', '482', '200', '226'],
  ['2007_000033.jpg', '500', '366', 'aeroplane', '325', '411', '188', '223']],
 [['2007_000039.jpg', '500', '375', 'tvmonitor', '156', '344', '89', '279']],
 [['2007_000042.jpg', '500', '335', 'train', '263', '500', '32', '295'],
  ['2007_000042.jpg', '500', '335', 'train', '1', '235', '36', '299']],
 [['2007_000061.jpg', '500', '333', 'boat', '274', '437', '11', '279'],
  ['2007_000061.jpg', '500', '333', 'boat', '184', '281', '214', '252']],
 [['2007_000063.jpg

In [48]:
data = reduce(lambda x, y : x + y, parser_all)
data

[['2007_000027.jpg', '486', '500', 'person', '174', '349', '101', '351'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '104', '375', '78', '183'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '133', '197', '88', '123'],
 ['2007_000032.jpg', '500', '281', 'person', '195', '213', '180', '229'],
 ['2007_000032.jpg', '500', '281', 'person', '26', '44', '189', '238'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '9', '499', '107', '263'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '421', '482', '200', '226'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '325', '411', '188', '223'],
 ['2007_000039.jpg', '500', '375', 'tvmonitor', '156', '344', '89', '279'],
 ['2007_000042.jpg', '500', '335', 'train', '263', '500', '32', '295'],
 ['2007_000042.jpg', '500', '335', 'train', '1', '235', '36', '299'],
 ['2007_000061.jpg', '500', '333', 'boat', '274', '437', '11', '279'],
 ['2007_000061.jpg', '500', '333', 'boat', '184', '281', '214', '252'],
 ['2007_000063.jpg', '500', '375', 'do

In [81]:
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])
df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238
...,...,...,...,...,...,...,...,...
40133,2012_004328.jpg,328,500,person,59,166,220,415
40134,2012_004328.jpg,328,500,person,219,268,226,332
40135,2012_004329.jpg,333,500,person,57,284,88,397
40136,2012_004330.jpg,375,500,person,230,370,133,441


In [50]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238


In [52]:
df.shape

(40138, 8)

In [55]:
df['name'].value_counts()

name
person         17401
chair           3056
car             2492
dog             1598
bottle          1561
cat             1277
bird            1271
pottedplant     1202
sheep           1084
boat            1059
aeroplane       1002
tvmonitor        893
sofa             841
bicycle          837
horse            803
motorbike        801
diningtable      800
cow              771
train            704
bus              685
Name: count, dtype: int64

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  object
 2   height    40138 non-null  object
 3   name      40138 non-null  object
 4   xmin      40138 non-null  object
 5   xmax      40138 non-null  object
 6   ymin      40138 non-null  object
 7   ymax      40138 non-null  object
dtypes: object(8)
memory usage: 2.4+ MB


In [68]:
df.describe()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
count,40138,40138,40138,40138,40138,40138,40138,40138
unique,17125,272,339,20,494,493,460,488
top,2008_007069.jpg,500,375,person,1,500,1,375
freq,56,31806,18343,17401,4863,3727,2812,4578


In [85]:
# Identify the row(s) where 'ymin' column has the problematic value
problem_indices = df[df['ymin'] == '45.70000076293945'].index
# '281.70000076293945'
# Remove the identified row(s) from the DataFrame
df = df.drop(problem_indices)

In [86]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40136 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40136 non-null  object
 1   width     40136 non-null  int32 
 2   height    40136 non-null  int32 
 3   name      40136 non-null  object
 4   xmin      40136 non-null  int32 
 5   xmax      40136 non-null  int32 
 6   ymin      40136 non-null  int32 
 7   ymax      40136 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 1.8+ MB


In [88]:
#center x, center y
#using normalized values between 0 and 1, so we are dividing by width and height
df['center_x'] = ((df['xmax'] + df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax'] + df['ymin'])/2)/df['height']
# width
df['w'] = (df['xmax'] - df['xmin'])/df['width']
# height
df['h'] = (df['ymax'] - df['ymin']) / df['height']

In [89]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


Split Data into Train and Test Sets

In [91]:
images = df['filename'].unique()

In [92]:
len(images)

17123

In [104]:
# 80% for training and 20% for test set
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac = 0.9)['filename']) #shuffle and pick 80% of the images
img_train

('2008_003492.jpg',
 '2011_000954.jpg',
 '2009_001484.jpg',
 '2009_001257.jpg',
 '2008_002983.jpg',
 '2011_000224.jpg',
 '2011_000791.jpg',
 '2011_002949.jpg',
 '2008_001055.jpg',
 '2012_000558.jpg',
 '2010_005192.jpg',
 '2010_002413.jpg',
 '2011_007164.jpg',
 '2008_004834.jpg',
 '2010_002935.jpg',
 '2011_003300.jpg',
 '2008_007594.jpg',
 '2012_001845.jpg',
 '2011_005006.jpg',
 '2010_006762.jpg',
 '2008_003497.jpg',
 '2012_002904.jpg',
 '2011_000192.jpg',
 '2010_006773.jpg',
 '2010_005592.jpg',
 '2009_000058.jpg',
 '2011_005572.jpg',
 '2008_007501.jpg',
 '2010_002113.jpg',
 '2007_006317.jpg',
 '2008_007470.jpg',
 '2010_001293.jpg',
 '2010_006635.jpg',
 '2012_003325.jpg',
 '2009_001673.jpg',
 '2010_005206.jpg',
 '2007_002234.jpg',
 '2009_004150.jpg',
 '2008_000647.jpg',
 '2009_005079.jpg',
 '2007_003207.jpg',
 '2009_002607.jpg',
 '2008_005818.jpg',
 '2008_005959.jpg',
 '2011_002536.jpg',
 '2008_003157.jpg',
 '2008_005504.jpg',
 '2008_000364.jpg',
 '2010_000103.jpg',
 '2011_005227.jpg',


In [105]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])
img_test

('2007_000250.jpg',
 '2007_000346.jpg',
 '2007_000392.jpg',
 '2007_000491.jpg',
 '2007_000783.jpg',
 '2007_000837.jpg',
 '2007_001175.jpg',
 '2007_001239.jpg',
 '2007_001457.jpg',
 '2007_001602.jpg',
 '2007_001678.jpg',
 '2007_001698.jpg',
 '2007_001724.jpg',
 '2007_002268.jpg',
 '2007_002426.jpg',
 '2007_002639.jpg',
 '2007_002648.jpg',
 '2007_002668.jpg',
 '2007_002895.jpg',
 '2007_003134.jpg',
 '2007_003201.jpg',
 '2007_003530.jpg',
 '2007_003541.jpg',
 '2007_003848.jpg',
 '2007_003889.jpg',
 '2007_004092.jpg',
 '2007_004193.jpg',
 '2007_004238.jpg',
 '2007_004275.jpg',
 '2007_004281.jpg',
 '2007_004405.jpg',
 '2007_004423.jpg',
 '2007_004483.jpg',
 '2007_004500.jpg',
 '2007_004722.jpg',
 '2007_004841.jpg',
 '2007_005149.jpg',
 '2007_005227.jpg',
 '2007_005310.jpg',
 '2007_005331.jpg',
 '2007_005845.jpg',
 '2007_005857.jpg',
 '2007_005902.jpg',
 '2007_006035.jpg',
 '2007_006046.jpg',
 '2007_006240.jpg',
 '2007_006373.jpg',
 '2007_006559.jpg',
 '2007_006614.jpg',
 '2007_007048.jpg',


In [106]:
print(len(img_train), len(img_test))

15411 1712


In [108]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [109]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


In [110]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
37,2007_000250.jpg,500,375,diningtable,1,474,170,375,0.475,0.726667,0.946,0.546667
38,2007_000250.jpg,500,375,bottle,97,150,124,297,0.247,0.561333,0.106,0.461333
45,2007_000346.jpg,500,375,bottle,124,230,107,343,0.354,0.6,0.212,0.629333
46,2007_000346.jpg,500,375,person,137,497,78,375,0.634,0.604,0.72,0.792
47,2007_000346.jpg,500,375,person,89,129,202,247,0.218,0.598667,0.08,0.12


### Assign Id numbers to Object Names
person         
chair           
car             
dog             
bottle          
cat             
bird            
pottedplant     
sheep           
boat            
aeroplane       
tvmonitor        
sofa             
bicycle          
horse            
motorbike        
diningtable      
cow              
train            
bus              

In [112]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6, 'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13, 'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [113]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [114]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5,0
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665,16
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555,16
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377,0
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623,16
6,2007_000033.jpg,500,366,aeroplane,421,482,200,226,0.903,0.581967,0.122,0.071038,16
7,2007_000033.jpg,500,366,aeroplane,325,411,188,223,0.736,0.561475,0.172,0.095628,16
8,2007_000039.jpg,500,375,tvmonitor,156,344,89,279,0.5,0.490667,0.376,0.506667,13
9,2007_000042.jpg,500,335,train,263,500,32,295,0.763,0.48806,0.474,0.785075,17


### Save Image and Labels in text

In [116]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

In [117]:
os.mkdir(train_folder)
os.mkdir(test_folder)

In [118]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [133]:
# groupby_obj_train.get_group('2007_000033.jpg').set_index('filename').to_csv('sample.txt', index = False, header = False)


None


In [122]:
#save each image in train/test folder and respective label in .txt
def save_data(filename, folder_path, group_obj):
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)
    
    #save the Labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep = " ", index = False, header = False)

In [129]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [135]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
15406    None
15407    None
15408    None
15409    None
15410    None
Length: 15411, dtype: object

In [137]:
for filename in img_test:
    # print(filename)
    save_data(filename, test_folder, groupby_obj_test)

In [148]:
print('2011_006777.jpg' in img_test)

False
