In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
# load all xml files and store in list
xml_list = glob('./data_images/*.xml')
xml_list

['./data_images/007826.xml',
 './data_images/002786.xml',
 './data_images/006286.xml',
 './data_images/002962.xml',
 './data_images/008297.xml',
 './data_images/009189.xml',
 './data_images/009823.xml',
 './data_images/002976.xml',
 './data_images/002745.xml',
 './data_images/006523.xml',
 './data_images/008268.xml',
 './data_images/004452.xml',
 './data_images/002023.xml',
 './data_images/005980.xml',
 './data_images/004446.xml',
 './data_images/002037.xml',
 './data_images/009162.xml',
 './data_images/006251.xml',
 './data_images/000620.xml',
 './data_images/000146.xml',
 './data_images/007629.xml',
 './data_images/001258.xml',
 './data_images/002751.xml',
 './data_images/002989.xml',
 './data_images/007601.xml',
 './data_images/001270.xml',
 './data_images/002779.xml',
 './data_images/005016.xml',
 './data_images/003301.xml',
 './data_images/006279.xml',
 './data_images/007167.xml',
 './data_images/008254.xml',
 './data_images/000608.xml',
 './data_images/005764.xml',
 './data_image

In [90]:
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    image_name = root.find('filename').text
    width = float(root.find('size').find('width').text)
    height = float(root.find('size').find('height').text)
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        xmin =  obj.find('bndbox').find('xmin').text
        xmax = obj.find('bndbox').find('xmax').text
        ymin =  obj.find('bndbox').find('xmin').text
        ymax = obj.find('bndbox').find('xmin').text

        parser.append([image_name,width,height,name, \
                       xmin,xmax,ymin,ymax])
    return parser






In [91]:
parser_all = list(map(extract_text,xml_list))
data = reduce(lambda x,y : x+y, parser_all)
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
df['w'] = ((df['xmax'] - df['xmin'])/df['width'])
df['h'] = ((df['ymax'] - df['ymin']/df['height']))
df.head()
#df['name'].value_counts()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,007826.jpg,500,375,diningtable,80,320,80,80,0.4,0.213333,0.48,79.786667
1,007826.jpg,500,375,chair,197,257,197,197,0.454,0.525333,0.12,196.474667
2,007826.jpg,500,375,chair,139,185,139,139,0.324,0.370667,0.092,138.629333
3,007826.jpg,500,375,chair,258,312,258,258,0.57,0.688,0.108,257.312
4,007826.jpg,500,375,chair,10,93,10,10,0.103,0.026667,0.166,9.973333


Split Data into train and Test sets

In [92]:
#split data into train and test data sets
images = df['filename'].unique()
img_df = pd.DataFrame(images,columns=['filename'])
img_df.head()
img_train = tuple(img_df.sample(frac=0.8)['filename']) #shuffle and pick 80% of images
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')
len(img_train),len(img_test)
len(train_df),len(test_df)
#test_df.head()


(12453, 3210)

In [93]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,007826.jpg,500,375,diningtable,80,320,80,80,0.4,0.213333,0.48,79.786667
1,007826.jpg,500,375,chair,197,257,197,197,0.454,0.525333,0.12,196.474667
2,007826.jpg,500,375,chair,139,185,139,139,0.324,0.370667,0.092,138.629333
3,007826.jpg,500,375,chair,258,312,258,258,0.57,0.688,0.108,257.312
4,007826.jpg,500,375,chair,10,93,10,10,0.103,0.026667,0.166,9.973333


In [94]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
9,006286.jpg,500,375,person,80,405,80,80,0.485,0.213333,0.65,79.786667
10,006286.jpg,500,375,person,436,475,436,436,0.911,1.162667,0.078,434.837333
11,006286.jpg,500,375,person,381,428,381,381,0.809,1.016,0.094,379.984
12,006286.jpg,500,375,diningtable,402,500,402,402,0.902,1.072,0.196,400.928
13,006286.jpg,500,375,diningtable,347,405,347,347,0.752,0.925333,0.116,346.074667


Label Encoding

In [95]:
#label encoding
def label_encoding(x):
    labels = {'person':0,'car':1,'chair':2,'bottle':3,'pottedplant':4,'bird':5,'dog':6,\
              'sofa':7,'bicycle':8,'horse':9,'boat':10,'motorbike':11,'cat':12,'tvmonitor':13,\
                'cow':14,'sheep':15,'aeroplane':16,'train':17,'diningtable':18,'bus':19}
    return labels[x]

In [96]:
label_encoding(train_df['name'][3])

train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,007826.jpg,500,375,diningtable,80,320,80,80,0.4,0.213333,0.48,79.786667,18
1,007826.jpg,500,375,chair,197,257,197,197,0.454,0.525333,0.12,196.474667,2
2,007826.jpg,500,375,chair,139,185,139,139,0.324,0.370667,0.092,138.629333,2
3,007826.jpg,500,375,chair,258,312,258,258,0.57,0.688,0.108,257.312,2
4,007826.jpg,500,375,chair,10,93,10,10,0.103,0.026667,0.166,9.973333,2


Save image and labels in train and test folders

In [97]:
import os
from shutil import move

train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)


FileExistsError: [Errno 17] File exists: 'data_images/train'

In [114]:
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

groupby_obj_train.get_group('007826.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
groupby_obj_train.get_group('007826.jpg')

Unnamed: 0,filename,id,center_x,center_y,w,h
0,007826.jpg,18,0.4,0.213333,0.48,79.786667
1,007826.jpg,2,0.454,0.525333,0.12,196.474667
2,007826.jpg,2,0.324,0.370667,0.092,138.629333
3,007826.jpg,2,0.57,0.688,0.108,257.312
4,007826.jpg,2,0.103,0.026667,0.166,9.973333
5,007826.jpg,2,0.325,0.218667,0.322,81.781333
6,007826.jpg,2,0.187,0.114667,0.202,42.885333


In [122]:
def save_data(filename, folder_path, group_obj):
    #move image to folder
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    print(src,dst)
    move(src,dst)
    #save labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False, header=False)

In [126]:
#move training images and label info to train folder
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series
train_folder
groupby_obj_train
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

data_images/000001.jpg data_images/train/000001.jpg
data_images/000002.jpg data_images/train/000002.jpg
data_images/000007.jpg data_images/train/000007.jpg
data_images/000009.jpg data_images/train/000009.jpg
data_images/000017.jpg data_images/train/000017.jpg
data_images/000020.jpg data_images/train/000020.jpg
data_images/000023.jpg data_images/train/000023.jpg
data_images/000024.jpg data_images/train/000024.jpg
data_images/000030.jpg data_images/train/000030.jpg
data_images/000032.jpg data_images/train/000032.jpg
data_images/000035.jpg data_images/train/000035.jpg
data_images/000036.jpg data_images/train/000036.jpg
data_images/000041.jpg data_images/train/000041.jpg
data_images/000042.jpg data_images/train/000042.jpg
data_images/000044.jpg data_images/train/000044.jpg
data_images/000046.jpg data_images/train/000046.jpg
data_images/000047.jpg data_images/train/000047.jpg
data_images/000050.jpg data_images/train/000050.jpg
data_images/000051.jpg data_images/train/000051.jpg
data_images/

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [130]:
#move test images and label info to test folder
filename_series = pd.Series(groupby_obj_test.groups.keys())
filename_series.apply(save_data,args=(test_folder,groupby_obj_test))


data_images/000012.jpg data_images/test/000012.jpg
data_images/000016.jpg data_images/test/000016.jpg
data_images/000019.jpg data_images/test/000019.jpg
data_images/000021.jpg data_images/test/000021.jpg
data_images/000026.jpg data_images/test/000026.jpg
data_images/000033.jpg data_images/test/000033.jpg
data_images/000034.jpg data_images/test/000034.jpg
data_images/000039.jpg data_images/test/000039.jpg
data_images/000048.jpg data_images/test/000048.jpg
data_images/000064.jpg data_images/test/000064.jpg
data_images/000066.jpg data_images/test/000066.jpg
data_images/000089.jpg data_images/test/000089.jpg
data_images/000102.jpg data_images/test/000102.jpg
data_images/000104.jpg data_images/test/000104.jpg
data_images/000129.jpg data_images/test/000129.jpg
data_images/000133.jpg data_images/test/000133.jpg
data_images/000158.jpg data_images/test/000158.jpg
data_images/000162.jpg data_images/test/000162.jpg
data_images/000163.jpg data_images/test/000163.jpg
data_images/000170.jpg data_ima

0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object

In [131]:
os.mkdir('annotations')

In [132]:
    #move image to folder
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    print(src,dst)
    move(src,dst)

NameError: name 'folder_path' is not defined