In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
xml_list=glob('Annotations_XML/*.xml')
xml_list=list(map(lambda x: x .replace('\\','/'),xml_list))

In [3]:
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    # Extracting data from the XML
    image_name = root.find('filename').text
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text  
    objs = root.findall('object')
    parser = []
    
    for obj in objs:  # Iterate over each object in objs
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
    
    return parser


In [4]:
parser_all=list(map(extract_text,xml_list))

In [5]:
data=reduce(lambda x,y:x+y,parser_all)

In [6]:
df=pd.DataFrame(data,columns=['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [7]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,dayClip1--00000.jpg,1280,960,go,702,706,351,356
1,dayClip1--00000.jpg,1280,960,go,850,854,405,409
2,dayClip1--00001.jpg,1280,960,go,702,706,351,355
3,dayClip1--00001.jpg,1280,960,go,851,855,404,408
4,dayClip1--00002.jpg,1280,960,go,702,706,349,354


In [8]:
df.shape

(100764, 8)

In [9]:
df['name'].value_counts()

go         52379
stop       45565
Name: name, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100764 entries, 0 to 100763
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   filename  100764 non-null  object
 1   width     100764 non-null  object
 2   height    100764 non-null  object
 3   name      100764 non-null  object
 4   xmin      100764 non-null  object
 5   xmax      100764 non-null  object
 6   ymin      100764 non-null  object
 7   ymax      100764 non-null  object
dtypes: object(8)
memory usage: 6.2+ MB


In [11]:
cols=['width','height','xmin','xmax','ymin','ymax']
df[cols]=df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100764 entries, 0 to 100763
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   filename  100764 non-null  object
 1   width     100764 non-null  int32 
 2   height    100764 non-null  int32 
 3   name      100764 non-null  object
 4   xmin      100764 non-null  int32 
 5   xmax      100764 non-null  int32 
 6   ymin      100764 non-null  int32 
 7   ymax      100764 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 3.8+ MB


In [12]:
df['center_x']=((df['xmax']+df['xmin'])/2)/df['width']
df['center_y']=((df['ymax']+df['ymin'])/2)/df['height']
df['w']=(df['xmax']-df['xmin'])/df['width']
df['h']=(df['ymax']-df['ymin'])/df['height']

In [13]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,dayClip1--00000.jpg,1280,960,go,702,706,351,356,0.55,0.368229,0.003125,0.005208
1,dayClip1--00000.jpg,1280,960,go,850,854,405,409,0.665625,0.423958,0.003125,0.004167
2,dayClip1--00001.jpg,1280,960,go,702,706,351,355,0.55,0.367708,0.003125,0.004167
3,dayClip1--00001.jpg,1280,960,go,851,855,404,408,0.666406,0.422917,0.003125,0.004167
4,dayClip1--00002.jpg,1280,960,go,702,706,349,354,0.55,0.366146,0.003125,0.005208


In [14]:
images=df['filename'].unique()

In [15]:
len(images)

36215

In [16]:
img_df=pd.DataFrame(images,columns=['filename'])
img_train=tuple(img_df.sample(frac=0.8)['filename'])

In [17]:
len(img_train)

28972

In [18]:
img_test=tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [19]:
len(img_test)

7243

In [20]:
train_df=df.query(f'filename in{img_train}')
test_df=df.query(f'filename in{img_test}')

In [21]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,dayClip1--00000.jpg,1280,960,go,702,706,351,356,0.55,0.368229,0.003125,0.005208
1,dayClip1--00000.jpg,1280,960,go,850,854,405,409,0.665625,0.423958,0.003125,0.004167
2,dayClip1--00001.jpg,1280,960,go,702,706,351,355,0.55,0.367708,0.003125,0.004167
3,dayClip1--00001.jpg,1280,960,go,851,855,404,408,0.666406,0.422917,0.003125,0.004167
4,dayClip1--00002.jpg,1280,960,go,702,706,349,354,0.55,0.366146,0.003125,0.005208


In [22]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
20,dayClip1--00010.jpg,1280,960,go,703,708,345,349,0.551172,0.361458,0.003906,0.004167
21,dayClip1--00010.jpg,1280,960,go,866,870,403,408,0.678125,0.422396,0.003125,0.005208
50,dayClip1--00025.jpg,1280,960,go,718,723,331,336,0.562891,0.347396,0.003906,0.005208
51,dayClip1--00025.jpg,1280,960,go,907,912,400,404,0.710547,0.41875,0.003906,0.004167
70,dayClip1--00035.jpg,1280,960,go,724,730,319,325,0.567969,0.335417,0.004687,0.00625


In [23]:
def label_encoding(x):
    labels={'go':0, 'stop':1, 'warning':2}
    return labels[x]

In [24]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [25]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,dayClip1--00000.jpg,1280,960,go,702,706,351,356,0.55,0.368229,0.003125,0.005208,0
1,dayClip1--00000.jpg,1280,960,go,850,854,405,409,0.665625,0.423958,0.003125,0.004167,0
2,dayClip1--00001.jpg,1280,960,go,702,706,351,355,0.55,0.367708,0.003125,0.004167,0
3,dayClip1--00001.jpg,1280,960,go,851,855,404,408,0.666406,0.422917,0.003125,0.004167,0
4,dayClip1--00002.jpg,1280,960,go,702,706,349,354,0.55,0.366146,0.003125,0.005208,0
5,dayClip1--00002.jpg,1280,960,go,852,856,403,407,0.667188,0.421875,0.003125,0.004167,0
6,dayClip1--00003.jpg,1280,960,go,702,706,347,352,0.55,0.364063,0.003125,0.005208,0
7,dayClip1--00003.jpg,1280,960,go,854,858,401,406,0.66875,0.420312,0.003125,0.005208,0
8,dayClip1--00004.jpg,1280,960,go,702,707,346,351,0.550391,0.363021,0.003906,0.005208,0
9,dayClip1--00004.jpg,1280,960,go,856,860,401,406,0.670312,0.420312,0.003125,0.005208,0


In [26]:
from shutil import move

In [27]:
train_folder='tt2/train'
test_folder='tt2/test'
os.mkdir(train_folder)
os.mkdir(test_folder)

In [28]:
cols=['filename','id','center_x','center_y','w','h']
groupby_obj_train=train_df[cols].groupby('filename')
groupby_obj_test=test_df[cols].groupby('filename')

In [29]:
import os
from shutil import copy

def save_data(filename, folder_path, group_obj):
    src = os.path.join('Images_sorted', filename)
    dst = os.path.join(folder_path, filename)
    
    # Copy the file instead of moving it
    copy(src, dst)
    
    # Create the text file in the destination folder
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
    
    # Corrected 'sep' parameter
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)


In [30]:
filename_series=pd.Series(groupby_obj_train.groups.keys())

In [31]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
28967    None
28968    None
28969    None
28970    None
28971    None
Length: 28972, dtype: object

In [32]:
filename_series_test=pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
7238    None
7239    None
7240    None
7241    None
7242    None
Length: 7243, dtype: object

In [40]:
import yaml

# Define the data for the YAML file
data = {
    'train': 'tt/train',
    'test': 'tt/test',
    'nc': 3,
    'names': ['go', 'stop', 'warning']
}

# Specify the YAML file path
yaml_file = 'tt/config.yaml'

# Write the data to the YAML file
with open(yaml_file, 'w') as file:
    yaml.dump(data, file)

print(f"YAML file created at {yaml_file}")


YAML file created at tt/config.yaml
