In [2]:
import os
import glob
import pandas as pd
import io
import xml.etree.ElementTree as ET
import argparse
import random

In [3]:
def xml_to_csv(path):
    xml_list = []
    className = os.listdir(path)[1:]
    for i in className:
        for xml_file in glob.glob(path +"/"+i +'/*.xml'):
            tree = ET.parse(xml_file)
            root = tree.getroot()
            filename = root.find('FileName').text
            xmin = root.find('DefectBound').find('X1').text
            ymin = root.find('DefectBound').find('Y1').text
            xmax = root.find('DefectBound').find('X2').text
            ymax = root.find('DefectBound').find('Y2').text
            width = str(int(xmax) - int(xmin))
            height = str(int(ymax) - int(ymin))
            value = (filename,
                    width,
                    height,
                    i,
                    xmin,
                    ymin,
                    xmax,
                    ymax,
                    )
            xml_list.append(value)


    column_name = ['filename', 'width', 'height',
                'className', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

In [4]:
xml = xml_to_csv("東捷科技_20220504_ADC_釋出資料集_origin")

In [5]:
xml

Unnamed: 0,filename,width,height,className,xmin,ymin,xmax,ymax
0,A114D042CA08004_Dft,18,15,I_R,18,461,36,476
1,A114D077AL05001_Dft,6,20,I_R,35,454,41,474
2,A114E00NAN08002_Dft,2,5,I_R,36,473,38,478
3,A114E036CA07001_Dft,5,8,I_R,34,470,39,478
4,A114F00MAG06001_Dft,6,7,I_R,32,470,38,477
...,...,...,...,...,...,...,...,...
2497,A115U076BV02001_Dft,21,20,Udev,33,411,54,431
2498,A115U079CE04001_Dft,21,21,Udev,32,413,53,434
2499,A115U082BQ08001_Dft,23,24,Udev,82,483,105,507
2500,A115U0A5AN03001_Dft,2,3,Udev,149,406,151,409


In [6]:
from collections import namedtuple
def split(df, group):
    data = namedtuple('data', ['filename', 'object'])
    gb = df.groupby(group)
    return [data(filename, gb.get_group(x)) for filename, x in zip(gb.groups.keys(), gb.groups)]
class_Name = split(xml, 'filename')

In [7]:
for name in class_Name:
    print(name.object.className.values[0])
    print(name.filename)
    print(name)
    print("--------")

Udev
A114B068AG01001_Dft
data(filename='A114B068AG01001_Dft', object=                 filename width height className xmin ymin xmax ymax
2002  A114B068AG01001_Dft     6      9      Udev   98  421  104  430)
--------
TP
A114B068AG01002_Dft
data(filename='A114B068AG01002_Dft', object=                 filename width height className xmin ymin xmax ymax
1501  A114B068AG01002_Dft     9     15        TP   18  463   27  478)
--------
TP
A114B068AG01004_Dft
data(filename='A114B068AG01004_Dft', object=                 filename width height className xmin ymin xmax ymax
1502  A114B068AG01004_Dft    15     13        TP    0  483   15  496)
--------
TP
A114B068AG03002_Dft
data(filename='A114B068AG03002_Dft', object=                 filename width height className xmin ymin xmax ymax
1503  A114B068AG03002_Dft     5      3        TP    0  408    5  411)
--------
PP
A114D023AB01001_Dft
data(filename='A114D023AB01001_Dft', object=                filename width height className xmin ymin xmax ymax
500

In [8]:
classNames = [os.path.join("東捷科技_20220504_ADC_釋出資料集_origin", filename) for filename in os.listdir("東捷科技_20220504_ADC_釋出資料集_origin") ]

In [9]:
path = []
for className in classNames:
    for i in os.listdir(className):
        if i[-3:]=="bmp":
            if "bound" not in i:
                path.append([className.split("\\")[-1],os.path.join('{}',i).format(className)])
random.shuffle(path)
len(path)

3003

In [10]:
path[0]

['PP', '東捷科技_20220504_ADC_釋出資料集_origin\\PP\\A114K05ACA08002_Dft.bmp']

In [13]:
import shutil
for file_name in path[:2400]:
    train_path = "train/"+file_name[0]
    if(os.path.isdir(train_path)==False):
        os.makedirs(train_path)
    if os.path.isfile(file_name[1]):
        shutil.copy(file_name[1], train_path)
    xml_file_name = file_name[1].replace(".bmp",".xml")
    if os.path.isfile(xml_file_name):
        shutil.copy(xml_file_name, train_path)

In [11]:
path[2401]

['PP_Pb', '東捷科技_20220504_ADC_釋出資料集_origin\\PP_Pb\\A115K069BC03001_Dft.bmp']

In [14]:
IMAGE_PATHS = []
for file_name in path[2400:]:
    test_path = "test/"+file_name[0]
    if(os.path.isdir(test_path)==False):
        os.makedirs(test_path)
    if os.path.isfile(file_name[1]):
        shutil.copy(file_name[1], test_path)
        IMAGE_PATHS.append(test_path+"/"+file_name[1])
    xml_file_name = file_name[1].replace(".bmp",".xml")
    if os.path.isfile(xml_file_name):
        shutil.copy(xml_file_name, test_path)

In [16]:
IMAGE_PATHS

['test/I_R/東捷科技_20220504_ADC_釋出資料集_origin\\I_R\\A114K03XCK07003_Dft.bmp',
 'test/PP_Pb/東捷科技_20220504_ADC_釋出資料集_origin\\PP_Pb\\A115K069BC03001_Dft.bmp',
 'test/Udev/東捷科技_20220504_ADC_釋出資料集_origin\\Udev\\A115M07ABE04001_Dft.bmp',
 'test/I_R/東捷科技_20220504_ADC_釋出資料集_origin\\I_R\\A114U03YBQ02002_Dft.bmp',
 'test/PP_Pb/東捷科技_20220504_ADC_釋出資料集_origin\\PP_Pb\\A114R057AT02002_Dft.bmp',
 'test/Udev/東捷科技_20220504_ADC_釋出資料集_origin\\Udev\\A114R086BV05001_Dft.bmp',
 'test/F_TP/東捷科技_20220504_ADC_釋出資料集_origin\\F_TP\\A115J011BL01001_Dft.bmp',
 'test/PP_Pb/東捷科技_20220504_ADC_釋出資料集_origin\\PP_Pb\\A1153075BC05001_Dft.bmp',
 'test/F_TP/東捷科技_20220504_ADC_釋出資料集_origin\\F_TP\\A114R06TBP01001_Dft.bmp',
 'test/TP/東捷科技_20220504_ADC_釋出資料集_origin\\TP\\A114E02LCE03004_Dft.bmp',
 'test/TP/東捷科技_20220504_ADC_釋出資料集_origin\\TP\\A114H00QCD05001_Dft.bmp',
 'test/PP_Pb/東捷科技_20220504_ADC_釋出資料集_origin\\PP_Pb\\A115A08QCD07010_Dft.bmp',
 'test/F_TP/東捷科技_20220504_ADC_釋出資料集_origin\\F_TP\\A114U019AQ01001_Dft.bmp',
 'test/F_TP/東捷科技

In [10]:

for i in os.listdir(className[0]):
    if i[-3:]=="bmp":
        if "bound" not in i:
            print(i)

A114H051BM01002_Dft.bmp
A114K00CCG03001_Dft.bmp
A114K01BAJ05001_Dft.bmp
A114K01CBR02001_Dft.bmp
A114K01CBR05001_Dft.bmp
A114K01CCA05001_Dft.bmp
A114K01CCD01002_Dft.bmp
A114K04SAB07001_Dft.bmp
A114K04TBM02007_Dft.bmp
A114K04TBM04003_Dft.bmp
A114K04VAH05002_Dft.bmp
A114K04VAJ01003_Dft.bmp
A114K04VCF01003_Dft.bmp
A114K051CD05002_Dft.bmp
A114K05ACA01001_Dft.bmp
A114K05BAK01002_Dft.bmp
A114K05XCC01001_Dft.bmp
A114K05XCC03001_Dft.bmp
A114K08EAK03004_Dft.bmp
A114K08EAK06001_Dft.bmp
A114K0A4CA03002_Dft.bmp
A114K0A4CA07001_Dft.bmp
A114L011AD07001_Dft.bmp
A114L011AE07001_Dft.bmp
A114L011AG07001_Dft.bmp
A114L011AK01003_Dft.bmp
A114L011AQ02001_Dft.bmp
A114L013AE02001_Dft.bmp
A114L013AM02001_Dft.bmp
A114L013AT01002_Dft.bmp
A114L013AT01003_Dft.bmp
A114L01VCC04002_Dft.bmp
A114L01WCC02001_Dft.bmp
A114L077BA01001_Dft.bmp
A114M02CAF03001_Dft.bmp
A114M02CAH01001_Dft.bmp
A114M046AW01004_Dft.bmp
A114M04HBE04001_Dft.bmp
A114M06LAE08001_Dft.bmp
A114M083BW04001_Dft.bmp
A114M083BW06001_Dft.bmp
A114M083BX08001_

In [None]:
testFileDir = "test" ## test 圖片路徑
IMAGE_PATHS = [os.path.join(testFileDir, filename) for filename in os.listdir(testFileDir) if filename.endswith(".bmp")]
print(IMAGE_PATHS)