In [54]:
import os
import re
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd

All tags:
{'link', 'plate_number_image_url', 'plate_id', 'plate_title', 'tags', 'plate_region', 'fon_id', 'fon_title', 'model', 'photo_url', 'plate_number', 'country', 'model2', 'car'}

In [55]:
car_pattern = r"([a-zA-Z0-9а-яА-Я]+)"

In [56]:
TAGS = [
    'car', 'model', 'model2', 'photo_url', 'image_path', 'plate_number',
    'plate_number_image_url',  'plate_id', 'plate_title', 'plate_region', 'tags','fon_id', 'fon_title',  'country', 'link',
]

In [57]:
def get_folder_name(generation: str) -> str:
    """
    Generation name looks like a mess. This func will extract only numeric info
    :param generation:
    :return: number or string like "2nd"
    """
    if generation is None: return '0'

    match = re.search(car_pattern, generation)
    if match:
        return match.group(1)

    try:
        int(generation)
        return generation
    except ValueError:
        gen = generation.split()[0]
        if gen[-1] in ',:.;':
            gen = gen[:-1]
        return gen

In [58]:
def get_car_image_name(url: str) -> str:
    """
    Function to get unique name for image. All url are unique (as long as pictures are different), so why don't we use that
    """
    name = url.split('/')[-1]
    # name = name.split('.')[0]
    return name

In [59]:
def remove_slash_and_other_trash(model:str) -> str:
    """
    We don't need lots of nested folders!
    """
    if model:
        return model.replace("/", "_&_").replace(":", "-")
    else:
        return '0'

In [60]:
def parse_xml(xml_files_path: str, save_result_to: str):
    """ Function to parse all XML files in folder and create all folders according to model names"""
    save_result_to += "/"
    xml_files = os.listdir(xml_files_path)
    os.makedirs(save_result_to, exist_ok=True)
    data_rows = []
    for file_n, file_path in enumerate(xml_files[:]):
        with open(xml_files_path + file_path, 'r', encoding='utf-8') as file:
            tree = ET.parse(file)
        root = tree.getroot()

        # Access elements and attributes in the XML file
        for n, child in enumerate(root[:]):

            data = {}

            car = child.find("car").text
            if not car or len(car) < 2:
                continue
            photo_url = child.find("photo_url").text

            image_name = get_car_image_name(photo_url)

            model = child.find("model").text
            model2 = child.find("model2").text



            path_tags = [car, model, model2]

            for tag_n, tag in enumerate(path_tags[:], start=0):
                tag = get_folder_name(tag)
                path_tags[tag_n] = remove_slash_and_other_trash(tag)


            for tag in TAGS:
                if tag != 'image_path':
                    data[tag] = child.find(tag).text
                else:
                    data[tag] = '/'.join(path_tags) + '/' + image_name


            data_rows.append(data)



# print(f"File {file_n} is processed successfully.")

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data_rows)

    # Sort the DataFrame by the first two columns
    df.sort_values(by=[TAGS[0], TAGS[1], TAGS[2]], inplace=True)

    # Save the sorted data to a CSV file
    df.to_csv(save_result_to + 'car_labels_from_XMLs.csv', index=False)
    return


In [61]:
parse_xml(xml_files_path="../50k/", save_result_to="../labels_of_parsed_XML/")

In [62]:
pd.read_csv("../labels_of_parsed_XML/car_labels_from_XMLs.csv", na_values=' ')

Unnamed: 0,car,model,model2,photo_url,image_path,plate_number,plate_number_image_url,plate_id,plate_title,plate_region,tags,fon_id,fon_title,country,link
0,ABG Titan,0,0,https://img03.platesmania.com/230502/o/2145242...,ABG/0/0/21452423.jpg,CA-4 8974,https://img03.platesmania.com/230502/inf/21452...,7,Special vehicles (2004),Grodno Oblast,,2,Two-row plate,belarus,https://platesmania.com/by/nomer21452423
1,ABM,Volcan,0,https://img03.platesmania.com/230226/o/2096495...,ABM/Volcan/0/20964955.jpg,AB 2884,https://img03.platesmania.com/230226/inf/20964...,3,Motorcycles (2014),0,motorcycle,2,Two-row plate,georgia,https://platesmania.com/ge/nomer20964955
2,ABM,Volcan,0,http://img03.platesmania.com/190808/o/13282911...,ABM/Volcan/0/13282911.jpg,AB 2635,http://img03.platesmania.com/190808/inf/132829...,3,Motorcycles (2014),0,motorcycle,2,Two-row plate,georgia,https://platesmania.com/ge/nomer13282911
3,AC,Cobra,0,https://img03.platesmania.com/230207/o/2084763...,AC/Cobra/0/20847630.jpg,HH 064455,https://img03.platesmania.com/230207/inf/20847...,8,Trade plates (06th),Hanseatic City of Hamburg,cabriolet | oldtimer,2,Two-row plate,germany,https://platesmania.com/de/nomer20847630
4,AC,Cobra,0,https://img03.platesmania.com/230313/o/2107214...,AC/Cobra/0/21072141.jpg,BG BM-8,https://img03.platesmania.com/230313/inf/21072...,4,Vanity Plates,Belgrade,cabriolet | oldtimer,1,Single-row plate,serbia,https://platesmania.com/rs/nomer21072141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45314,ČZ,Čezeta,0,https://img03.platesmania.com/230611/o/2178016...,Z/ezeta/0/21780160.jpg,HO 60-10,https://img03.platesmania.com/230611/inf/21780...,8,Motorcycles (1960),Hodonín District,oldtimer | motorcycle,10,back (two-row trapezoidal),czechrepublic,https://platesmania.com/cz/nomer21780160
45315,ČZ,Čezeta,0,https://img03.platesmania.com/230611/o/2178000...,Z/ezeta/0/21780003.jpg,SO1 38-89,https://img03.platesmania.com/230611/inf/21780...,8,Motorcycles (1960),Sokolov District,oldtimer | motorcycle,10,back (two-row trapezoidal),czechrepublic,https://platesmania.com/cz/nomer21780003
45316,ČZ,Čezeta,0,https://img03.platesmania.com/230322/o/2113222...,Z/ezeta/0/21132223.jpg,CH 00-28,https://img03.platesmania.com/230322/inf/21132...,8,Motorcycles (1960),Cheb District,motorcycle,9,back (two-row rectangular),czechrepublic,https://platesmania.com/cz/nomer21132223
45317,ČZ,Čezeta,0,http://img03.platesmania.com/210110/o/16002861...,Z/ezeta/0/16002861.jpg,TC 2252,http://img03.platesmania.com/210110/inf/160028...,5,Motorcycles,0,oldtimer | motorcycle,4,Two-row plate with flag,latvia,https://platesmania.com/lv/nomer16002861
