# <center>Parse XML files of PlantCLEF Dataset</center>

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm

In [2]:
file_list = os.listdir("train-xml")
print("No. of XML files to process:", len(file_list))

No. of XML files to process: 91758


In [3]:
file_list[0]

'46826.xml'

In [4]:
tree = ET.parse(os.path.join("train-xml",file_list[0]))
root = tree.getroot()
for child in root:
    print(child.tag,": ",child.text)

ObservationId :  7354
MediaId :  46826
Vote :  2.5
Content :  Flower
ClassId :  2140
Family :  Caryophyllaceae
Genus :  Myosoton
Species :  Myosoton aquaticum (L.) Moench
Author :  emmanuel stratmains
Date :  2012-8-1
Location :  Allonnes
Latitude :  47.97051
Longitude :  0.16968
YearInCLEF :  PlantCLEF2015
ObservationId2014 :  None
ImageId2014 :  None
LearnTag :  Train


In [5]:
print([item.tag for item in root]) 

['ObservationId', 'MediaId', 'Vote', 'Content', 'ClassId', 'Family', 'Genus', 'Species', 'Author', 'Date', 'Location', 'Latitude', 'Longitude', 'YearInCLEF', 'ObservationId2014', 'ImageId2014', 'LearnTag']


## Adding all the relevant infromation to dataframe

In [8]:
df = pd.DataFrame(columns = ["FileName","ObservationId","Content","Family","Genus","Species"])

for file in tqdm(file_list):
    tree = ET.parse(os.path.join("train-xml",file))
    root = tree.getroot()

    info_dict = {}
    info_dict['FileName'] = file
    for child in root:
        if child.tag == "ObservationId":
            info_dict['ObservationId'] = child.text
        if child.tag == "Content":
            info_dict['Content'] = child.text
        if child.tag == "Family":
            info_dict['Family'] = child.text
        if child.tag == "Genus":
            info_dict['Genus'] = child.text
        if child.tag == "Species":
            info_dict['Species'] = child.text
    df = df.append(info_dict,ignore_index=True)

  0%|          | 0/91758 [00:00<?, ?it/s]

In [9]:
df.head(20)

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench
1,5848.xml,5554,Flower,Linaceae,Linum,Linum usitatissimum L.
2,78710.xml,12084,Entire,Plantaginaceae,Globularia,Globularia alypum L.
3,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.
4,68503.xml,9663,Flower,Orchidaceae,Ophrys,Ophrys aranifera Huds.
5,21665.xml,18122,Fruit,Ranunculaceae,Anemone,Anemone vernalis L.
6,59906.xml,14317,Leaf,Fagaceae,Quercus,Quercus suber L.
7,104485.xml,19800,LeafScan,Fagaceae,Quercus,Quercus petraea Liebl.
8,71052.xml,4017,Entire,Poaceae,Ammophila,Ammophila arenaria (L.) Link
9,5690.xml,3021,Flower,Asteraceae,Inula,Inula conyza DC.


In [10]:
df.to_csv("plantCLFdataset-train.csv",index=False)

In [11]:
df.info

<bound method DataFrame.info of          FileName ObservationId   Content           Family       Genus  \
0       46826.xml          7354    Flower  Caryophyllaceae    Myosoton   
1        5848.xml          5554    Flower         Linaceae       Linum   
2       78710.xml         12084    Entire   Plantaginaceae  Globularia   
3       31476.xml         25185  LeafScan       Betulaceae    Carpinus   
4       68503.xml          9663    Flower      Orchidaceae      Ophrys   
...           ...           ...       ...              ...         ...   
91753   36300.xml          9076  LeafScan       Betulaceae      Betula   
91754   70355.xml          2062    Flower      Sapindaceae        Acer   
91755  105782.xml          1923     Fruit       Betulaceae       Alnus   
91756   50085.xml         34245      Stem       Lythraceae     Lythrum   
91757   39033.xml         38933    Entire     Asparagaceae      Ruscus   

                              Species  
0      Myosoton aquaticum (L.) Moench  

In [12]:
pd.read_csv("plantCLFdataset-train.csv").head(10)

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench
1,5848.xml,5554,Flower,Linaceae,Linum,Linum usitatissimum L.
2,78710.xml,12084,Entire,Plantaginaceae,Globularia,Globularia alypum L.
3,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.
4,68503.xml,9663,Flower,Orchidaceae,Ophrys,Ophrys aranifera Huds.
5,21665.xml,18122,Fruit,Ranunculaceae,Anemone,Anemone vernalis L.
6,59906.xml,14317,Leaf,Fagaceae,Quercus,Quercus suber L.
7,104485.xml,19800,LeafScan,Fagaceae,Quercus,Quercus petraea Liebl.
8,71052.xml,4017,Entire,Poaceae,Ammophila,Ammophila arenaria (L.) Link
9,5690.xml,3021,Flower,Asteraceae,Inula,Inula conyza DC.
