# <center>Collection Formation with Genera</center>
## <center>Second Approach to collection formation</center>

In [17]:
import pandas as pd

In [18]:
# Load the training metadata table; the bare last expression reports (rows, columns).
df = pd.read_csv(filepath_or_buffer='plantCLFdataset-train.csv')
df.shape

(91758, 6)

In [19]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench
1,5848.xml,5554,Flower,Linaceae,Linum,Linum usitatissimum L.
2,78710.xml,12084,Entire,Plantaginaceae,Globularia,Globularia alypum L.
3,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.
4,68503.xml,9663,Flower,Orchidaceae,Ophrys,Ophrys aranifera Huds.


## Keep only Leaf, LeafScan and Flower images from the dataset

In [20]:
# Keep only rows whose Content is a flower or one of the two leaf image types,
# and renumber the rows 0..n-1 so the index can serve as a row key later on.
df_LeafandFlower = (
    df
    .loc[df['Content'].isin(['Flower', 'Leaf', 'LeafScan'])]
    .reset_index(drop=True)
)
df_LeafandFlower.shape

(54197, 6)

## Create collections, each with one Leaf and one Flower of the same genus

In [21]:
# Row label -> collection id. Every row starts at 0, which the later cells
# treat as "not assigned to any collection".
collection_dict = dict.fromkeys(df_LeafandFlower.index, 0)

In [22]:
# Build the collections: within every genus, pair each sampled flower image
# with one sampled leaf image (Leaf or LeafScan) and give both rows the same
# positive collection id in `collection_dict`. Rows that are never paired keep
# the value they already had (0 from the initialisation cell).
RANDOM_SEED = 42  # fixed seed so a fresh "Run All" reproduces the same pairing

collection_counter = 0
# groupby(sort=False) visits genera in order of first appearance, like
# Genus.unique(), but splits the frame once instead of re-filtering per genus.
for genus, df_genus in df_LeafandFlower.groupby('Genus', sort=False):
    is_flower = df_genus['Content'] == 'Flower'
    flower_df = df_genus[is_flower]
    leaf_df = df_genus[~is_flower]  # everything that is not a flower: Leaf and LeafScan
    # A collection needs one image of each kind, so the scarcer kind sets the count.
    cut_off = min(len(flower_df), len(leaf_df))
    if cut_off == 0:
        continue
    # Different seeds for the two draws so flowers and leaves are not picked
    # at the same positions when both groups happen to have the same size.
    flower_df_sel_index = flower_df.sample(n=cut_off, random_state=RANDOM_SEED).index
    leaf_df_sel_index = leaf_df.sample(n=cut_off, random_state=RANDOM_SEED + 1).index
    for flower_row, leaf_row in zip(flower_df_sel_index, leaf_df_sel_index):
        collection_counter += 1
        collection_dict[flower_row] = collection_counter
        collection_dict[leaf_row] = collection_counter

In [23]:
# Copy the row labels into an ordinary column named 'index' so they can be
# looked up in collection_dict, then display the frame.
row_labels = df_LeafandFlower.index.values
df_LeafandFlower['index'] = row_labels
df_LeafandFlower

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species,index
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench,0
1,5848.xml,5554,Flower,Linaceae,Linum,Linum usitatissimum L.,1
2,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.,2
3,68503.xml,9663,Flower,Orchidaceae,Ophrys,Ophrys aranifera Huds.,3
4,59906.xml,14317,Leaf,Fagaceae,Quercus,Quercus suber L.,4
...,...,...,...,...,...,...,...
54192,78709.xml,3850,Flower,Asteraceae,Berardia,Berardia lanuginosa (Lam.) Fiori,54192
54193,5851.xml,41224,Leaf,Anacardiaceae,Pistacia,Pistacia lentiscus L.,54193
54194,40296.xml,30438,Flower,Orchidaceae,Neottia,Neottia nidus-avis (L.) Rich.,54194
54195,36300.xml,9076,LeafScan,Betulaceae,Betula,Betula pendula Roth,54195


In [24]:
# Translate each row label into its collection id (0 = not in any collection)
# and store the result as a new column.
collection_ids = df_LeafandFlower['index'].map(collection_dict)
df_LeafandFlower['CollectionId'] = collection_ids
df_LeafandFlower

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species,index,CollectionId
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench,0,3
1,5848.xml,5554,Flower,Linaceae,Linum,Linum usitatissimum L.,1,0
2,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.,2,10
3,68503.xml,9663,Flower,Orchidaceae,Ophrys,Ophrys aranifera Huds.,3,0
4,59906.xml,14317,Leaf,Fagaceae,Quercus,Quercus suber L.,4,0
...,...,...,...,...,...,...,...,...
54192,78709.xml,3850,Flower,Asteraceae,Berardia,Berardia lanuginosa (Lam.) Fiori,54192,10957
54193,5851.xml,41224,Leaf,Anacardiaceae,Pistacia,Pistacia lentiscus L.,54193,0
54194,40296.xml,30438,Flower,Orchidaceae,Neottia,Neottia nidus-avis (L.) Rich.,54194,0
54195,36300.xml,9076,LeafScan,Betulaceae,Betula,Betula pendula Roth,54195,0


## Filter out images that were not assigned to a collection

In [25]:
# Keep only rows that were assigned to a collection (CollectionId other than 0)
# and drop the helper 'index' column, which is no longer needed.
df_collection = (
    df_LeafandFlower
    .loc[df_LeafandFlower['CollectionId'].ne(0)]
    .drop(columns=['index'])
)
df_collection

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species,CollectionId
0,46826.xml,7354,Flower,Caryophyllaceae,Myosoton,Myosoton aquaticum (L.) Moench,3
2,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.,10
7,37007.xml,18801,Flower,Convolvulaceae,Convolvulus,Convolvulus arvensis L.,242
10,51382.xml,29563,LeafScan,Platanaceae,Platanus,Platanus x hispanica Mill. ex Münchh.,419
11,105943.xml,22639,Flower,Fagaceae,Quercus,Quercus coccifera L.,107
...,...,...,...,...,...,...,...
54186,66537.xml,16787,Flower,Asteraceae,Bidens,Bidens pilosa L.,10398
54189,48812.xml,12306,Flower,Fabaceae,Medicago,Medicago sativa L.,8500
54190,76724.xml,27033,Flower,Cornaceae,Cornus,Cornus mas L.,4813
54192,78709.xml,3850,Flower,Asteraceae,Berardia,Berardia lanuginosa (Lam.) Fiori,10957


In [26]:
# Count how many rows share each collection id.
df_collection['CollectionId'].value_counts()

2049     2
1322     2
9486     2
5400     2
7449     2
        ..
613      2
6758     2
4711     2
10864    2
2047     2
Name: CollectionId, Length: 11093, dtype: int64

## Filter out genera with fewer than 20 collections (40 images)

In [27]:
# Keep only genera that contribute at least 20 collections, i.e. 40 images,
# since every collection holds exactly two images of the same genus.
MIN_IMAGES_PER_GENUS = 40

# Number of images in each row's genus. One vectorised lookup replaces the
# per-genus filter-and-drop loop, which copied the frame once per genus.
images_per_genus = df_collection['Genus'].map(df_collection['Genus'].value_counts())

# Written as "not too small" rather than ">= threshold" so that a row with a
# missing genus (count is NaN) is kept, exactly as the loop version kept it.
df_collection_filtered = df_collection[~(images_per_genus < MIN_IMAGES_PER_GENUS)].copy()
print(df_collection_filtered.shape)

(17338, 7)


In [28]:
# Images per remaining genus, largest first.
df_collection_filtered['Genus'].value_counts()

Prunus        764
Euphorbia     622
Geranium      470
Acer          464
Crataegus     366
             ... 
Orchis         40
Reynoutria     40
Anthriscus     40
Oenothera      40
Knautia        40
Name: Genus, Length: 158, dtype: int64

In [29]:
# Spot-check one collection: show the rows that share collection id 10.
df_collection_filtered.query('CollectionId == 10')

Unnamed: 0,FileName,ObservationId,Content,Family,Genus,Species,CollectionId
2,31476.xml,25185,LeafScan,Betulaceae,Carpinus,Carpinus betulus L.,10
53830,100177.xml,2476,Flower,Betulaceae,Carpinus,Carpinus betulus L.,10


In [30]:
# Export the genus-filtered collections. The original cell wrote
# `df_collection`, which left the "at least 20 collections per genus" filter
# computed above unused in the saved file.
# NOTE(review): if the unfiltered table is what downstream code expects,
# switch this back to `df_collection`.
df_collection_filtered.to_csv("plantCLFcollection_genus.csv", index=False)