### Agrupar todas las fuentes de imágenes en una sola tabla

In [53]:
import pandas as pd

In [54]:
table_columns = ['taxonid', 'photo_link', 'by']

#### iNaturalist (observaciones en España)

In [55]:
media_inaturalist_spain = pd.read_csv('../../1. Obtención y Procesado de Datos/Ubicaciones y media/Datos iNaturalist/media_processed.csv')
media_inaturalist_spain.columns

Index(['id', 'type', 'format', 'identifier', 'references', 'created',
       'creator', 'publisher', 'license', 'rightsHolder', 'catalogNumber'],
      dtype='object')

In [56]:
observations_inaturalist_spain = pd.read_csv('../../1. Obtención y Procesado de Datos/Ubicaciones y media/Datos iNaturalist/observations_processed.csv')
observations_inaturalist_spain.columns

Index(['id', 'occurrenceID', 'references', 'recordedBy', 'recordedByID',
       'eventDate', 'decimalLatitude', 'decimalLongitude', 'scientificName',
       'taxonRank', 'license', 'rightsHolder', 'sex', 'malla_codigo',
       'idtaxon'],
      dtype='object')

In [57]:
inaturalist_spain_df = media_inaturalist_spain.merge(observations_inaturalist_spain, how='inner', on='id')
inaturalist_spain_df = inaturalist_spain_df.rename(columns={'idtaxon': 'taxonid', 'identifier': 'photo_link', 'rightsHolder_x': 'by'})
inaturalist_spain_df = inaturalist_spain_df[table_columns]
inaturalist_spain_df = inaturalist_spain_df.dropna()
inaturalist_spain_df['taxonid'] = inaturalist_spain_df['taxonid'].astype(int)

In [58]:
inaturalist_spain_df['taxonid'].value_counts()

taxonid
12196    2994
12321    2127
10969    2069
11047    1895
12111    1834
         ... 
73409       1
19817       1
35879       1
35217       1
63562       1
Name: count, Length: 9053, dtype: int64

In [59]:
dfs = []
for taxon in set(inaturalist_spain_df['taxonid']):
    df_taxon = inaturalist_spain_df[inaturalist_spain_df['taxonid'] == taxon]
    dfs.append(df_taxon.sample(min(10, len(df_taxon)), random_state=42))
df = pd.concat(dfs)
len(df)

61289

#### Plantnet, animalia y wikimedia

In [60]:
plantnet_df = pd.read_excel('./Plantnet/plantnet_images.xlsx')[table_columns]
animalia_wikimedia_df = pd.read_excel('./Animalia y Wikimedia/animalia_wikimedia_images.xlsx')[table_columns]
not_inaturalist_df = pd.concat([plantnet_df, animalia_wikimedia_df])
len(not_inaturalist_df)

236

In [61]:
df = pd.concat([df, not_inaturalist_df])
len(df)

61525

#### iNaturalist (observaciones globales)

In [43]:
media_inaturalist_all_world = pd.read_csv('./iNaturalist/Datos iNaturalist/media_selection.csv')
observations_inaturalist_all_world = pd.read_csv('./iNaturalist/Datos iNaturalist/observations_selection_final.csv')

  media_inaturalist_all_world = pd.read_csv('./iNaturalist/Datos iNaturalist/media_selection.csv')
  observations_inaturalist_all_world = pd.read_csv('./iNaturalist/Datos iNaturalist/observations_selection_final.csv')


In [46]:
media_inaturalist_all_world['format'].value_counts()

format
image/jpeg                  3289200
image/png                     41189
audio/mp4                     16239
audio/x-wav                   15510
audio/mpeg                     9535
application/octet-stream       1435
image/gif                       673
image/pjpeg                      89
video/3gpp                        7
Name: count, dtype: int64

In [47]:
media_inaturalist_all_world = media_inaturalist_all_world[media_inaturalist_all_world['format'].isin(['image/jpeg', 'image/png'])]

In [48]:
inaturalist_all_world_df = media_inaturalist_all_world.merge(observations_inaturalist_all_world, how='inner', on='id')
inaturalist_all_world_df = inaturalist_all_world_df.rename(columns={'idtaxon': 'taxonid', 'identifier': 'photo_link', 'rightsHolder_x': 'by'})
inaturalist_all_world_df = inaturalist_all_world_df[table_columns]
inaturalist_all_world_df = inaturalist_all_world_df.dropna()
inaturalist_all_world_df['taxonid'] = inaturalist_all_world_df['taxonid'].astype(int)

In [49]:
inaturalist_all_world_df['taxonid'].value_counts()

taxonid
12000    83977
10534    60588
11176    47519
10489    44220
11608    38707
         ...  
12022        1
8327         1
41821        1
42784        1
39849        1
Name: count, Length: 1816, dtype: int64

In [62]:
dfs = [df]
for taxon in set(inaturalist_all_world_df['taxonid']):
    df_taxon = inaturalist_all_world_df[inaturalist_all_world_df['taxonid'] == taxon]
    missing = len(df[df['taxonid'] == taxon])
    if missing > 0:
        dfs.append(df_taxon.sample(min(10 - missing, len(df_taxon)), random_state=42))
df = pd.concat(dfs)
len(df)

63532

#### Guardamos el dataset final

In [66]:
# Nos quedamos solamente con aquellos taxones que estan en nuestra tabla de taxonomia
taxonomia_df = pd.read_excel('../Taxonomia.xlsx')
df = df[df['taxonid'].isin(taxonomia_df['taxonid'])]
len(df)

10982

In [67]:
df['taxonid'].value_counts()

taxonid
6        10
11442    10
11476    10
11470    10
11453    10
         ..
2243      1
2443      1
2452      1
2457      1
12153     1
Name: count, Length: 1346, dtype: int64

In [None]:
# Cambiar la columna 'by' porque es una palabra reservada en SQL
df = df.rename(columns={'by': 'license_holder'})
df['license_holder'] = df['license_holder'].astype(str)
df.to_excel('../Imagenes.xlsx', index=False)