In [1]:
import pandas as pd
import duckdb

print("Pandas version:", pd.__version__)
print("DuckDB version:", duckdb.__version__)

Pandas version: 2.3.3
DuckDB version: 1.4.2


# Dataset Occurrence u observaciones

In [126]:
import duckdb
import pandas as pd

con = duckdb.connect(database=':memory:')

# Traer los primeros 5 registros a Pandas
df = con.execute("""
    SELECT *
    FROM read_csv_auto('~/Downloads/gbif_folder/Occurrence.txt')
    LIMIT 5
""").fetchdf()  # o .df() también funciona
df.head()

# Ahora df es un DataFrame de Pandas

Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,verbatimEventDate,fieldNotes,behavior,sex,lifeStage,preparations,references,Associated Taxa,rightsHolder,license
0,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9,HumanObservation,Wildlife sounds - Birds,,,Synallaxis,azarae,media,...,12-08-2002,two birds trip:http://www.cs.bris.ac.uk/home/p...,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
1,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9,HumanObservation,Wildlife sounds - Birds,,,Synallaxis,azarae,media,...,12-08-2002,two birds trip:http://www.cs.bris.ac.uk/home/p...,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
2,99@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC99,HumanObservation,Wildlife sounds - Birds,,,Turdus,hauxwelli,,...,01-10-2003,trip:http://www.cs.bris.ac.uk/home/planque/Peru/,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
3,99@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC99,HumanObservation,Wildlife sounds - Birds,,,Turdus,hauxwelli,,...,01-10-2003,trip:http://www.cs.bris.ac.uk/home/planque/Peru/,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
4,9999@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9999,HumanObservation,Wildlife sounds - Birds,,,Myrmothera,campanisona,signata,...,04-03-1998,,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,has background sounds: Selenidera reinwardtii,Allen T. Chartier,CC BY-NC


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     5 non-null      object        
 1   occurrenceID           5 non-null      object        
 2   catalogNumber          5 non-null      object        
 3   basisOfRecord          5 non-null      object        
 4   collectionCode         5 non-null      object        
 5   dynamicProperties      0 non-null      object        
 6   otherCatalogNumbers    0 non-null      object        
 7   genus                  5 non-null      object        
 8   specificEpithet        5 non-null      object        
 9   infraspecificEpithet   3 non-null      object        
 10  scientificName         5 non-null      object        
 11  taxonRank              5 non-null      object        
 12  kingdom                5 non-null      object        
 13  family   

In [127]:
# Paises con mayor observacion de aves

import duckdb

con = duckdb.connect(database=':memory:')

# Contar ocurrencias por país y ordenar de mayor a menor
df_country_counts = con.execute("""
    SELECT country, COUNT(*) AS num_records
    FROM read_csv_auto('~/Downloads/gbif_folder/Occurrence.txt')
    GROUP BY country
    ORDER BY num_records DESC
""").fetchdf()  # o .df()

# Mostrar los primeros países con más registros
print(df_country_counts.head(20))

           country  num_records
0    United States       136821
1           Brazil       129488
2   United Kingdom       127166
3           France       113395
4         Colombia        68952
5          Ecuador        63606
6            Spain        62638
7          Germany        60480
8           Sweden        47175
9           Mexico        43797
10          Poland        39300
11     Netherlands        35448
12           China        34261
13           India        33301
14            Peru        31302
15       Australia        30171
16    South Africa        28304
17       Argentina        26280
18       Indonesia        22773
19        Malaysia        22524


In [132]:
# Aproximación geográfica de Europa con latitud y longitud
min_lat, max_lat = 35.0, 71.0
min_lon, max_lon = -10.0, 40.0

df_europe_geo = con.execute(f"""
    SELECT *
    FROM read_csv_auto('~/Downloads/gbif_folder/Occurrence.txt')
    WHERE latitudeDecimal BETWEEN {min_lat} AND {max_lat}
      AND longitudeDecimal BETWEEN {min_lon} AND {max_lon}
""").fetchdf()

print("Número de filas:", len(df_europe_geo))
print("Número de columnas:", df_europe_geo.shape[1])

Número de filas: 627360
Número de columnas: 37


In [133]:
#Eliminar Duplicados 

df_dups = df_europe_geo[df_europe_geo.duplicated()]
print("Duplicados encontrados:", len(df_dups))



Duplicados encontrados: 299203


In [134]:
df_europe_clean = df_europe_geo.drop_duplicates()
print("Filas sin duplicados:", len(df_europe_clean))

Filas sin duplicados: 328157


In [135]:
df_europe_clean.head(1)

Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,verbatimEventDate,fieldNotes,behavior,sex,lifeStage,preparations,references,Associated Taxa,rightsHolder,license
0,999990@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999990,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Olympus LS-P5 Linear PCM R...",,Phoenicurus,ochruros,gibraltariensis,...,2025-05-20,animal seen:yes; playback used:no,song,male,adult,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,has background sounds: Coloeus monedula|Passer...,Kelle Moreau,CC BY-NC


In [136]:
pd.set_option('display.max_rows', None)
df_europe_clean['scientificName'].value_counts()

scientificName
Mystery mystery                                                                       11304
Parus major                                                                            6032
Turdus merula                                                                          5998
Erithacus rubecula                                                                     5464
Fringilla coelebs                                                                      4977
Phylloscopus collybita                                                                 4480
Sylvia atricapilla                                                                     4435
Turdus philomelos                                                                      4300
Troglodytes troglodytes                                                                3588
Cyanistes caeruleus                                                                    3283
Strix aluco                                                      

In [137]:
import pandas as pd

# 1. Contar registros por scientificName
counts = df_europe_clean['scientificName'].value_counts()

# 2. Filtrar solo los que tienen más de 500
names_gt_500 = counts[counts > 500].index.tolist()

# 3. Crear un dataframe NUEVO solo con esas especies
df_europe_500 = df_europe_clean[df_europe_clean['scientificName'].isin(names_gt_500)].copy()

print("Especies seleccionadas:", len(names_gt_500))
print("Filas en el nuevo dataframe:", len(df_europe_500))
df_europe_500.head(1)

Especies seleccionadas: 181
Filas en el nuevo dataframe: 261954


Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,verbatimEventDate,fieldNotes,behavior,sex,lifeStage,preparations,references,Associated Taxa,rightsHolder,license
1,999980@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999980,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Tascam DR-05X""}",,Saxicola,rubicola,,...,2025-05-08,A stonechat making some sounds on the vegetati...,uncertain,undetermined,uncertain,in the hand,https://data.biodiversitydata.nl/xeno-canto/ob...,,Colm Philpott,CC BY-NC


# Datos de dataset Multimedia

In [72]:
import duckdb

con = duckdb.connect(database=':memory:')

# Traer los primeros 5 registros a Pandas
dfm = con.execute("""
    SELECT *
    FROM read_csv_auto('~/Downloads/gbif_folder/Multimedia.txt')
    LIMIT 5
""").fetchdf()  # o .df() también funciona
dfm.head(2)

# Ahora df es un DataFrame de Pandas

Unnamed: 0,CoreId,associatedObservationReference,Identifier,type,Rating,rightsHolder,creator,accessURI,format,variantLiteral,description,caption,resourceCreationTechnique,captureDevice,physicalSetting,license
0,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Oscillogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0
1,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Oscillogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0


In [73]:
dfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   CoreId                          5 non-null      object
 1   associatedObservationReference  5 non-null      object
 2   Identifier                      5 non-null      object
 3   type                            5 non-null      object
 4   Rating                          1 non-null      Int64 
 5   rightsHolder                    5 non-null      object
 6   creator                         5 non-null      object
 7   accessURI                       5 non-null      object
 8   format                          5 non-null      object
 9   variantLiteral                  5 non-null      object
 10  description                     1 non-null      object
 11  caption                         4 non-null      object
 12  resourceCreationTechnique       1 non-null      object

#Se une el dataset filtrado con 500 nombres de especies con el archivo multimedia

In [138]:
import pandas as pd

# Cargar el dataset de Multimedia
df_multimedia = pd.read_csv('~/Downloads/gbif_folder/Multimedia.txt', sep=',')

# Hacer el merge con df_europe_500
df_merged = pd.merge(
    df_europe_500,
    df_multimedia,
    left_on='id',       # id de df_europe_500
    right_on='CoreId',  # CoreId de Multimedia
    how='left'          # left join para mantener todas las filas de df_europe_500plus
)

# Verificar
print("Filas del dataset df_europe_500:", len(df_europe_500))
print("Filas del dataset df_merged después del merge:", len(df_merged))
df_merged.head(1)


Filas del dataset df_europe_500: 261954
Filas del dataset df_merged después del merge: 1507992


Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,creator,accessURI,format,variantLiteral,description,caption,resourceCreationTechnique,captureDevice,physicalSetting,license_y
0,999980@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999980,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Tascam DR-05X""}",,Saxicola,rubicola,,...,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/FIMCRHL...,image/png,ac:MediumQuality,,Oscillogram of the first ten seconds of the so...,,,,CC BY-NC-SA 4.0


In [139]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1507992 entries, 0 to 1507991
Data columns (total 53 columns):
 #   Column                          Non-Null Count    Dtype         
---  ------                          --------------    -----         
 0   id                              1507992 non-null  object        
 1   occurrenceID                    1507992 non-null  object        
 2   catalogNumber                   1507992 non-null  object        
 3   basisOfRecord                   1507992 non-null  object        
 4   collectionCode                  1507992 non-null  object        
 5   dynamicProperties               387926 non-null   object        
 6   otherCatalogNumbers             1977 non-null     object        
 7   genus                           1507992 non-null  object        
 8   specificEpithet                 1507992 non-null  object        
 9   infraspecificEpithet            122471 non-null   object        
 10  scientificName                  1507992 no

In [140]:
# Filtrar filas donde format sea 'audio/mp3' o 'audio/wav'
df_filtrado = df_merged[df_merged['format'].isin(['audio/mp3', 'audio/wav'])].copy()

# Verificar
print("Filas con audio:", len(df_filtrado))
df_filtrado.head(1)

Filas con audio: 501557


Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,creator,accessURI,format,variantLiteral,description,caption,resourceCreationTechnique,captureDevice,physicalSetting,license_y
2,999980@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999980,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Tascam DR-05X""}",,Saxicola,rubicola,,...,Colm Philpott,https://xeno-canto.org/sounds/uploaded/FIMCRHL...,audio/wav,ac:BestQuality,128 s,,automatic recording: no; bitrate: 1411200 bps;...,Tascam DR-05X |,Artificial,CC BY-NC-SA 4.0


In [141]:
df_filtrado['scientificName'].value_counts()
df_filtrado['scientificName'].nunique()
# son 181 scientificname 

181

In [86]:
df_filtrado['country'].value_counts()

country
United Kingdom        105010
France                 85982
Germany                53245
Spain                  41445
Sweden                 37801
Poland                 31908
Netherlands            28707
Belgium                18177
Ireland                16062
Italy                  13532
Portugal               11549
Finland                 8021
Norway                  7882
Estonia                 7154
Austria                 5135
Switzerland             5020
Denmark                 4940
Ukraine                 4453
Croatia                 2286
Turkey                  1712
Slovakia                1428
Bulgaria                1357
Romania                 1191
Hungary                 1133
Czech Republic          1087
Greece                  1024
Latvia                   980
Belarus                  818
Russian Federation       740
Slovenia                 375
Tunisia                  321
Serbia                   230
Lithuania                213
Montenegro               138
Luxemb

In [144]:

#Eliminar paises no europeos como Tunisia, Aleria y Morocco

# Lista de países no europeos a eliminar
exclude_countries = ['Tunisia', 'Algeria', 'Morocco']

# Crear nuevo dataframe filtrado solo con países europeos
df_filtrado2 = df_filtrado[~df_filtrado['country'].isin(exclude_countries)].copy()

# Verificar
print(df_filtrado2['country'].value_counts())

country
United Kingdom        105010
France                 85982
Germany                53245
Spain                  41445
Sweden                 37801
Poland                 31908
Netherlands            28707
Belgium                18177
Ireland                16062
Italy                  13532
Portugal               11549
Finland                 8021
Norway                  7882
Estonia                 7154
Austria                 5135
Switzerland             5020
Denmark                 4940
Ukraine                 4453
Croatia                 2286
Turkey                  1712
Slovakia                1428
Bulgaria                1357
Romania                 1191
Hungary                 1133
Czech Republic          1087
Greece                  1024
Latvia                   980
Belarus                  818
Russian Federation       740
Slovenia                 375
Serbia                   230
Lithuania                213
Montenegro               138
Luxembourg               105
Albani

In [147]:
df_filtrado2.shape

(501139, 53)

In [146]:
# Luego de la union de los dos datasets, se verifica los duplicados nuevamente

df_filtrado_sd = df_filtrado2[df_filtrado2.duplicated()]
print("Duplicados encontrados:", len(df_filtrado_sd))

Duplicados encontrados: 221143


In [149]:
# Se eliminan los duplicados

df_europa_limpio = df_filtrado2.drop_duplicates()
print("Filas sin duplicados:", len(df_europa_limpio))
print(df_europa_limpio.shape)

Filas sin duplicados: 279996
(279996, 53)


In [150]:
df_europa_limpio.shape

(279996, 53)

In [151]:
df_europa_limpio['country'].value_counts()

country
United Kingdom        56269
France                48481
Germany               29178
Spain                 23266
Sweden                20808
Poland                18170
Netherlands           15992
Belgium               10173
Ireland               10095
Italy                  7871
Portugal               6452
Finland                4952
Norway                 4785
Estonia                4051
Switzerland            2969
Denmark                2852
Austria                2679
Ukraine                2358
Croatia                1325
Turkey                  916
Slovakia                828
Bulgaria                752
Romania                 678
Czech Republic          649
Hungary                 648
Greece                  573
Latvia                  568
Belarus                 484
Russian Federation      425
Slovenia                227
Lithuania               116
Serbia                  116
Montenegro               72
Luxembourg               58
Albania                  41
Bosnia Herze

In [105]:
df_europa_limpio['scientificName'].value_counts()

scientificName
Mystery mystery                        10358
Turdus merula                           4767
Parus major                             4744
Erithacus rubecula                      4424
Fringilla coelebs                       3891
Phylloscopus collybita                  3695
Turdus philomelos                       3670
Sylvia atricapilla                      3624
Strix aluco                             2921
Cyanistes caeruleus                     2768
Rallus aquaticus                        2489
Troglodytes troglodytes                 2465
Phylloscopus trochilus                  2438
Loxia curvirostra                       2302
Emberiza citrinella                     2291
Turdus iliacus                          2244
Emberiza calandra                       2163
Gallinula chloropus                     2115
Phylloscopus trochilus trochilus        2032
Alauda arvensis                         2022
Curruca communis                        1913
Corvus cornix cornix                    

In [152]:
#Luego del Merge, se vuelve a filtrar las especies con mas de 500 registros

import pandas as pd

# 1. Contar registros por scientificName
counts = df_europa_limpio['scientificName'].value_counts()

# 2. Filtrar solo los que tienen más de 500
names = counts[counts > 500].index.tolist()

# 3. Crear un dataframe NUEVO solo con esas especies
df_final = df_europa_limpio[df_europa_limpio['scientificName'].isin(names)].copy()

print("Especies seleccionadas:", len(names))
print("Filas en el nuevo dataframe:", len(df_final))
df_final.head(1)

Especies seleccionadas: 180
Filas en el nuevo dataframe: 279509


Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,creator,accessURI,format,variantLiteral,description,caption,resourceCreationTechnique,captureDevice,physicalSetting,license_y
2,999980@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999980,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Tascam DR-05X""}",,Saxicola,rubicola,,...,Colm Philpott,https://xeno-canto.org/sounds/uploaded/FIMCRHL...,audio/wav,ac:BestQuality,128 s,,automatic recording: no; bitrate: 1411200 bps;...,Tascam DR-05X |,Artificial,CC BY-NC-SA 4.0


In [159]:
df_final['scientificName'].value_counts() #Lista de especies con mas de 500 archivos de sonidos para entrenar

scientificName
Mystery mystery                        11080
Parus major                             7100
Turdus merula                           6853
Erithacus rubecula                      6101
Fringilla coelebs                       5767
Phylloscopus collybita                  4980
Sylvia atricapilla                      4884
Turdus philomelos                       4704
Troglodytes troglodytes                 4336
Cyanistes caeruleus                     3678
Emberiza calandra                       3141
Strix aluco                             3095
Turdus iliacus                          3086
Phylloscopus trochilus                  3030
Emberiza citrinella                     2832
Rallus aquaticus                        2722
Corvus cornix cornix                    2711
Curruca communis                        2705
Dendrocopos major                       2672
Loxia curvirostra                       2659
Luscinia megarhynchos                   2544
Fulica atra                             

In [160]:
df_final['country'].value_counts()

country
United Kingdom        56211
France                48344
Germany               29124
Spain                 23187
Sweden                20778
Poland                18142
Netherlands           15981
Belgium               10167
Ireland               10087
Italy                  7851
Portugal               6439
Finland                4944
Norway                 4776
Estonia                4050
Switzerland            2964
Denmark                2850
Austria                2679
Ukraine                2353
Croatia                1321
Turkey                  916
Slovakia                828
Bulgaria                752
Romania                 675
Czech Republic          649
Hungary                 645
Greece                  572
Latvia                  568
Belarus                 484
Russian Federation      423
Slovenia                227
Lithuania               116
Serbia                  116
Montenegro               72
Luxembourg               58
Albania                  41
Bosnia Herze

In [155]:
df_final.shape

(279509, 53)

In [157]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 279509 entries, 2 to 1507991
Data columns (total 53 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   id                              279509 non-null  object        
 1   occurrenceID                    279509 non-null  object        
 2   catalogNumber                   279509 non-null  object        
 3   basisOfRecord                   279509 non-null  object        
 4   collectionCode                  279509 non-null  object        
 5   dynamicProperties               74871 non-null   object        
 6   otherCatalogNumbers             358 non-null     object        
 7   genus                           279509 non-null  object        
 8   specificEpithet                 279509 non-null  object        
 9   infraspecificEpithet            21846 non-null   object        
 10  scientificName                  279509 non-null  object     

In [162]:
df_final.to_csv('~/Desktop/df_final.csv', index=False)

Finalmente, el Dataset df_final tiene:

1. Paises de Europa
2. Nombres de Especies con mas de 500 registros y 181 especies
3. Solamente filas con archivos de sonidos mp3 o wav
4. Tiene 279509 y 53 columnas 