In [9]:
import pandas as pd
import duckdb
import os

# Report the library versions in use so the run can be reproduced later.
print(f"Pandas version: {pd.__version__}")
print(f"DuckDB version: {duckdb.__version__}")

Pandas version: 2.3.3
DuckDB version: 1.4.3


# Dataset Occurrence u observaciones

In [4]:
import duckdb
import pandas as pd

# In-memory DuckDB connection used for this preview.
con = duckdb.connect(':memory:')

# Read the remote Occurrence file and bring only its first 5 rows into pandas.
df = con.execute("""
    SELECT *
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1')
    LIMIT 5
""").df()  # .fetchdf() is an equivalent spelling
df.head()

# df is now a pandas DataFrame

Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,verbatimEventDate,fieldNotes,behavior,sex,lifeStage,preparations,references,Associated Taxa,rightsHolder,license
0,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9,HumanObservation,Wildlife sounds - Birds,,,Synallaxis,azarae,media,...,12-08-2002,two birds trip:http://www.cs.bris.ac.uk/home/p...,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
1,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9,HumanObservation,Wildlife sounds - Birds,,,Synallaxis,azarae,media,...,12-08-2002,two birds trip:http://www.cs.bris.ac.uk/home/p...,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
2,99@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC99,HumanObservation,Wildlife sounds - Birds,,,Turdus,hauxwelli,,...,01-10-2003,trip:http://www.cs.bris.ac.uk/home/planque/Peru/,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
3,99@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC99,HumanObservation,Wildlife sounds - Birds,,,Turdus,hauxwelli,,...,01-10-2003,trip:http://www.cs.bris.ac.uk/home/planque/Peru/,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Bob Planqué,CC BY-NC
4,9999@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC9999,HumanObservation,Wildlife sounds - Birds,,,Myrmothera,campanisona,signata,...,04-03-1998,,song,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,has background sounds: Selenidera reinwardtii,Allen T. Chartier,CC BY-NC


In [None]:
# Summarize the 5-row Occurrence preview: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     5 non-null      object        
 1   occurrenceID           5 non-null      object        
 2   catalogNumber          5 non-null      object        
 3   basisOfRecord          5 non-null      object        
 4   collectionCode         5 non-null      object        
 5   dynamicProperties      0 non-null      object        
 6   otherCatalogNumbers    0 non-null      object        
 7   genus                  5 non-null      object        
 8   specificEpithet        5 non-null      object        
 9   infraspecificEpithet   3 non-null      object        
 10  scientificName         5 non-null      object        
 11  taxonRank              5 non-null      object        
 12  kingdom                5 non-null      object        
 13  family   

In [5]:
# Countries with the most bird observations

import duckdb

con = duckdb.connect(':memory:')

# One row per country with its record count, largest count first.
df_country_counts = con.execute("""
    SELECT country, COUNT(*) AS num_records
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1')
    GROUP BY country
    ORDER BY num_records DESC
""").df()  # .fetchdf() is an equivalent spelling

# Show the 20 countries with the most records.
print(df_country_counts.iloc[:20])

           country  num_records
0    United States       136821
1           Brazil       129488
2   United Kingdom       127166
3           France       113395
4         Colombia        68952
5          Ecuador        63606
6            Spain        62638
7          Germany        60480
8           Sweden        47175
9           Mexico        43797
10          Poland        39300
11     Netherlands        35448
12           China        34261
13           India        33301
14            Peru        31302
15       Australia        30171
16    South Africa        28304
17       Argentina        26280
18       Indonesia        22773
19        Malaysia        22524


In [6]:
# Approximate geographic bounding box for Europe, applied to the dataset's
# latitudeDecimal / longitudeDecimal columns.
min_lat, max_lat = 35.0, 71.0
min_lon, max_lon = -10.0, 40.0

# Fetch at most 1000 rows whose coordinates fall inside the bounding box.
df_europe_geo = con.execute(f"""
    SELECT *
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1')
    WHERE latitudeDecimal BETWEEN {min_lat} AND {max_lat}
      AND longitudeDecimal BETWEEN {min_lon} AND {max_lon}
    LIMIT 1000
""").fetchdf()

print("Número de filas:", len(df_europe_geo))
print("Número de columnas:", df_europe_geo.shape[1])

# The preview must be the cell's last expression: a bare expression in the
# middle of a cell is evaluated but never displayed by Jupyter.
df_europe_geo.head()

Número de filas: 1000
Número de columnas: 37


In [7]:
# Approximate geographic bounding box for Europe, this time without a row limit.
min_lat = 35.0
max_lat = 71.0
min_lon = -10.0
max_lon = 40.0

# Every row whose coordinates fall inside the bounding box.
df_europe_geo = con.execute(f"""
    SELECT *
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1')
    WHERE latitudeDecimal BETWEEN {min_lat} AND {max_lat}
      AND longitudeDecimal BETWEEN {min_lon} AND {max_lon}
""").df()

# Report the size of the result (rows, then columns).
print(f"Número de filas: {df_europe_geo.shape[0]}")
print(f"Número de columnas: {df_europe_geo.shape[1]}")

Número de filas: 627360
Número de columnas: 37


In [None]:
# Filter the Occurrence dataset down to records from Spain
import duckdb

# duckdb.connect() with no arguments opens an in-memory database; spelled out here.
con = duckdb.connect(database=':memory:')

# Run the SQL query and materialize the matching rows as a pandas DataFrame.
result = con.execute("""
    SELECT *
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1')
    WHERE country = 'Spain'
""").fetchdf()

result

Unnamed: 0,id,occurrenceID,catalogNumber,basisOfRecord,collectionCode,dynamicProperties,otherCatalogNumbers,genus,specificEpithet,infraspecificEpithet,...,verbatimEventDate,fieldNotes,behavior,sex,lifeStage,preparations,references,Associated Taxa,rightsHolder,license
0,999933@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999933,MachineObservation,Wildlife sounds - Birds,,,Rallus,aquaticus,,...,2025-05-20,animal seen:no; playback used:no,nocturnal flight call,,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Jorge Rodal,CC BY-NC
1,999870@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999870,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Zoom H5"",""microphone"":""Tel...",,Cettia,cetti,,...,2025-05-17,Singing from a patch of brambles.; animal seen...,song,undetermined,uncertain,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,has background sounds: Cuculus canorus|Oriolus...,João Tomás,CC BY-NC
2,999857@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999857,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Zoom h1n"",""microphone"":""2x...",,Curruca,iberiae,,...,2025-04-20,animal seen:yes; playback used:no,"song, call",male,adult,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Jorge Valella Robledo,CC BY-NC
3,999856@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999856,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""Zoom h1n"",""microphone"":""2x...",,Curruca,undata,,...,2025-04-27,animal seen:yes; playback used:no,song,male,,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Jorge Valella Robledo,CC BY-NC
4,999830@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC999830,MachineObservation,Wildlife sounds - Birds,"{""recordingDevice"":""PARABOLIC DISH""}",,Passer,domesticus,,...,2025-05-19,animal seen:no; playback used:no,call,undetermined,uncertain,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Sergi Carreras,CC BY-NC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62633,1000511@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC1000511,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""MIXPRE 6"",""microphone"":""Te...",,Emberiza,calandra,,...,2025-05-19,animal seen:yes; playback used:no,call,undetermined,adult,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Jordi Calvet,CC BY-NC
62634,1000509@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC1000509,HumanObservation,Wildlife sounds - Birds,"{""recordingDevice"":""MIXPRE 6"",""microphone"":""Te...",,Linaria,cannabina,,...,2025-05-19,Perched; animal seen:yes; playback used:no,song,male,adult,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Jordi Calvet,CC BY-NC
62635,1000495@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC1000495,MachineObservation,Wildlife sounds - Birds,"{""recordingDevice"":""PARABOLIC DISH""}",,Hydroprogne,caspia,,...,2025-05-22,NocMig in the Catalan Coast; animal seen:no; p...,nocturnal flight call,undetermined,uncertain,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Sergi Carreras,CC BY-NC
62636,1000268@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,XC1000268,MachineObservation,Wildlife sounds - Birds,"{""recordingDevice"":""PARABOLIC DISH""}",,Mystery,mystery,,...,2025-05-18,NocMig in the Catalan Coast. The second call i...,nocturnal flight call,undetermined,uncertain,field recording,https://data.biodiversitydata.nl/xeno-canto/ob...,,Sergi Carreras,CC BY-NC


# Datos del dataset Multimedia

In [None]:
import duckdb

con = duckdb.connect(database=':memory:')

# Read the remote Multimedia file and bring only its first 5 rows into pandas.
# The URL must sit inside the quoted argument of read_csv_auto; with the quote
# closed before the URL the statement is not valid SQL.
dfm = con.execute("""
    SELECT *
    FROM read_csv_auto('https://www.dropbox.com/scl/fi/z8lpwp3qnvz0r8lsc5qi1/Multimedia.txt?rlkey=yk05peoav2vw5ojbkqsebx9n7&st=q63xenjk&dl=1')
    LIMIT 5
""").fetchdf()  # .df() also works
dfm.head()

# dfm is now a pandas DataFrame

Unnamed: 0,CoreId,associatedObservationReference,Identifier,type,Rating,rightsHolder,creator,accessURI,format,variantLiteral,description,caption,resourceCreationTechnique,captureDevice,physicalSetting,license
0,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Oscillogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0
1,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Oscillogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0
2,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Spectrogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0
3,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,StillImage,,Stichting Xeno-canto voor Natuurgeluiden,Stichting Xeno-canto voor Natuurgeluiden,https://xeno-canto.org/sounds/uploaded/OH38YHK...,image/png,ac:MediumQuality,,Spectrogram of the first ten seconds of the so...,,,,CC BY-NC-SA 3.0
4,9@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,https://xeno-canto.org/sounds/uploaded/OH38YHK...,Sound,5.0,Bob Planqué,Bob Planqué,https://xeno-canto.org/sounds/uploaded/OH38YHK...,audio/mp3,ac:BestQuality,17 s,,automatic recording: no; bitrate: 64000 bps; b...,,Natural,CC BY-NC-SA 3.0


In [None]:
# Summarize the 5-row Multimedia preview: column names, non-null counts and dtypes.
dfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   CoreId                          5 non-null      object
 1   associatedObservationReference  5 non-null      object
 2   Identifier                      5 non-null      object
 3   type                            5 non-null      object
 4   Rating                          1 non-null      Int64 
 5   rightsHolder                    5 non-null      object
 6   creator                         5 non-null      object
 7   accessURI                       5 non-null      object
 8   format                          5 non-null      object
 9   variantLiteral                  5 non-null      object
 10  description                     1 non-null      object
 11  caption                         4 non-null      object
 12  resourceCreationTechnique       1 non-null      object

# Unión de los datasets Occurrence.txt y Multimedia.txt mediante Occurrence.id = Multimedia.CoreId

In [10]:

# In-memory connection
con = duckdb.connect(database=':memory:')

# Source locations. These are HTTPS URLs, not filesystem paths, so they are
# used verbatim: os.path.expanduser only rewrites a leading "~" and would
# leave them unchanged anyway.
occ_path = 'https://www.dropbox.com/scl/fi/i9sshclfbojl8az397ce0/Occurrence.txt?rlkey=daw5f2p78yn1r5bt67vdyk1yv&st=0yc5vcl1&dl=1'
multi_path = 'https://www.dropbox.com/scl/fi/z8lpwp3qnvz0r8lsc5qi1/Multimedia.txt?rlkey=yk05peoav2vw5ojbkqsebx9n7&st=q63xenjk&dl=1'

# Materialize the Occurrence rows for Spain and Germany.
# NOTE: the table keeps its original name "occurrence_es" even though it
# holds both countries.
con.execute(f"""
CREATE TABLE occurrence_es AS
SELECT *
FROM read_csv_auto('{occ_path}', header=True)
WHERE country IN ('Spain', 'Germany')
""")

# Materialize the full Multimedia table.
con.execute(f"""
CREATE TABLE multimedia AS
SELECT *
FROM read_csv_auto('{multi_path}', header=True)
""")

# LEFT JOIN on Occurrence.id = Multimedia.CoreId: every occurrence row is
# kept, repeated once per matching multimedia row, with the four selected
# media columns appended.
query = """
SELECT 
    o.*, 
    m.Identifier, 
    m.type, 
    m.format, 
    m.accessURI
FROM occurrence_es o
LEFT JOIN multimedia m
    ON o.id = m.CoreId
"""

# Run the query and fetch the result as a pandas DataFrame.
df_joined = con.execute(query).fetchdf()

# Preview the first rows.
df_joined.head()

: 

# Limpieza de duplicados y agrupación del dataset filtrado (España y Alemania)

In [None]:
# (rows, columns) of the joined Occurrence + Multimedia DataFrame.
df_joined.shape

(725663, 41)

In [None]:
# Keep only the rows that repeat an earlier row in every column, then count them.
duplicados = df_joined.loc[df_joined.duplicated()]
print(f"Número de filas duplicadas: {duplicados.shape[0]}")

Número de filas duplicadas: 536234


In [None]:
# Remove duplicates from the joined dataset and collapse it to one row per occurrence id

import pandas as pd

# Step 1: rows that repeat an earlier row in every column.
duplicados = df_joined.loc[df_joined.duplicated()]
print(f"Número de filas duplicadas exactas: {duplicados.shape[0]}")

# Step 2: ids that appear on more than one row (keep=False flags every
# occurrence of a repeated id, not just the later ones).
duplicados_id = df_joined.loc[df_joined.duplicated(subset='id', keep=False)]
print(f"Número de IDs con varias filas: {duplicados_id['id'].nunique()}")

# Step 3: drop the exact duplicates, keeping the first copy of each row.
df_sin_duplicados = df_joined.drop_duplicates(keep='first')
print(f"Tamaño después de eliminar duplicados exactos: {df_sin_duplicados.shape}")

# Step 4: one row per id. The four media columns are gathered into lists and
# country takes its first value in the group; other Occurrence columns can be
# added to the spec to carry them over.
df_agrupado = (
    df_sin_duplicados
    .groupby('id')
    .agg({
        **dict.fromkeys(['Identifier', 'type', 'format', 'accessURI'], list),
        'country': 'first',
    })
    .reset_index()
)

print(f"Tamaño después de agrupar por ID: {df_agrupado.shape}")

# Step 5: write the grouped frame to CSV in the working directory.
# NOTE(review): the list-valued columns are serialized as text in the CSV;
# confirm that whatever reads this file expects that.
df_agrupado.to_csv('df_joined_limpio.csv', index=False)


print("¡DataFrame limpio guardado en CSV!")


Número de filas duplicadas exactas: 536234
Número de IDs con varias filas: 62692
Tamaño después de eliminar duplicados exactos: (189429, 41)
Tamaño después de agrupar por ID: (63475, 6)
¡DataFrame limpio guardado en CSV!


In [None]:
import os

# Expand "~" to the current user's home directory to get the full output path.
output_path = os.path.expanduser("~/Downloads/df_joined.csv")

# Write the joined DataFrame without the index column.
df_joined.to_csv(path_or_buf=output_path, index=False)
print("Archivo guardado en: " + output_path)

Archivo guardado en: /Users/gabrielajara/Downloads/df_joined.csv
