In [7]:
# !pip install pygbif python-dwca-reader

import csv
import os
import time
import zipfile

import pandas as pd
import requests
from pygbif import occurrences

# GBIF credentials.
# Read from the environment so that no secret is stored in the notebook or in
# its version history. The password that used to be written here should be
# treated as compromised and changed on gbif.org.
# Set GBIF_USER, GBIF_PWD and GBIF_EMAIL before starting the kernel. A value
# left as None is handed to pygbif unchanged; per the pygbif documentation it
# then looks up these same variable names itself and reports a missing one.
GBIF_USER = os.environ.get("GBIF_USER")
GBIF_PASS = os.environ.get("GBIF_PWD")
GBIF_EMAIL = os.environ.get("GBIF_EMAIL")

# Species to query: taxon key of the Barn Owl (Tyto alba)
TAXON_KEY = 2497921

# First and last event date to include in the query
DATE_START, DATE_END = "2010-01-01", "2025-12-31"

# Countries to query, as ISO 2-letter codes.
# Only Great Britain and Ireland are active; the full European list is kept
# below for reference and can be swapped in when needed.
# european_countries = [
#     "AD", "AL", "AT", "BA", "BE", "BG", "BY", "CH", "CY", "CZ",
#     "DE", "DK", "EE", "ES", "FI", "FR", "GB", "GR", "HR", "HU",
#     "IE", "IS", "IT", "LI", "LT", "LU", "LV", "MC", "MD", "ME",
#     "MK", "MT", "NL", "NO", "PL", "PT", "RO", "RS", "SE", "SI",
#     "SK", "SM", "UA", "VA"
# ]
european_countries = ["GB", "IE"]

In [8]:
# One "equals" predicate per country code; they are combined with "or" below
# so that a record from any listed country matches.
country_predicates = []
for iso_code in european_countries:
    country_predicates.append({"type": "equals", "key": "COUNTRY", "value": iso_code})

# Individual filters, named for readability.
taxon_filter = {"type": "equals", "key": "TAXON_KEY", "value": str(TAXON_KEY)}
from_date_filter = {"type": "greaterThanOrEquals", "key": "EVENT_DATE", "value": DATE_START}
to_date_filter = {"type": "lessThanOrEquals", "key": "EVENT_DATE", "value": DATE_END}
coordinate_filter = {"type": "equals", "key": "HAS_COORDINATE", "value": "true"}
any_country_filter = {"type": "or", "predicates": country_predicates}

# Top-level predicate: every filter must hold.
predicate = {
    "type": "and",
    "predicates": [
        taxon_filter,
        from_date_filter,
        to_date_filter,
        coordinate_filter,
        any_country_filter,
    ],
}

# Echo the query so the cell output records what was requested.
print(f"Downloading data for {len(european_countries)} European countries")
print(f"Countries: {', '.join(european_countries)}")
print(f"Predicate structure: {len(predicate['predicates'])} main predicates with {len(country_predicates)} country options")

# 1. Submit the asynchronous download request to GBIF
download_request = {
    "queries": predicate,
    "format": "SIMPLE_CSV",
    "user": GBIF_USER,
    "pwd": GBIF_PASS,
    "email": GBIF_EMAIL,
}
res = occurrences.download(**download_request)

# The first element of the result is used as the download key from here on.
download_key = res[0]
print(f"Download initiated (key={download_key})")

# 2. Poll the download status until it succeeds.
# Besides KILLED and FAILED, GBIF can also end a download as CANCELLED or
# FILE_ERASED (see the GBIF download API docs); without checking for those the
# loop would keep polling a download that can never finish.
TERMINAL_FAILURES = ("KILLED", "FAILED", "CANCELLED", "FILE_ERASED")
POLL_INTERVAL_S = 30
MAX_WAIT_S = 6 * 60 * 60  # stop after six hours rather than polling forever

poll_deadline = time.monotonic() + MAX_WAIT_S
while True:
    meta = occurrences.download_meta(download_key)
    status = meta["status"]
    print(f"Current status: {status}")
    if status == "SUCCEEDED":
        break
    if status in TERMINAL_FAILURES:
        raise RuntimeError(f"GBIF download {download_key} failed: {status}")
    if time.monotonic() >= poll_deadline:
        raise TimeoutError(
            f"GBIF download {download_key} still {status} after {MAX_WAIT_S} seconds"
        )
    time.sleep(POLL_INTERVAL_S)

# 3. Fetch the completed ZIP archive.
# The response is streamed to disk in 1 MiB chunks so the whole archive is
# never held in memory (r.content would buffer all of it). The timeout is
# (connect, read) in seconds and keeps a stalled connection from hanging the
# cell indefinitely; the `with` block releases the connection when done.
download_url = meta["downloadLink"]
zip_path = f"{download_key}.zip"
print(f"Downloading archive from {download_url} …")
with requests.get(download_url, stream=True, timeout=(10, 300)) as r:
    r.raise_for_status()
    with open(zip_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# 4. Unpack the archive into a folder that has the download key as its name
extract_dir = download_key
z = zipfile.ZipFile(zip_path, "r")
try:
    z.extractall(path=extract_dir)
finally:
    # Always release the file handle, even if extraction fails.
    z.close()
print(f"Extracted to {extract_dir}")

Downloading data for 2 European countries
Countries: GB, IE
Predicate structure: 5 main predicates with 2 country options


INFO:Your download key is 0012178-250802193616735


Download initiated (key=0012178-250802193616735)
Current status: PREPARING
Current status: PREPARING
Current status: PREPARING
Current status: RUNNING
Current status: RUNNING
Current status: RUNNING
Current status: SUCCEEDED
Downloading archive from https://api.gbif.org/v1/occurrence/download/request/0012178-250802193616735.zip …
Extracted to 0012178-250802193616735


In [9]:
# 5. Load occurrences into pandas.
# GBIF's SIMPLE_CSV export is tab-separated without quoting or escaping, so a
# quote character inside a value is literal text. QUOTE_NONE stops pandas from
# treating a stray '"' as the start of a quoted field, which could otherwise
# swallow the following tabs/newlines and merge or misalign records.
occ_file = os.path.join(extract_dir, f"{download_key}.csv")
df = pd.read_csv(occ_file, sep="\t", quoting=csv.QUOTE_NONE, low_memory=False)

# 6. Drop any records missing coordinates
df_clean = df.dropna(subset=["decimalLatitude", "decimalLongitude"])

# # 7. Clean and convert eventDate column
# # First, drop records with missing eventDate
# df_clean = df_clean.dropna(subset=["eventDate"])

# # Convert eventDate to datetime format (handles various date formats)
# df_clean["eventDate"] = pd.to_datetime(df_clean["eventDate"], errors="coerce")

# # Drop any records where eventDate conversion failed
# df_clean = df_clean.dropna(subset=["eventDate"])

print(f"Total records after cleaning: {len(df_clean)}")
# dropna() first: a missing country code is a float NaN, and sorting it
# together with strings would raise TypeError.
print(f"Countries represented: {sorted(df_clean['countryCode'].dropna().unique())}")

# 8. Save cleaned CSV ready for ArcGIS import.
# The region part of the file name is derived from the configured country list
# so the name stays truthful when `european_countries` is changed. Long lists
# are summarised by their size to keep the file name manageable.
MAX_CODES_IN_FILENAME = 5
if len(european_countries) <= MAX_CODES_IN_FILENAME:
    region_tag = "-".join(european_countries)
else:
    region_tag = f"{len(european_countries)}-countries"

start_year = DATE_START.split('-')[0]
end_year = DATE_END.split('-')[0]
output_csv = f"gbif_barn_owl_{region_tag}_{start_year}-{end_year}.csv"
df_clean.to_csv(output_csv, index=False)
print(f"Cleaned data saved to {output_csv}")

# Short textual summary of the cleaned dataset
country_codes = df_clean["countryCode"]
records_per_country = country_codes.value_counts()

print("\nDataset summary:")
# print(f"Date range: {df_clean['eventDate'].min().strftime('%Y-%m-%d')} to {df_clean['eventDate'].max().strftime('%Y-%m-%d')}")
print(f"Countries: {len(country_codes.unique())}")
print("Records per country:")
# Only the ten most frequent country codes are shown.
print(records_per_country.head(10))

Total records after cleaning: 47653
Countries represented: ['GB', 'IE']
Cleaned data saved to gbif_barn_owl_GB-IE_2010-2025.csv

Dataset summary:
Countries: 2
Records per country:
countryCode
GB    46482
IE     1171
Name: count, dtype: int64


In [10]:
# 9. Remove the intermediate download artefacts; only the cleaned CSV is kept
import shutil

print("Cleaning up temporary files...")

# Each entry: (path, function that deletes it, message on success, message if absent).
# The ZIP file is handled first, then the extracted directory with its contents.
cleanup_targets = [
    (
        zip_path,
        os.remove,
        f"Removed ZIP file: {zip_path}",
        f"ZIP file not found: {zip_path}",
    ),
    (
        extract_dir,
        shutil.rmtree,
        f"Removed directory: {extract_dir}",
        f"Directory not found: {extract_dir}",
    ),
]

for target_path, delete, removed_msg, missing_msg in cleanup_targets:
    if not os.path.exists(target_path):
        print(missing_msg)
        continue
    delete(target_path)
    print(removed_msg)

print("Cleanup completed!")
print(f"Final output file: {output_csv}")

Cleaning up temporary files...
Removed ZIP file: 0012178-250802193616735.zip
Removed directory: 0012178-250802193616735
Cleanup completed!
Final output file: gbif_barn_owl_GB-IE_2010-2025.csv
