In [10]:
# 1. Instalar dependencias
!pip install xarray netCDF4 geopy tqdm skyfield requests



In [11]:
!pip install pyproj



In [12]:
# 2. Importar librerías
import pandas as pd
import numpy as np
import xarray as xr
from geopy.distance import geodesic
from tqdm.notebook import tqdm
from functools import lru_cache
from datetime import datetime, timezone, timedelta
from skyfield.api import load, Topos
from skyfield.almanac import sunrise_sunset, find_discrete
import requests

tqdm.pandas()

In [13]:
# 3. Funciones de cálculo para posición, distancia y tiempo solar
def adjusted_position_mc(lat, lon, se_x, se_y, n_samples=50, rng=None):
    """Monte Carlo position estimate: mean of normally perturbed copies of a fix.

    Draws ``n_samples`` latitudes from N(lat, se_y) and ``n_samples``
    longitudes from N(lon, se_x) and returns the two sample means.

    Parameters
    ----------
    lat, lon : float
        Centre of the sampling distributions.
    se_x, se_y : float
        Standard deviations applied to ``lon`` and ``lat`` respectively.
        They are used as-is, i.e. they must be expressed in the same unit as
        ``lon``/``lat`` (degrees).
        NOTE(review): the ``se_mu_x``/``se_mu_y`` columns of the input CSV hold
        values such as 30, which look like projected metres rather than
        degrees -- convert before calling; confirm against the data dictionary.
    n_samples : int, default 50
        Number of draws per coordinate.
    rng : numpy.random.Generator, optional
        Random source. When omitted, the legacy global ``np.random`` state is
        used (seed it with ``np.random.seed`` for reproducible runs).

    Returns
    -------
    tuple of float
        ``(mean_lat, mean_lon)``.
    """
    # Both the np.random module and a Generator expose .normal(loc, scale, size).
    sampler = np.random if rng is None else rng
    lats = sampler.normal(lat, se_y, n_samples)
    lons = sampler.normal(lon, se_x, n_samples)
    return lats.mean(), lons.mean()

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6371 km. Inputs may be
    scalars or numpy arrays of matching shape (the computation is element-wise).

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in kilometres.
    """
    R = 6371.0  # sphere radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (near
    # antipodal or identical points), which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

def calculate_velocity(distance_km, time_sec):
    """Average speed in km/h for a distance in km covered in ``time_sec`` seconds.

    Returns 0 when the elapsed time is not strictly positive.
    """
    if time_sec > 0:
        km_per_second = distance_km / time_sec
        return km_per_second * 3600
    return 0

def calculate_acceleration(v1, v2, time_sec):
    """Change of speed from ``v1`` to ``v2`` divided by the elapsed time in hours.

    ``time_sec`` is given in seconds; with speeds in km/h the result is in
    km/h^2. Returns 0 when the elapsed time is not strictly positive.
    """
    if time_sec > 0:
        elapsed_hours = time_sec / 3600
        return (v2 - v1) / elapsed_hours
    return 0

# 4. Ephemerides – daylight hours
# Module-level skyfield objects shared by get_daylight():
#   eph – the 'de421.bsp' ephemeris (provides the 'Earth' and 'Sun' bodies used there)
#   ts  – the timescale used to build the start/end times of each day
eph = load('de421.bsp')
ts = load.timescale()
@lru_cache(maxsize=50000)
def get_daylight(lat, lon, date_str):
    """Hours of daylight at (lat, lon) during the UTC calendar day ``date_str``.

    Parameters
    ----------
    lat, lon : float
        Observer position in degrees (hashable, because results are memoised).
    date_str : str
        Day in ``'%Y-%m-%d'`` format, interpreted as a UTC date.

    Returns
    -------
    tuple
        ``(daylight_hours, is_polar_night, is_midnight_sun)``.
        * No sunrise/sunset transition during the day: ``(24.0, False, True)``
          if the sun is above the horizon at the start of the day, otherwise
          ``(0.0, True, False)``.
        * Otherwise: the total time the sun is up inside the 24 h window, with
          both flags ``False``.

    Notes
    -----
    The previous implementation returned ``sunset - sunrise`` using the events
    found inside the UTC day. Away from the Greenwich meridian the sunset of
    the UTC day often falls *before* its sunrise, which produced negative
    day lengths; days with a single transition were reported as 0 h or 24 h.
    Integrating the sun-up intervals handles both cases.
    """
    day_start = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    day_end = day_start + timedelta(days=1)
    t0 = ts.utc(day_start.year, day_start.month, day_start.day)
    t1 = ts.utc(day_end.year, day_end.month, day_end.day)
    obs = Topos(latitude_degrees=lat, longitude_degrees=lon)
    times, events = find_discrete(t0, t1, sunrise_sunset(eph, obs))

    if len(events) == 0:
        # No transition all day: decide between polar night and midnight sun
        # from the sun's altitude at the start of the day.
        sun_alt = (eph['Earth'] + obs).at(t0).observe(eph['Sun']).apparent().altaz()[0].degrees
        if sun_alt > 0:
            return 24.0, False, True
        return 0.0, True, False

    # Each event carries the state *after* the transition (1 = sun up,
    # 0 = sun down), so the state before the first event is its opposite.
    sun_up = not bool(events[0])
    lit_seconds = 0.0
    cursor = day_start
    for ti, ev in zip(times, events):
        moment = ti.utc_datetime()
        if sun_up:
            lit_seconds += (moment - cursor).total_seconds()
        cursor = moment
        sun_up = bool(ev)
    if sun_up:
        # Sun still up at the end of the window.
        lit_seconds += (day_end - cursor).total_seconds()
    return lit_seconds / 3600, False, False

# 5. Remote environmental data
# Gridded dataset opened lazily from the PolarWatch ERDDAP server
# (dataset id nsidcG02202v4nh1day; presumably the NSIDC sea-ice concentration
# product -- confirm on the server's dataset page).
url = 'https://polarwatch.noaa.gov/erddap/griddap/nsidcG02202v4nh1day'
# Keep only the study period. A single slice replaces the former
# 1985-2017 slice that was immediately narrowed to 2007-2017.
ds = xr.open_dataset(url, engine='netcdf4').sel(time=slice('2007-01-01', '2017-12-31'))
# Variables sampled for every location.
# NOTE(review): get_env() returns NaN for any variable missing from `ds`, so a
# wrong name fails silently. 'temp_surface', 'wind_speed' and 'cloud_cover' do
# not look like sea-ice product variables -- check with print(ds.data_vars).
vars_env = ['cdr_seaice_conc','temp_surface','wind_speed','cloud_cover']
def get_env(ds, var, lat, lon, date_str):
    """Nearest-neighbour lookup of ``var`` in ``ds`` at a date and lat/lon position.

    Parameters
    ----------
    ds : mapping of variable name -> array supporting ``.sel(...)``
        (an ``xarray.Dataset`` in this notebook).
    var : str
        Variable name.
    lat, lon : float
        Passed to ``.sel`` as the ``latitude`` / ``longitude`` coordinates.
        NOTE(review): a later cell queries the same dataset by ``xgrid`` /
        ``ygrid``; if the dataset has no latitude/longitude coordinates this
        lookup always yields NaN -- confirm.
    date_str : str
        Date parsed with ``np.datetime64``.

    Returns
    -------
    float
        The selected value, or NaN when the lookup fails.

    Notes
    -----
    The former ``@lru_cache`` decorator was removed: it has to hash every
    argument, and ``xarray.Dataset`` objects are unhashable, so each call
    raised ``TypeError`` before the body (and its ``try``) was reached.
    """
    try:
        cell = ds[var].sel(
            time=np.datetime64(date_str), latitude=lat, longitude=lon, method='nearest'
        )
        return float(cell.values)
    except Exception:
        # Best-effort lookup: missing variable/coordinate or remote failure -> NaN.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return np.nan

In [14]:
from pyproj import Transformer

# Coordinate transformer: geographic lat/lon (WGS84, EPSG:4326) -> polar projection (EPSG:3413).
# With always_xy=True coordinates are passed in (x, y) order, i.e. (lon, lat) for the
# geographic side, which is how convert_to_projected() calls it.
# NOTE(review): EPSG:3413 is assumed to match the grid of the environmental dataset
# (its xgrid/ygrid coordinates); NSIDC sea-ice grids may be defined on EPSG:3411 -- confirm.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3413", always_xy=True)

# Conversion helper
def convert_to_projected(lat, lon):
    """Project a geographic position with the module-level ``transformer``.

    Takes ``(lat, lon)`` and hands them to the transformer in ``(lon, lat)``
    order; returns the projected ``(x, y)`` pair.
    """
    easting, northing = transformer.transform(lon, lat)
    return (easting, northing)


In [15]:
# 6. Load and sort the main CSV
# Steps: parse timestamps, keep the years 2007-2017 (inclusive), then order each
# animal's fixes chronologically and renumber the rows.
df = (
    pd.read_csv('polarBear_CTCRWlocations_chukchiBeaufort_1985-2017.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .loc[lambda frame: frame['timestamp'].dt.year.between(2007, 2017)]
    .sort_values(['UniqueAnimalID', 'timestamp'])
    .reset_index(drop=True)
)


In [16]:
# Preview the first rows of the filtered, sorted table.
df.head()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y
0,120,2008-08-30 06:00:00,70.571,-151.8802,30,30
1,120,2008-08-30 12:00:00,70.571,-151.8801,30,30
2,120,2008-08-30 18:00:00,70.5706,-151.879,30,30
3,120,2008-08-31 00:00:00,70.5714,-151.8821,30,30
4,120,2008-08-31 06:00:00,70.5714,-151.8822,30,30


In [8]:
# # 7. Batch processing (earlier draft, fully commented out; the active version is in the following cell)
# batch_size = 10000
# res = []
# for i in tqdm(range(0, len(df), batch_size), desc='Batches'):
#     b = df.iloc[i:i+batch_size].copy()
#     b['adj_lat'], b['adj_lon'] = zip(*b.progress_apply(lambda r: adjusted_position_mc(r['mu_lat'],r['mu_lon'],r['se_mu_x'],r['se_mu_y']),axis=1))
#     b['date_str'] = b['timestamp'].dt.strftime('%Y-%m-%d')
#     b['lat2'] = b['adj_lat'].round(2)
#     b['lon2'] = b['adj_lon'].round(2)
#     dists, vels, accs = [], [], []
#     for aid, g in b.groupby('UniqueAnimalID'):
#         g = g.reset_index(drop=True)
#         if len(g) < 2:
#           continue  # saltar grupos muy pequeños
#         ds_ = [0]; vs=[0]; ac=[0]
#         for j in range(1,len(g)):
#             dt = (g.loc[j,'timestamp']-g.loc[j-1,'timestamp']).total_seconds()
#             dk = haversine_distance(g.loc[j-1,'adj_lat'],g.loc[j-1,'adj_lon'],g.loc[j,'adj_lat'],g.loc[j,'adj_lon'])
#             dists.append(dk); vels.append(calculate_velocity(dk,dt))
#         for j in range(1,len(vels)):
#             # dt=(g.loc[j,'timestamp']-g.loc[j-1,'timestamp']).total_seconds()
#             dt = (g.iloc[j]['timestamp'] - g.iloc[j-1]['timestamp']).total_seconds()
#             accs.append(calculate_acceleration(vels[j-1],vels[j],dt))
#     b['distance_km']=dists; b['velocity_kmh']=vels; b['acceleration_kmh2']=accs
#     b[['daylight_hours','is_polar_night','is_midnight_sun']] = b.progress_apply(lambda r: pd.Series(get_daylight(r['lat2'],r['lon2'],r['date_str'])), axis=1)
#     for var in vars_env:
#         b[var] = b.progress_apply(lambda r: get_env(ds,var,r['lat2'],r['lon2'],r['date_str']),axis=1)
#     res.append(b)
# df_final = pd.concat(res).reset_index(drop=True)


In [None]:
from tqdm.notebook import tqdm
from functools import partial

# Number of rows handled per batch, and the list collecting the processed batches
# (concatenated into df_final at the end of this cell).
batch_size = 10000
res = []

# # # Definir función con ds fijado fuera
# # def get_env_partial(row, ds, var):
# #     return get_env(ds, var, row['lat2'], row['lon2'], row['date_str'])

# # 2. Define la función que usa el dataset GLOBAL
# # get_env ya no recibe ds como parámetro
# def get_env(var, lat, lon, date_str):
#     # Usamos directamente la variable ds cargada antes
#     # Accede al Dataset global, ejemplo (ajusta según tus variables):
#     time = np.datetime64(date_str)
#     value = ds[var].sel(time=time, method='nearest').sel(lat=lat, lon=lon, method='nearest').values.item()
#     return value
@lru_cache(maxsize=50000)
def get_env(var, x_proj, y_proj, date_str):
    """Nearest-neighbour lookup of ``var`` in the global dataset ``ds``.

    Parameters
    ----------
    var : str
        Variable name in ``ds``.
    x_proj, y_proj : float
        Projected coordinates, matched against the ``xgrid`` / ``ygrid``
        coordinates of the dataset.
    date_str : str
        Date parsed with ``np.datetime64`` and matched against ``time``.

    Returns
    -------
    float
        The selected value, or NaN when the lookup fails (unknown variable,
        remote read error, ...).

    Notes
    -----
    Results are memoised, including NaN results from failed lookups; call
    ``get_env.cache_clear()`` after changing ``ds`` or after a transient
    network failure.
    """
    try:
        nearest_cell = ds[var].sel(
            time=np.datetime64(date_str),
            xgrid=x_proj,
            ygrid=y_proj,
            method='nearest'
        )
        return float(nearest_cell.values)
    except Exception:
        # Best-effort lookup. `Exception` rather than a bare except so that
        # KeyboardInterrupt / SystemExit still stop the long-running batch loop.
        return np.nan


# Seed for the Monte Carlo position adjustment, so re-running the cell reproduces df_final.
RANDOM_SEED = 42
# Metres per degree of latitude on the 6371 km sphere used by haversine_distance().
METERS_PER_DEGREE = 111_195.0


def _track_kinematics(track):
    """Per-fix distance (km), speed (km/h) and acceleration (km/h^2) for ONE animal.

    ``track`` must be ordered by timestamp (df is sorted by animal and
    timestamp when it is loaded). The first fix of a track has no predecessor
    and gets 0 for all three quantities.
    NOTE(review): the acceleration of the second fix is computed against that
    placeholder speed of 0, as in the original code.
    """
    n = len(track)
    lat = track['adj_lat'].to_numpy()
    lon = track['adj_lon'].to_numpy()
    dt = track['timestamp'].diff().dt.total_seconds().to_numpy()  # dt[0] is NaN

    dist = np.zeros(n)
    if n > 1:
        dist[1:] = haversine_distance(lat[:-1], lon[:-1], lat[1:], lon[1:])
    vel = [0.0] + [calculate_velocity(dist[j], dt[j]) for j in range(1, n)]
    acc = [0.0] + [calculate_acceleration(vel[j - 1], vel[j], dt[j]) for j in range(1, n)]
    return pd.DataFrame(
        {'distance_km': dist, 'velocity_kmh': vel, 'acceleration_kmh2': acc},
        index=track.index,
    )


np.random.seed(RANDOM_SEED)
tracks = df.copy()

# --- Monte Carlo position adjustment -------------------------------------
# adjusted_position_mc() applies the standard errors directly to lat/lon, so
# they must be in degrees.
# NOTE(review): se_mu_x / se_mu_y (e.g. 30) are assumed to be metres in x
# (east-west) and y (north-south); confirm against the data dictionary. Passing
# them unconverted would scatter positions by several degrees.
cos_lat = np.cos(np.radians(tracks['mu_lat'])).clip(lower=1e-6)  # avoid /0 at the pole
se_lat_deg = tracks['se_mu_y'] / METERS_PER_DEGREE
se_lon_deg = tracks['se_mu_x'] / (METERS_PER_DEGREE * cos_lat)

adjusted = [
    adjusted_position_mc(lat, lon, se_lon, se_lat)
    for lat, lon, se_lon, se_lat in zip(tracks['mu_lat'], tracks['mu_lon'], se_lon_deg, se_lat_deg)
]
tracks[['adj_lat', 'adj_lon']] = pd.DataFrame(
    adjusted, index=tracks.index, columns=['adj_lat', 'adj_lon'])

tracks['date_str'] = tracks['timestamp'].dt.strftime('%Y-%m-%d')
# Rounded positions keep the number of distinct cache keys of get_daylight() manageable.
tracks['lat2'] = tracks['adj_lat'].round(2)
tracks['lon2'] = tracks['adj_lon'].round(2)
# The pyproj transformer is vectorised: project all positions in one call.
tracks['proj_x'], tracks['proj_y'] = convert_to_projected(
    tracks['lat2'].to_numpy(), tracks['lon2'].to_numpy())

# --- Kinematics per animal, over the WHOLE track --------------------------
# Computing these inside fixed-size row batches dropped the step across every
# batch boundary (the first fix of an animal in each batch was forced to 0).
kinematic_cols = ['distance_km', 'velocity_kmh', 'acceleration_kmh2']
per_animal = [_track_kinematics(g) for _, g in tracks.groupby('UniqueAnimalID', sort=False)]
if per_animal:
    # Rows without an animal id are not part of any group; they keep 0.0.
    tracks[kinematic_cols] = pd.concat(per_animal).reindex(tracks.index, fill_value=0.0)
else:
    for col in kinematic_cols:
        tracks[col] = 0.0

# --- Slow per-row lookups, processed in batches ---------------------------
daylight_cols = ['daylight_hours', 'is_polar_night', 'is_midnight_sun']
for i in tqdm(range(0, len(tracks), batch_size), desc='Batches'):
    b = tracks.iloc[i:i + batch_size].copy()

    # Daylight hours and polar-night / midnight-sun flags
    daylight = [
        get_daylight(lat, lon, day)
        for lat, lon, day in zip(b['lat2'], b['lon2'], b['date_str'])
    ]
    b[daylight_cols] = pd.DataFrame(daylight, index=b.index, columns=daylight_cols)

    # Environmental variables at the projected position
    for var in vars_env:
        b[var] = [
            get_env(var, x, y, day)
            for x, y, day in zip(b['proj_x'], b['proj_y'], b['date_str'])
        ]

    res.append(b)

df_final = pd.concat(res).reset_index(drop=True)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# print(ds.variables)


In [None]:
from numpy import radians, sin, cos, arctan2, degrees

# Bearing with error propagation (Monte Carlo)
def bearing_mc(lat1, lon1, lat2, lon2, se_x1, se_y1, se_x2, se_y2, n=30):
    """Mean initial bearing (degrees, 0 = north, clockwise) from point 1 to point 2.

    Both end points are perturbed ``n`` times with normal noise (``se_y*`` on
    the latitudes, ``se_x*`` on the longitudes); the bearing of every perturbed
    pair is computed and the *circular* mean is returned.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float
        Start and end positions in degrees.
    se_x1, se_y1, se_x2, se_y2 : float
        Standard deviations of the noise, in the same unit as the coordinates
        (degrees).
    n : int, default 30
        Number of Monte Carlo draws.

    Returns
    -------
    float
        Bearing in the range [0, 360).

    Notes
    -----
    The arithmetic mean used previously is wrong across the 0/360 wrap-around:
    samples at 359 and 1 degrees averaged to 180 instead of 0.
    """
    phi1 = np.radians(np.random.normal(lat1, se_y1, n))
    lam1 = np.radians(np.random.normal(lon1, se_x1, n))
    phi2 = np.radians(np.random.normal(lat2, se_y2, n))
    lam2 = np.radians(np.random.normal(lon2, se_x2, n))

    dlam = lam2 - lam1
    y = np.sin(dlam) * np.cos(phi2)
    x = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(dlam)
    theta = np.arctan2(y, x)  # per-sample bearing in radians

    # Circular mean: average the unit vectors, then take the angle of the sum.
    mean_theta = np.arctan2(np.sin(theta).mean(), np.cos(theta).mean())
    # The second modulo folds a result that rounds to exactly 360.0 back to 0.0.
    return np.degrees(mean_theta) % 360.0 % 360.0

# Previous fix of the same animal (position and standard errors), used for the bearing
previous_fix = df_final.groupby('UniqueAnimalID')[
    ['adj_lat', 'adj_lon', 'se_mu_x', 'se_mu_y']].shift(1)
df_final['prev_lat'] = previous_fix['adj_lat']
df_final['prev_lon'] = previous_fix['adj_lon']
df_final['prev_se_x'] = previous_fix['se_mu_x']
df_final['prev_se_y'] = previous_fix['se_mu_y']

# bearing_mc() applies the standard errors directly to lat/lon, so they must be
# in degrees.
# NOTE(review): se_mu_x / se_mu_y (e.g. 30) are assumed to be metres in x
# (east-west) and y (north-south); confirm against the data dictionary. Used
# unconverted they would make the Monte Carlo bearings essentially random.
METERS_PER_DEGREE = 111_195.0  # metres per degree of latitude on a 6371 km sphere


def _se_to_degrees(se_x_m, se_y_m, lat_deg):
    """Convert x/y standard errors in metres to (lon, lat) standard errors in degrees."""
    cos_lat = np.cos(np.radians(lat_deg)).clip(lower=1e-6)  # avoid /0 at the pole
    return se_x_m / (METERS_PER_DEGREE * cos_lat), se_y_m / METERS_PER_DEGREE


se_lon_deg, se_lat_deg = _se_to_degrees(
    df_final['se_mu_x'], df_final['se_mu_y'], df_final['adj_lat'])
prev_se_lon_deg, prev_se_lat_deg = _se_to_degrees(
    df_final['prev_se_x'], df_final['prev_se_y'], df_final['prev_lat'])

# Bearing with error propagation; undefined (NaN) for the first fix of each animal
bearing_args = zip(
    df_final['prev_lat'], df_final['prev_lon'], df_final['adj_lat'], df_final['adj_lon'],
    prev_se_lon_deg, prev_se_lat_deg, se_lon_deg, se_lat_deg,
)
df_final['bearing'] = [
    bearing_mc(*args) if pd.notnull(args[0]) else np.nan
    for args in tqdm(bearing_args, total=len(df_final), desc='Bearings')
]

# Change of direction: absolute angular difference folded into [0, 180];
# 0 where there is no previous bearing.
df_final['prev_bearing'] = df_final.groupby('UniqueAnimalID')['bearing'].shift(1)
turn = ((df_final['bearing'] - df_final['prev_bearing'] + 180) % 360 - 180).abs()
df_final['bearing_change'] = turn.where(df_final['prev_bearing'].notna(), 0)

# --------------------
# Distance travelled per animal and calendar day
df_final['date'] = df_final['timestamp'].dt.date
daily_distance = (
    df_final.groupby(['UniqueAnimalID', 'date'])['distance_km']
    .sum()
    .reset_index()
    .rename(columns={'distance_km': 'distance_per_day'})
)

# Merge back onto df_final. Dropping a distance_per_day column left by an
# earlier run keeps the cell idempotent (no distance_per_day_x / _y on re-run).
df_final = (
    df_final.drop(columns='distance_per_day', errors='ignore')
    .merge(daily_distance, on=['UniqueAnimalID', 'date'], how='left')
)

In [None]:
# 8. Results: preview the first rows of the processed table
df_final.head()

In [None]:
# 9. Save the final result (CSV in the working directory, without the row index)
df_final.to_csv('polar_bear_processed_full.csv', index=False)