# LOAD DATA

### Dependencias

In [None]:
# !pip install xarray netCDF4 geopy tqdm skyfield requests pyproj

### Import librerías

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timezone, timedelta
from functools import lru_cache
from geopy.distance import geodesic
from numpy import radians, sin, cos, arctan2, degrees
from pyproj import Transformer
from skyfield.api import load, Topos
from skyfield.almanac import find_discrete, sunrise_sunset
from tqdm.notebook import tqdm
tqdm.pandas()

### Funciones de cálculo
* **Posición, distancia y tiempo solar**

In [None]:
def adjusted_position_mc(lat, lon, se_x, se_y, n_samples=50):
    lats = np.random.normal(lat, se_y, n_samples)
    lons = np.random.normal(lon, se_x, n_samples)
    return lats.mean(), lons.mean()

def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371.0
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1; dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2*R*np.arctan2(np.sqrt(a), np.sqrt(1-a))

def calculate_velocity(distance_km, time_sec):
    return (distance_km / time_sec) * 3600 if time_sec>0 else 0

def calculate_acceleration(v1, v2, time_sec):
    return (v2 - v1) / (time_sec/3600) if time_sec>0 else 0

#### **Horas de luz - efemérides**

 Tablas o datos que indican las posiciones de los cuerpos celestes (como el Sol, la Luna, los planetas, etc.) en el cielo, para fechas y horas específicas.

En astronomía, se usan para saber cuándo ocurren eventos importantes, como:

    - Salida y puesta del Sol 🌄🌇

    - Salida y puesta de la Luna 🌕🌘

    - Duración del día (horas de luz)

    - Fases lunares, eclipses, etc.

Por ejemplo, puedes usar _Skyfield_ para:

    - Saber cuándo sale el Sol en una latitud y longitud determinada.

    - Calcular las horas de luz en el Ártico, donde hay fenómenos extremos como el Sol de medianoche o la noche polar.

In [None]:
eph = load('de421.bsp')
ts = load.timescale()
@lru_cache(maxsize=50000)
def get_daylight(lat, lon, date_str):
    d = datetime.strptime(date_str,'%Y-%m-%d').replace(tzinfo=timezone.utc)
    t0 = ts.utc(d.year, d.month, d.day)
    t1 = ts.utc((d+timedelta(days=1)).year, (d+timedelta(days=1)).month, (d+timedelta(days=1)).day)
    obs = Topos(latitude_degrees=lat, longitude_degrees=lon)
    f = sunrise_sunset(eph, obs)
    times, events = find_discrete(t0, t1, f)
    sunr = suns = None
    for ti, ev in zip(times, events):
        if ev==1: sunr = ti.utc_datetime()
        elif ev==0: suns = ti.utc_datetime()
    if sunr and suns:
        dh = (suns - sunr).total_seconds()/3600
        return dh, False, False
    else:
        alt = eph['Earth']+obs
        sun_alt = alt.at(t0).observe(eph['Sun']).apparent().altaz()[0].degrees
        if sun_alt>0: return 24.0, False, True
        else: return 0.0, True, False

#### **Datos ambientales remotos**

* **Cobertura de hielo**


datos ambientales satelitales en forma de grilla a través del URL que corresponde a un servidor ERDDAP del gobierno de EE.UU., específicamente del NOAA PolarWatch. Proporciona acceso a 

En concreto, este endpoint:

https://polarwatch.noaa.gov/erddap/griddap/nsidcG02202v4nh1day

se refiere al producto de concentración diaria de hielo marino del hemisferio norte, conocido como:

NSIDC-0051 / G02202 Version 4 — Northern Hemisphere Daily Sea Ice Concentration

    Filtrar por intervalo 2007 -2017

In [None]:
url = 'https://polarwatch.noaa.gov/erddap/griddap/nsidcG02202v4nh1day'
ds = xr.open_dataset(url, engine='netcdf4').sel(time=slice('1985-01-01','2017-12-31'))
ds = ds.sel(time=slice('2007-01-01', '2017-12-31'))
vars_env = ['cdr_seaice_conc','temp_surface','wind_speed','cloud_cover']


In [None]:

@lru_cache(maxsize=50000)
def get_env(ds, var, lat, lon, date_str):
    try:
        return float(ds[var].sel(time=np.datetime64(date_str), latitude=lat, longitude=lon, method='nearest').values)
    except: return np.nan

La proyección polar es una forma de representar zonas cercanas a los polos (como el Ártico) en un mapa plano. En lugar de usar latitud y longitud directamente (como en los mapas comunes), transforma esas coordenadas en un sistema centrado en el polo.

🧊 ¿Por qué se usa?

   Porque en regiones polares, las líneas de longitud se juntan mucho y los mapas se deforman. La proyección polar mantiene mejor las distancias y formas en esas zonas.

📌 En los datos satelitales como los del hielo marino:  

No están en lat/lon directamente, sino en una grilla con una proyección polar.  

Para extraer un valor (como concentración de hielo) en una lat/lon, necesitas convertir esa lat/lon a coordenadas de esa grilla.

In [None]:
# Definición del transformador: de lat/lon (WGS84) a proyección polar (usamos EPSG:3413 como estándar NOAA Ártico)
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3413", always_xy=True)

# Función de conversión
def convert_to_projected(lat, lon):
    x, y = transformer.transform(lon, lat)
    return x, y

### **Datos CSV principal**  

    Cargar y ordenar

In [None]:
df = pd.read_csv('../data/raw/polarBear_CTCRWlocations_chukchiBeaufort_1985-2017.csv')
df.head()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y
0,1,7/1/1986 0:00,69.7687,-141.3759,14589,14589
1,1,7/1/1986 6:00,69.7703,-141.3863,12248,12248
2,1,7/1/1986 12:00,69.7718,-141.3942,9375,9375
3,1,7/1/1986 18:00,69.773,-141.396,5835,5835
4,1,7/2/1986 0:00,69.7738,-141.3834,2517,2517


    Filtrar por intervalo 2007 - 2017

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df[(df['timestamp'].dt.year >= 2007) & (df['timestamp'].dt.year <= 2017)]
df = df.sort_values(['UniqueAnimalID','timestamp']).reset_index(drop=True)

In [None]:
batch_size = 10000
res = []

@lru_cache(maxsize=50000)
def get_env(var, x_proj, y_proj, date_str):
    try:
        return float(ds[var].sel(
            time=np.datetime64(date_str),
            xgrid=x_proj,
            ygrid=y_proj,
            method='nearest'
        ).values)
    except:
        return np.nan

In [None]:

for i in tqdm(range(0, len(df), batch_size), desc='Batches'):
    b = df.iloc[i:i+batch_size].copy()

    # Ajuste de posición con Monte Carlo y rounding
    b['adj_lat'], b['adj_lon'] = zip(*b.progress_apply(
        lambda r: adjusted_position_mc(r['mu_lat'], r['mu_lon'], r['se_mu_x'], r['se_mu_y']), axis=1))
    b['date_str'] = b['timestamp'].dt.strftime('%Y-%m-%d')
    b['lat2'] = b['adj_lat'].round(2)
    b['lon2'] = b['adj_lon'].round(2)
    b['proj_x'], b['proj_y'] = zip(*b.progress_apply(lambda r: convert_to_projected(r['lat2'], r['lon2']), axis=1))


    # Inicializamos columnas vacías para luego asignar
    b['distance_km'] = 0.0
    b['velocity_kmh'] = 0.0
    b['acceleration_kmh2'] = 0.0

    # Procesar por UniqueAnimalID dentro del batch
    for aid, g in b.groupby('UniqueAnimalID'):
        g = g.reset_index()

        if len(g) < 2:
            continue  # saltar grupos con un solo punto

        dists = [0.0]
        vels = [0.0]
        accs = [0.0]

        for j in range(1, len(g)):
            dt = (g.loc[j, 'timestamp'] - g.loc[j - 1, 'timestamp']).total_seconds()
            dk = haversine_distance(g.loc[j - 1, 'adj_lat'], g.loc[j - 1, 'adj_lon'], g.loc[j, 'adj_lat'], g.loc[j, 'adj_lon'])
            dists.append(dk)
            vels.append(calculate_velocity(dk, dt))

        for j in range(1, len(vels)):
            dt_acc = (g.loc[j, 'timestamp'] - g.loc[j - 1, 'timestamp']).total_seconds()
            accs.append(calculate_acceleration(vels[j - 1], vels[j], dt_acc))

        # Ajustar tamaño de accs si es mayor
        if len(accs) > len(g):
            accs = accs[:len(g)]

        # Asignar resultados al DataFrame original 'b' usando el índice original
        b.loc[g['index'], 'distance_km'] = dists
        b.loc[g['index'], 'velocity_kmh'] = vels
        b.loc[g['index'], 'acceleration_kmh2'] = accs

    # Calcular daylight y variables ambientales
    b[['daylight_hours', 'is_polar_night', 'is_midnight_sun']] = b.progress_apply(
        lambda r: pd.Series(get_daylight(r['lat2'], r['lon2'], r['date_str'])), axis=1)

    for var in vars_env:
        b[var] = b.progress_apply(lambda r: get_env(var, r['proj_x'], r['proj_y'], r['date_str']), axis=1)

    res.append(b)

    df_final = pd.concat(res).reset_index(drop=True)


In [None]:
def bearing_mc(lat1, lon1, lat2, lon2, se_x1, se_y1, se_x2, se_y2, n=30):
    '''
        Función para dirección con propagación de error (Monte Carlo)
    '''
    bearings = []
    for _ in range(n):
        l1 = np.random.normal(lat1, se_y1)
        o1 = np.random.normal(lon1, se_x1)
        l2 = np.random.normal(lat2, se_y2)
        o2 = np.random.normal(lon2, se_x2)

        φ1, φ2 = radians(l1), radians(l2)
        λ1, λ2 = radians(o1), radians(o2)
        y = sin(λ2 - λ1) * cos(φ2)
        x = cos(φ1) * sin(φ2) - sin(φ1) * cos(φ2) * cos(λ2 - λ1)
        θ = degrees(arctan2(y, x))
        bearings.append((θ + 360) % 360)
    return np.mean(bearings)

# Calcular previos para cambio de dirección
df_final['prev_lat'] = df_final.groupby('UniqueAnimalID')['adj_lat'].shift(1)
df_final['prev_lon'] = df_final.groupby('UniqueAnimalID')['adj_lon'].shift(1)
df_final['prev_se_x'] = df_final.groupby('UniqueAnimalID')['se_mu_x'].shift(1)
df_final['prev_se_y'] = df_final.groupby('UniqueAnimalID')['se_mu_y'].shift(1)

# Calcular bearing con error
df_final['bearing'] = df_final.progress_apply(
    lambda r: bearing_mc(
        r['prev_lat'], r['prev_lon'], r['adj_lat'], r['adj_lon'],
        r['prev_se_x'], r['prev_se_y'], r['se_mu_x'], r['se_mu_y']
    ) if pd.notnull(r['prev_lat']) else np.nan,
    axis=1
)

# Calcular cambio de dirección
df_final['prev_bearing'] = df_final.groupby('UniqueAnimalID')['bearing'].shift(1)
df_final['bearing_change'] = df_final.apply(
    lambda r: abs((r['bearing'] - r['prev_bearing'] + 180) % 360 - 180)
    if pd.notnull(r['prev_bearing']) else 0,
    axis=1
)

#### *__Distance_per_day__*

In [None]:
df_final['date'] = df_final['timestamp'].dt.date
daily_distance = df_final.groupby(['UniqueAnimalID', 'date'])['distance_km'].sum().reset_index()
daily_distance.rename(columns={'distance_km': 'distance_per_day'}, inplace=True)

#### __Merge final__

In [None]:
# Merge con df_final
df_final = df_final.merge(daily_distance, on=['UniqueAnimalID', 'date'], how='left')


-----------------------------------------------------------------------------
# GUARDAR EL RICH DS

In [None]:
df_final.to_csv('polar_bear_processed_full.csv', index=False)

### 1er vistazo

In [86]:
df.tail()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,adj_lat,adj_lon,date_str,lat2,...,cloud_cover,prev_lat,prev_lon,prev_se_x,prev_se_y,bearing,prev_bearing,bearing_change,date,distance_per_day
122009,541,2007-11-15 18:00:00,76.0064,-147.6617,10621,10621,-198.148496,342.653795,2007-11-15,-198.15,...,,428.106824,190.292103,9930.0,9930.0,148.172633,213.7638,65.591168,2007-11-15,36878.551511
122010,541,2007-11-16 00:00:00,76.0677,-148.0419,10606,10606,2336.847855,-962.52444,2007-11-16,2336.85,...,,-198.148496,342.653795,10621.0,10621.0,176.993764,148.172633,28.821132,2007-11-16,48731.907351
122011,541,2007-11-16 06:00:00,76.127,-148.4143,9888,9888,294.016826,-1662.410103,2007-11-16,294.02,...,,2336.847855,-962.52444,10606.0,10606.0,191.310292,176.993764,14.316528,2007-11-16,48731.907351
122012,541,2007-11-16 12:00:00,76.1821,-148.7634,8623,8623,-957.680355,787.723708,2007-11-16,-957.68,...,,294.016826,-1662.410103,9888.0,9888.0,199.10316,191.310292,7.792868,2007-11-16,48731.907351
122013,541,2007-11-16 18:00:00,76.2298,-149.0669,7409,7409,1878.141242,-657.829179,2007-11-16,1878.14,...,,-957.680355,787.723708,8623.0,8623.0,173.287385,199.10316,25.815775,2007-11-16,48731.907351


In [87]:
df.shape

(122014, 32)

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122014 entries, 0 to 122013
Data columns (total 32 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   UniqueAnimalID     122014 non-null  int64  
 1   timestamp          122014 non-null  object 
 2   mu_lat             122014 non-null  float64
 3   mu_lon             122014 non-null  float64
 4   se_mu_x            122014 non-null  int64  
 5   se_mu_y            122014 non-null  int64  
 6   adj_lat            122014 non-null  float64
 7   adj_lon            122014 non-null  float64
 8   date_str           122014 non-null  object 
 9   lat2               122014 non-null  float64
 10  lon2               122014 non-null  float64
 11  proj_x             122014 non-null  float64
 12  proj_y             122014 non-null  float64
 13  distance_km        122014 non-null  float64
 14  velocity_kmh       122014 non-null  float64
 15  acceleration_kmh2  122014 non-null  float64
 16  da

In [91]:
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)
  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)


Unnamed: 0,UniqueAnimalID,mu_lat,mu_lon,se_mu_x,se_mu_y,adj_lat,adj_lon,lat2,lon2,proj_x,...,wind_speed,cloud_cover,prev_lat,prev_lon,prev_se_x,prev_se_y,bearing,prev_bearing,bearing_change,distance_per_day
count,122014.0,122014.0,122014.0,122014.0,122014.0,122014.0,122014.0,122014.0,122014.0,122014.0,...,0.0,0.0,121841.0,121841.0,121841.0,121841.0,121841.0,121668.0,122014.0,122014.0
mean,411.762019,73.061064,-99.047021,5948.815193,5948.815193,74.111507,-100.241691,74.111503,-100.241706,inf,...,,,74.055601,-100.065536,5951.767689,5951.767689,180.020006,180.023415,22.663859,31463.322642
std,77.735218,2.572983,125.040604,7016.884808,7016.884808,1297.923625,1310.889545,1297.923627,1310.88954,,...,,,1298.36096,1311.120036,7017.371087,7017.371087,20.12566,20.124781,17.156677,16965.891589
min,120.0,67.2807,-179.9995,11.0,11.0,-17851.466044,-17984.632851,-17851.47,-17984.63,-9986628000.0,...,,,-17851.466044,-17984.632851,11.0,11.0,96.82753,96.82753,0.0,0.0
25%,363.0,71.1101,-166.715175,31.0,31.0,-140.192963,-327.366128,-140.195,-327.365,-1240705.0,...,,,-140.526205,-328.145804,31.0,31.0,166.434664,166.436526,8.996837,21496.72132
50%,411.0,72.4874,-149.2734,3659.0,3659.0,72.614608,-145.692597,72.61,-145.69,,...,,,72.61503,-145.699344,3665.0,3665.0,180.020407,180.027144,19.245749,35549.531289
75%,470.0,74.4658,-134.656575,9439.0,9439.0,290.086473,179.903156,290.0875,179.9,,...,,,290.391636,179.956853,9444.0,9444.0,193.634771,193.63602,32.655973,43757.914605
max,541.0,83.9014,179.9998,48920.0,48920.0,18979.301685,19181.051648,18979.3,19181.05,inf,...,,,18979.301685,19181.051648,48920.0,48920.0,266.982138,266.982138,127.915663,75706.323067


In [92]:
df.UniqueAnimalID.nunique()

173

In [78]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['year'] = df['timestamp'].dt.year
print(df.year.unique())
# df[df['timestamp'] == '1/7/1986']
df.head(5)

[1986 1985 1988 1990 1992 1993 1994 1999 2000 2001 2002 1998 1987 1989
 2004 2005 2006 1991 2003 2008 2007 2009 1995 2010 2012 2013 2014 2015
 2011 2016 2017]


Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,hour,year
0,1,1986-07-01 00:00:00,69.7687,-141.3759,14589,14589,0,1986
1,1,1986-07-01 06:00:00,69.7703,-141.3863,12248,12248,6,1986
2,1,1986-07-01 12:00:00,69.7718,-141.3942,9375,9375,12,1986
3,1,1986-07-01 18:00:00,69.773,-141.396,5835,5835,18,1986
4,1,1986-07-02 00:00:00,69.7738,-141.3834,2517,2517,0,1986


In [73]:
df[df.UniqueAnimalID == 1].sort_values(by='timestamp', ascending=True)

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,timestamp_dt,hour,date,year,prev_lat,prev_lon,distance_km
0,1,1986-07-01 00:00:00,69.7687,-141.3759,14589,14589,1986-07-01 00:00:00,0,182,1986,,,0.000000
1,1,1986-07-01 06:00:00,69.7703,-141.3863,12248,12248,1986-07-01 06:00:00,6,182,1986,69.7687,-141.3759,0.439410
2,1,1986-07-01 12:00:00,69.7718,-141.3942,9375,9375,1986-07-01 12:00:00,12,182,1986,69.7703,-141.3863,0.347873
3,1,1986-07-01 18:00:00,69.7730,-141.3960,5835,5835,1986-07-01 18:00:00,18,182,1986,69.7718,-141.3942,0.150830
4,1,1986-07-02 00:00:00,69.7738,-141.3834,2517,2517,1986-07-02 00:00:00,0,183,1986,69.7730,-141.3960,0.494491
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,1,1986-11-29 18:00:00,70.1560,-144.3385,18775,18775,1986-11-29 18:00:00,18,333,1986,70.1627,-144.3830,1.844559
370,1,1986-11-30 00:00:00,70.1494,-144.2940,17939,17939,1986-11-30 00:00:00,0,334,1986,70.1560,-144.3385,1.840563
371,1,1986-11-30 06:00:00,70.1428,-144.2492,16854,16854,1986-11-30 06:00:00,6,334,1986,70.1494,-144.2940,1.851488
372,1,1986-11-30 12:00:00,70.1363,-144.2040,15477,15477,1986-11-30 12:00:00,12,334,1986,70.1428,-144.2492,1.861538


In [74]:
df.head()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,timestamp_dt,hour,date,year,prev_lat,prev_lon,distance_km
0,1,1986-07-01 00:00:00,69.7687,-141.3759,14589,14589,1986-07-01 00:00:00,0,182,1986,,,0.0
1,1,1986-07-01 06:00:00,69.7703,-141.3863,12248,12248,1986-07-01 06:00:00,6,182,1986,69.7687,-141.3759,0.43941
2,1,1986-07-01 12:00:00,69.7718,-141.3942,9375,9375,1986-07-01 12:00:00,12,182,1986,69.7703,-141.3863,0.347873
3,1,1986-07-01 18:00:00,69.773,-141.396,5835,5835,1986-07-01 18:00:00,18,182,1986,69.7718,-141.3942,0.15083
4,1,1986-07-02 00:00:00,69.7738,-141.3834,2517,2517,1986-07-02 00:00:00,0,183,1986,69.773,-141.396,0.494491


In [75]:
df.tail()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,timestamp_dt,hour,date,year,prev_lat,prev_lon,distance_km
338408,541,2007-11-15 18:00:00,76.0064,-147.6617,10621,10621,2007-11-15 18:00:00,18,319,2007,75.9453,-147.2863,12.236056
338409,541,2007-11-16 00:00:00,76.0677,-148.0419,10606,10606,2007-11-16 00:00:00,0,320,2007,76.0064,-147.6617,12.319877
338410,541,2007-11-16 06:00:00,76.127,-148.4143,9888,9888,2007-11-16 06:00:00,6,320,2007,76.0677,-148.0419,11.985894
338411,541,2007-11-16 12:00:00,76.1821,-148.7634,8623,8623,2007-11-16 12:00:00,12,320,2007,76.127,-148.4143,11.174337
338412,541,2007-11-16 18:00:00,76.2298,-149.0669,7409,7409,2007-11-16 18:00:00,18,320,2007,76.1821,-148.7634,9.677647


In [37]:
df.groupby('year')['UniqueAnimalID'].unique()

year
1985                                              [3, 25]
1986    [1, 3, 19, 26, 27, 31, 42, 43, 49, 62, 64, 65,...
1987    [12, 24, 26, 43, 52, 55, 56, 58, 59, 67, 69, 7...
1988    [4, 19, 24, 27, 30, 40, 53, 55, 56, 58, 72, 73...
1989    [12, 14, 19, 24, 33, 34, 40, 46, 48, 55, 78, 7...
1990    [4, 16, 56, 81, 112, 113, 114, 115, 117, 118, ...
1991    [37, 39, 41, 51, 52, 72, 112, 115, 133, 143, 1...
1992    [5, 13, 17, 20, 22, 32, 41, 44, 50, 56, 127, 1...
1993    [5, 17, 20, 22, 32, 41, 52, 56, 127, 169, 172,...
1994                         [5, 237, 242, 249, 277, 279]
1995                                      [252, 253, 254]
1998                          [10, 41, 54, 123, 284, 292]
1999    [5, 10, 29, 41, 54, 170, 280, 283, 292, 294, 2...
2000    [5, 21, 41, 54, 60, 170, 275, 294, 297, 303, 3...
2001    [5, 56, 60, 83, 275, 294, 297, 303, 312, 314, ...
2002    [5, 54, 56, 258, 285, 303, 308, 314, 315, 316,...
2003         [56, 258, 259, 267, 270, 271, 317, 328, 329]
2004    [

# ENRICH

* ## _Día de año, hora del día_

In [69]:
df['date'] = df.timestamp.dt.dayofyear
df['hour'] = df.timestamp.dt.hour
df['year'] = df.timestamp.dt.year

* ## _Distance_per_day_

In [70]:
df = df.sort_values(['UniqueAnimalID', 'timestamp'])
df['prev_lat'] = df.groupby('UniqueAnimalID')['mu_lat'].shift(1)
df['prev_lon'] = df.groupby('UniqueAnimalID')['mu_lon'].shift(1)

df['distance_km'] = df.apply(lambda row:
        geodesic((row['prev_lat'], row['prev_lon']), (row['mu_lat'], row['mu_lon'])).kilometers
        if pd.notnull(row['prev_lat']) else 0, axis=1
)

daily_distance = df.groupby(['UniqueAnimalID', 'date'])['distance_km'].sum().reset_index()
daily_distance.rename(columns={'distance_km': 'distance_per_day'}, inplace=True)
daily_distance.head()

* ## _Distancia entre puntos_


* ## _Velocidad_


In [None]:
def calculate_distance(lat1, lon1, lat2, lon2):
    '''
        Función para calcular la distancia entre dos puntos usando la fórmula de Haversine
    '''
    # Radio de la Tierra en kilómetros
    R = 6371.0

    # Convertir de grados a radianes
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Diferencias entre las coordenadas
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Fórmula de Haversine
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Distancia en kilómetros
    distance = R * c
    return distance


In [26]:
# TODO:  # Asegúrate de que el tiempo esté en segundos

In [None]:

def calculate_velocity(distance, time_diff):
    '''
        Función para calcular la velocidad entre dos puntos
    '''
    # Velocidad = Distancia / Tiempo
   
    velocity = distance / time_diff
    return velocity


* ## _Acceleración_


In [None]:

def calculate_acceleration(velocity1, velocity2, time_diff):
    '''
        Función para calcular la aceleración
    '''
    # Aceleración = Cambio de velocidad / Tiempo
    acceleration = (velocity2 - velocity1) / time_diff
    return acceleration

# Convertir timestamps a objetos datetime
for i in range(1, len(df)):
    # Convertir las fechas a objetos datetime
    # time1 = datetime.strptime(df[i-1]['timestamp'], '%Y-%m-%d %H:%M:%S')
    # time2 = datetime.strptime(df[i]['timestamp'], '%Y-%m-%d %H:%M:%S')
    time1 = df[i-1]['timestamp']
    time2 = df[i]['timestamp']
    
    # Calcular la diferencia de tiempo en segundos
    time_delta = (time2 - time1).total_seconds()

    # Calcular la distancia entre dos puntos
    dist = calculate_distance(df[i-1]['lat'], df[i-1]['lon'], df[i]['lat'], df[i]['lon'])

    # Calcular la velocidad
    velocity = calculate_velocity(dist, time_delta)
    print(f"Velocidad entre Punto {i-1} y Punto {i}: {velocity:.2f} km/h")

    # Si hay un punto siguiente, calcular la aceleración
    if i + 1 < len(df):
        time1_next = datetime.strptime(df[i]['timestamp'], '%Y-%m-%d %H:%M:%S')
        time2_next = datetime.strptime(df[i+1]['timestamp'], '%Y-%m-%d %H:%M:%S')
        time_delta_next = (time2_next - time1_next).total_seconds()

        dist_next = calculate_distance(df[i]['lat'], df[i]['lon'], df[i+1]['lat'], df[i+1]['lon'])
        velocity_next = calculate_velocity(dist_next, time_delta_next)

        # Calcular la aceleración entre Punto i y Punto i+1
        acceleration = calculate_acceleration(velocity, velocity_next, time_delta_next)
        print(f"Aceleración entre Punto {i} y Punto {i+1}: {acceleration:.2f} km/h²")



* ## _Cambio de dirección_

* ## _Cantidad de luz diaria_

In [60]:
# Carga de efemérides de Skyfield
eph = load('de421.bsp')
ts = load.timescale()

def get_daylight_info(timestamp, lat, lon):
    '''
        Función para calcular daylight hours y eventos extremos
    '''
    date = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    t0 = ts.utc(date.year, date.month, date.day)
    t1 = ts.utc(date.year, date.month, date.day + 1)

    observer = Topos(latitude_degrees=lat, longitude_degrees=lon)
    f = sunrise_sunset(eph, observer)

    times, events = find_discrete(t0, t1, f)

    sunrise = None
    sunset = None

    for ti, event in zip(times, events):
        if event == 1:
            sunrise = ti.utc_datetime()
        elif event == 0:
            sunset = ti.utc_datetime()

    # Se evaluan los resultados
    if sunrise and sunset:
        daylight_hours = (sunset - sunrise).total_seconds() / 3600
        return daylight_hours, False, False  # normal
    elif sunrise is None and sunset is None:
        # Puede ser noche polar o sol de medianoche
        alt = eph['Earth'] + observer
        sun_alt = alt.at(t0).observe(eph['Sun']).apparent().altaz()[0].degrees
        if sun_alt > 0:
            return 24.0, False, True  # sol de medianoche
        else:
            return 0.0, True, False  # noche polar
    else:
        # Incompleto, raro pero posible
        return None, None, None


In [63]:
type(df.timestamp[0])

str

In [76]:
# Se necesita Unix timestamp
df['timestamp'] = df['timestamp'].apply(lambda x: int(x.timestamp()))

# Aplicamos la función a cada fila
results = df.apply(lambda row: get_daylight_info(row['timestamp'], row['mu_lat'], row['mu_lon']), axis=1)

# Separar los resultados en nuevas columnas
df[['daylight_hours', 'is_polar_night', 'is_midnight_sun']] = pd.DataFrame(results.tolist(), index=df.index)


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,UniqueAnimalID,timestamp,mu_lat,mu_lon,se_mu_x,se_mu_y,hour,date,year
0,1,1986-07-01 00:00:00,69.7687,-141.3759,14589,14589,0,182,1986
1,1,1986-07-01 06:00:00,69.7703,-141.3863,12248,12248,6,182,1986
2,1,1986-07-01 12:00:00,69.7718,-141.3942,9375,9375,12,182,1986
3,1,1986-07-01 18:00:00,69.773,-141.396,5835,5835,18,182,1986
4,1,1986-07-02 00:00:00,69.7738,-141.3834,2517,2517,0,183,1986


XXXX

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm

# Simulación por Monte Carlo
N_SIMULATIONS = 100 

def calculate_distance(lat1, lon1, lat2, lon2):
    '''Distancia geodésica en km'''
    return geodesic((lat1, lon1), (lat2, lon2)).km

def calculate_velocity(distance_km, time_seconds):
    if time_seconds == 0:
        return np.nan
    return distance_km / (time_seconds / 3600)  # km/h

def calculate_acceleration(v1, v2, dt_seconds):
    if dt_seconds == 0:
        return np.nan
    return (v2 - v1) / (dt_seconds / 3600)  # km/h²

def simulate_location(lat, lon, se_lat, se_lon):
    '''Simula coordenadas usando errores estándar (normal distrib)'''
    new_lat = np.random.normal(lat, se_lat)
    new_lon = np.random.normal(lon, se_lon)
    return new_lat, new_lon

def process_animal(df_animal):
    df_animal = df_animal.sort_values('timestamp').reset_index(drop=True)
    velocities = []
    accelerations = []
    velocity_errors = []
    acceleration_errors = []

    for i in range(1, len(df_animal) - 1):
        time0 = df_animal.loc[i-1, 'timestamp']
        time1 = df_animal.loc[i, 'timestamp']
        time2 = df_animal.loc[i+1, 'timestamp']
        
        dt1 = (time1 - time0).total_seconds()
        dt2 = (time2 - time1).total_seconds()

        lat0, lon0 = df_animal.loc[i-1, ['mu_lat', 'mu_lon']]
        lat1, lon1 = df_animal.loc[i, ['mu_lat', 'mu_lon']]
        lat2, lon2 = df_animal.loc[i+1, ['mu_lat', 'mu_lon']]

        # errores
        se0 = df_animal.loc[i-1, ['se_mu_x', 'se_mu_y']]
        se1 = df_animal.loc[i, ['se_mu_x', 'se_mu_y']]
        se2 = df_animal.loc[i+1, ['se_mu_x', 'se_mu_y']]

        v_sim = []
        a_sim = []

        for _ in range(N_SIMULATIONS):
            # simular coordenadas con incertidumbre
            slat0, slon0 = simulate_location(lat0, lon0, se0[0], se0[1])
            slat1, slon1 = simulate_location(lat1, lon1, se1[0], se1[1])
            slat2, slon2 = simulate_location(lat2, lon2, se2[0], se2[1])

            d1 = calculate_distance(slat0, slon0, slat1, slon1)
            d2 = calculate_distance(slat1, slon1, slat2, slon2)

            v1 = calculate_velocity(d1, dt1)
            v2 = calculate_velocity(d2, dt2)
            a = calculate_acceleration(v1, v2, dt2)

            v_sim.append(v1)
            a_sim.append(a)

        # Guardar valores medios y desviación estándar
        velocities.append(np.nanmean(v_sim))
        velocity_errors.append(np.nanstd(v_sim))
        accelerations.append(np.nanmean(a_sim))
        acceleration_errors.append(np.nanstd(a_sim))

    # Alinear con índice original (centrado)
    df_animal.loc[1:-1, 'velocity_kmh'] = velocities
    df_animal.loc[1:-1, 'velocity_error'] = velocity_errors
    df_animal.loc[1:-1, 'acceleration_kmh2'] = accelerations
    df_animal.loc[1:-1, 'acceleration_error'] = acceleration_errors

    return df_animal

# Cargar datos
df = pd.read_csv('polarbears.csv', parse_dates=['timestamp'])

# Inicializar columnas
df['velocity_kmh'] = np.nan
df['acceleration_kmh2'] = np.nan
df['velocity_error'] = np.nan
df['acceleration_error'] = np.nan

# Aplicar por animal
df_processed = df.groupby('UniqueAnimalID', group_keys=False).apply(process_animal)


....
