In [50]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [51]:
data_folder = f"../data"

sample_filename = "muestreos_parcelas.parquet"

df_samples = pd.read_parquet(f"{data_folder}/{sample_filename}")
#df_samples.head(20).to_csv("sample_parcelas.txt", sep="\t")

df_samples["fecha"] = pd.to_datetime(df_samples["fecha"])
df_samples.sort_values(by="fecha", inplace=True)

df_samples['año'] = df_samples['fecha'].dt.year

df_samples.info()

<class 'pandas.core.frame.DataFrame'>
Index: 581793 entries, 119217 to 541761
Data columns (total 62 columns):
 #   Column                                              Non-Null Count   Dtype         
---  ------                                              --------------   -----         
 0   generated_muestreos                                 581793 non-null  datetime64[us]
 1   codparcela                                          581793 non-null  category      
 2   provincia                                           581793 non-null  string        
 3   municipio                                           581793 non-null  string        
 4   fecha                                               581793 non-null  datetime64[us]
 5   campaña                                             581793 non-null  uint16        
 6   poligono                                            581731 non-null  UInt16        
 7   parcela                                             581763 non-null  UInt16        

In [52]:
import utm


def utm_to_latlon(row):
    try:
        lat, lon = utm.to_latlon(row["102_coordenada_x_(utm)"], row["103_coordenada_y_(utm)"], 30, 'S')
    except:
        lat, lon = -9999999, -9999999

    return pd.Series([lat, lon], index=['lat', 'lon'])

In [3]:
# Adding the number of days till next visit for each row
df_samples["next_date"] = df_samples.groupby("codparcela", observed=True)["fecha"].shift(-1)
df_samples["days_until_next_visit"] = (df_samples["next_date"] - df_samples["fecha"]).dt.days


# # Removing the parcels with only one entry and the last entry for every parcel (we would need to also add the y value of the next entry)

df_samples = df_samples.dropna(subset=["days_until_next_visit"])  # 5150 entries removed
df_samples["days_until_next_visit"] = df_samples["days_until_next_visit"].astype("int32")


In [39]:
# Realizar el groupby y las agregaciones
group_dates = df_samples.groupby('codparcela').agg({'fecha': ['min', 'max', 'count']}).reset_index()

# Renombrar las columnas resultantes
group_dates.columns = ['codparcela', 'fecha_primera_muestra', 'fecha_ultima_muestra', 'n_muestras']

# Mostrar los resultados
group_dates.sort_values(by="n_muestras", ascending=False, inplace=True)

  group_dates = df_samples.groupby('codparcela').agg({'fecha': ['min', 'max', 'count']}).reset_index()


In [40]:
#group_dates = df_samples.groupby(["codparcela"])["fecha"].agg([('min_fecha', 'min'), ('max_fecha', 'max')]).reset_index()
#group_max_diff_days = df_samples.groupby(["codparcela", "año", "days_until_next_visit"])["days_until_next_visit"].agg([('max_days_until_next_visit', 'max')]).reset_index()

#group_coords = df_samples.groupby("codparcela")[["municipio", "102_coordenada_x_(utm)", "103_coordenada_y_(utm)", "huso_etrs89_regcan95"]].apply("first").reset_index()
group_coords = df_samples.groupby('codparcela').agg({'municipio': 'first', 
                                      '102_coordenada_x_(utm)': 'first', 
                                      '103_coordenada_y_(utm)': 'first'}).reset_index()


  group_coords = df_samples.groupby('codparcela').agg({'municipio': 'first',


In [41]:
group_dates.head()

Unnamed: 0,codparcela,fecha_primera_muestra,fecha_ultima_muestra,n_muestras
2598,015-00002-00-00,2006-03-08,2021-03-16,927
4488,043-00008-00-00,2006-04-10,2017-11-13,814
2956,017-00204-00-00,2006-05-10,2019-09-03,757
1895,010-00016-00-00,2006-03-06,2017-07-18,725
1312,007-00018-01-01,2010-03-08,2021-03-24,706


In [42]:
df_parcelas  = pd.merge(group_dates, group_coords, on='codparcela', how='inner')
df_parcelas.head()

Unnamed: 0,codparcela,fecha_primera_muestra,fecha_ultima_muestra,n_muestras,municipio,102_coordenada_x_(utm),103_coordenada_y_(utm)
0,015-00002-00-00,2006-03-08,2021-03-16,927,torredonjimeno,413322.0,4184311.0
1,043-00008-00-00,2006-04-10,2017-11-13,814,olvera,295019.0,4091226.0
2,017-00204-00-00,2006-05-10,2019-09-03,757,sabiote,472587.0,4211647.0
3,010-00016-00-00,2006-03-06,2017-07-18,725,canena,455318.0,4213752.0
4,007-00018-01-01,2010-03-08,2021-03-24,706,adamuz,371625.0,4211704.0


In [43]:
import utm


def utm_to_latlon(row):
    try:
        lat, lon = utm.to_latlon(row["102_coordenada_x_(utm)"], row["103_coordenada_y_(utm)"], 30, 'S')
    except:
        lat, lon = -9999999, -9999999

    return pd.Series([lat, lon], index=['lat', 'lon'])


df_parcelas[['lat', 'lon']] = df_parcelas.apply(utm_to_latlon, axis=1)

df_parcelas[(df_parcelas["lat"] < 40) &
               (df_parcelas["lat"] > 35) &
               (df_parcelas["lat"] > -10) &
               (df_parcelas["lat"] < 0)]

df_parcelas["n_muestras_cumsum"] = df_parcelas["n_muestras"].cumsum()

df_parcelas.head()

Unnamed: 0,codparcela,fecha_primera_muestra,fecha_ultima_muestra,n_muestras,municipio,102_coordenada_x_(utm),103_coordenada_y_(utm),lat,lon,n_muestras_cumsum
0,015-00002-00-00,2006-03-08,2021-03-16,927,torredonjimeno,413322.0,4184311.0,37.802069,-3.984598,927
1,043-00008-00-00,2006-04-10,2017-11-13,814,olvera,295019.0,4091226.0,36.944813,-5.301941,1741
2,017-00204-00-00,2006-05-10,2019-09-03,757,sabiote,472587.0,4211647.0,38.052148,-3.312451,2498
3,010-00016-00-00,2006-03-06,2017-07-18,725,canena,455318.0,4213752.0,38.070431,-3.509408,3223
4,007-00018-01-01,2010-03-08,2021-03-24,706,adamuz,371625.0,4211704.0,38.043972,-4.46301,3929


In [26]:
df_parcelas.to_csv("parcelas_download.txt", sep="\t", index=None)

In [9]:
df_parcelas = df_parcelas[df_parcelas["n_muestras_cumsum"]>578377]

In [10]:
df_parcelas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578 entries, 4661 to 5238
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   codparcela              578 non-null    category      
 1   fecha_primera_muestra   578 non-null    datetime64[us]
 2   fecha_ultima_muestra    578 non-null    datetime64[us]
 3   n_muestras              578 non-null    int64         
 4   municipio               578 non-null    object        
 5   102_coordenada_x_(utm)  436 non-null    float64       
 6   103_coordenada_y_(utm)  436 non-null    float64       
 7   huso_etrs89_regcan95    578 non-null    int64         
 8   lat                     578 non-null    float64       
 9   lon                     578 non-null    float64       
 10  n_muestras_cumsum       578 non-null    int64         
dtypes: category(1), datetime64[us](2), float64(4), int64(3), object(1)
memory usage: 220.8+ KB


In [26]:
df_parcelas.head()

Unnamed: 0,codparcela,fecha_primera_muestra,fecha_ultima_muestra,n_muestras,municipio,102_coordenada_x_(utm),103_coordenada_y_(utm),lat,lon,n_muestras_cumsum
0,015-00002-00-00,2006-03-08,2021-03-16,927,torredonjimeno,413322.0,4184311.0,37.802069,-3.984598,927
1,043-00008-00-00,2006-04-10,2017-11-13,814,olvera,295019.0,4091226.0,36.944813,-5.301941,1741
2,017-00204-00-00,2006-05-10,2019-09-03,757,sabiote,472587.0,4211647.0,38.052148,-3.312451,2498
3,010-00016-00-00,2006-03-06,2017-07-18,725,canena,455318.0,4213752.0,38.070431,-3.509408,3223
4,007-00018-01-01,2010-03-08,2021-03-24,706,adamuz,371625.0,4211704.0,38.043972,-4.46301,3929


In [12]:
from datetime import datetime, date
import requests
import matplotlib.pyplot as plt
import numpy as np
import time
import logging

logger = logging.getLogger(__name__)
logger.level = logging.INFO

def request_with_cooloff(api_url: str, payload: str, num_attempts: int):
    cooloff = 1

    for call_count in range(cooloff):
        try:
            response = requests.get(api_url, params=payload)
            response.raise_for_status()
            break

        except requests.exceptions.ConnectionError as e:
            logger.info("API refused the connection")
            logger.warning(e)
            if call_count != (num_attempts - 1):
                time.sleep(cooloff)
                cooloff *= 2
                call_count += 1
                continue
            else:
                raise

        except requests.exceptions.HTTPError as e:
            logger.warning(e)
            if response.status_code == 404:
                raise

            logger.info(f"API return code {response.status_code} cooloff at {call_count}")
            if call_count != (num_attempts - 1):
                time.sleep(cooloff)
                cooloff *= 2
                call_count += 1
                continue
            else:
                raise

    # We got through the loop without error so we've received a valid response
    return response

def getTimeseriesForPoint(covId:str, 
                          tsvBaseURL:str, 
                          start:datetime, 
                          end:datetime, 
                          lat:float, 
                          lon:float):
    
    tsURL = tsvBaseURL + covId + '/point'
    payload = {
        'lon': str(lon),
        'lat': str(lat),
        'startDate': start.strftime('%Y-%m-%d'),
        'endDate': end.strftime('%Y-%m-%d')
    }

    # Introduce a cooldown of 1 second between API calls
    response = request_with_cooloff(api_url=tsURL, payload=payload, num_attempts=15)
    timeseries = response.json()['results']
    
    return timeseries

In [44]:
def download_variable_parcela(variable:str, 
                              start_time:datetime,
                              end_time:datetime,
                              latitude:float,
                              longitude:float):
    
    raw_data = getTimeseriesForPoint(covId=variable,
                            tsvBaseURL='https://services.terrascope.be/timeseries/v1.0/ts/',
                                start=start_time,
                                end=end_time,
                                lat=latitude,
                                lon=longitude,
                                printURL=False)
    
    df = pd.DataFrame()
    n_points = len(raw_data)
    dates = [raw_data[i]["date"] for i in range(n_points)]
    values = [raw_data[i]["result"]["average"] for i in range(n_points)]

    df["date"] = dates
    df["value"] = values
    
    return df

In [47]:
for i, row in df_parcelas.iterrows():


    id_parcela = row["codparcela"]
    start_time = datetime.strptime(row["fecha_primera_muestra"].strftime("%d-%m-%Y"), "%d-%m-%Y")
    end_time =datetime.strptime(row["fecha_ultima_muestra"].strftime("%d-%m-%Y"), "%d-%m-%Y")

    start_time_document = start_time.strftime("%Y_%m_%d")
    end_time_document = end_time.strftime("%Y_%m_%d")

    lat_parcela = row["lat"]
    lon_parcela = row["lon"]

    cumsum = row["n_muestras_cumsum"]
    n_samples = row["n_muestras"] 
    

    variables_list = ["BIOPAR_FAPAR_V2_GLOBAL", "BIOPAR_NDVI_V2_GLOBAL"]


    for variable in variables_list:

        df_variable = download_variable_parcela(variable, 
                              start_time,
                              end_time,
                              lat_parcela,
                              lon_parcela)

        filename = f"{id_parcela}_{variable}_{start_time_document}_{end_time_document}.txt".replace("/", "")

        df_variable.to_csv(f"{data_folder}/datos_parcela_test/{filename}", sep="\t")

    print(f"Parcela:{id_parcela}, cumsum samples={cumsum}")


Parcela:015-00002-00-00, cumsum samples=927


KeyboardInterrupt: 