In [1]:
# type: ignore

from PIL import Image
import requests
import numpy as np
from PIL import Image

import pandas as pd 
import geopandas as gpd
# import osmnx as ox
# import networkx as nx

import pyproj
from shapely.ops import transform

import shapely
from shapely import Point

# import boto3
import os
import django

# Display floating point values in DataFrame output with two decimals.
pd.set_option('display.precision', 2)

# Point Django at the project settings (only if the variable is not already
# set in the environment) before initialising it.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "water.settings")
# The notebook kernel runs inside an event loop; this flag lets the synchronous
# ORM calls below run from that context.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# Must run before the model imports that follow.
django.setup()

from water.models import ReservoirState, Reservoir, ReservoirStateSerializer, RainFall
from django.db.models import Count

from water.utils import parse as p

In [5]:
# Parsed reservoir dataset, ordered by province, reservoir and date stamp.
filename_all = '../../../../data/datasets/all_parsed_cleaned.csv'
df_all_raw = pd.read_csv(filename_all)
df_all_raw = df_all_raw.sort_values(by=['province', 'reservoir', 'ds'])
name_df = df_all_raw['reservoir'].unique()

# GeoPackage with the reservoir geometries and their official names.
file = '../../../../data/reservoirs/InfGeografica/InfVectorial/Gpkg/Inv_presas_embalses.gpkg'
gdf_raw = gpd.read_file(file)

# Only features that actually carry a name can take part in the matching.
has_name = gdf_raw['nombre'].notnull()
name_geo = gdf_raw.loc[has_name, 'nombre'].unique()

# Leading words removed from the official names. Order matters: the most
# specific prefix comes first because 'embalse ' would match all of them.
replace_vals = ['embalse del ', 'embalse de ', 'embalse ']

def clean_name(name):
    """Normalise an official reservoir name so it can be compared with dataset names.

    The dataset names are lower-case ASCII and spell "ñ" as "nn"
    (e.g. 'la vinnuela'), so the official name is:

    1. lower-cased and stripped of surrounding whitespace,
    2. transliterated ("ñ" -> "nn", remaining accents dropped),
    3. stripped of ONE leading prefix from ``replace_vals``.

    Only a leading prefix is removed; the same words appearing in the middle
    of a name (e.g. "Contraembalse de ...") are left untouched.

    Args:
        name: Official reservoir name, e.g. 'Embalse del Celemín'.

    Returns:
        The normalised name, e.g. 'celemin'.
    """
    import unicodedata

    # Compose first so that "n" + combining tilde is seen as a single "ñ".
    text = unicodedata.normalize('NFC', name).lower().strip()
    text = text.replace('ñ', 'nn')
    # Decompose and drop the combining marks: "í" -> "i", "ó" -> "o", ...
    text = ''.join(
        ch for ch in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(ch)
    )
    for val in replace_vals:
        if text.startswith(val):
            return text[len(val):]
    return text

# Official name -> cleaned name used for the fuzzy comparison.
names_geo_dict = dict(zip(name_geo, map(clean_name, name_geo)))

import difflib

# Sample ndiff call on two short strings; ndiff is lazy, so nothing is
# computed until the generator is iterated.
diff_obs = difflib.ndiff('ab', 'cd')

# Summarize the differences
# Show number of changes
def diffnum_difflib(a,b):
    """Count the entries of ``difflib.ndiff(a, b)`` that are not marked as common.

    ndiff prefixes every entry with a two-character code; entries starting
    with a space are present in both inputs, everything else is a change.

    Args:
        a: First sequence of strings (a plain string is compared per character).
        b: Second sequence of strings.

    Returns:
        Number of non-common entries; 0 when both inputs are identical.
    """
    return sum(1 for entry in difflib.ndiff(a, b) if not entry.startswith(' '))

# For every dataset reservoir name, find the cleaned official name with the
# smallest number of character-level differences.
geo_clean = list(names_geo_dict.values())

# Collect one record per dataset name and build the frame in one go. Writing
# through df['col'].iloc[i] is chained assignment, which pandas may apply to a
# temporary copy instead of the frame itself.
match_records = []
for name in name_df:
    diffs = [diffnum_difflib(name, candidate) for candidate in geo_clean]
    best = min(diffs)
    match_records.append({
        'name_df': name,
        # On ties the first candidate with the lowest score wins.
        'name_geo': geo_clean[diffs.index(best)],
        'diff': best,
    })

df_matches = pd.DataFrame(match_records, columns=['name_df', 'name_geo', 'diff'])

# Recover the official spelling from the cleaned name.
# NOTE(review): if two official names clean to the same string only the last
# one survives in this reverse lookup.
clean_to_full = {clean: full for full, clean in names_geo_dict.items()}
df_matches['name_geo_full'] = df_matches['name_geo'].map(clean_to_full)

# Official name -> dataset name.
# NOTE(review): every dataset name gets its closest candidate no matter how
# large 'diff' is, and when several dataset names pick the same official name
# only the last one is kept here -- inspect df_matches before trusting this.
dict_matches = df_matches.set_index('name_geo_full')['name_df'].to_dict()
dict_matches

{'Embalse de Benínar': 'beninar',
 'Embalse de Cuevas de Almanzora': 'cuevas de almanzora',
 'Embalse de Almodóvar': 'almodovar',
 'Embalse de Arcos': 'arcos',
 'Embalse de Barbate': 'barbate',
 'Embalse de Bornos': 'bornos',
 'Embalse del Celemín': 'celemin',
 'Embalse de Charco Redondo': 'charco redondo',
 'Embalse de Guadalcacín': 'guadalcacin',
 'Embalse de Guadarranque': 'guadarranque',
 'Embalse de los Hurones': 'los hurones',
 'Embalse de Zahariche': 'zahara',
 'Embalse de Zahara - El Gastor': 'zahara - el gastor',
 'Embalse del Arenoso': 'arenoso',
 'Embalse del Bembézar': 'bembezar',
 'Embalse de Guadalmellato': 'guadalmellato',
 'Embalse de Guadalén': 'guadanunno',
 'Embalse de Iznájar': 'iznajar',
 'Embalse de la Feda': 'la brenna',
 'Embalse de Martín Gonzalo': 'martin gonzalo',
 'Embalse de Puente Nuevo': 'puente nuevo',
 'Embalse del Retortillo': 'retortillo',
 'Embalse de San Rafael de Navallana': 'san rafael de navallana',
 'Embalse de Sierra Boyera': 'sierra boyera',
 

In [2]:
# Read the parsed dataset and order it by province, reservoir and date stamp.
filename_all = '../../../../data/datasets/all_parsed_cleaned.csv'
df_all_raw = pd.read_csv(filename_all)
df_all_raw = df_all_raw.sort_values(by=['province', 'reservoir', 'ds'])

In [3]:
# Per (province, reservoir): the capacity in the last row of the group and the
# number of distinct capacity values recorded.
capacities = (
    df_all_raw
    .groupby(['province', 'reservoir'])['capacity_hm3']
    .agg(['last', 'nunique'])
    .reset_index()
)

# Distinct capacity values recorded for one reservoir.
df_all_raw.loc[df_all_raw['reservoir'] == 'el limonero', 'capacity_hm3'].unique()

array([24.7, 22.3])

In [16]:
# Provinces present in the dataset.
df_all_raw['province'].unique()

array(['almeria', 'cadiz', 'cordoba', 'granada', 'huelva', 'jaen',
       'malaga', 'sevilla'], dtype=object)

In [4]:
# Selection parameters for the data loaded into the database.
ds_start = "2012-09-01"   # earliest date stamp kept
province = "malaga"       # province whose reservoirs are loaded
num_states = 10000        # upper bound on rows written per table
num_res_max = 5           # number of reservoirs kept (largest capacity first)

# Capacity in the last row of each (province, reservoir) group plus the number
# of distinct capacities seen, restricted to the chosen province and to the
# num_res_max largest reservoirs.
capacities = (
    df_all_raw
    .groupby(['province', 'reservoir'])['capacity_hm3']
    .agg(['last', 'nunique'])
    .reset_index()
)
capacities = capacities[capacities.province == province].copy()
capacities = capacities.sort_values('last', ascending=False).head(num_res_max)

# Rows of the selected reservoirs from ds_start onwards. The province is part
# of the filter because reservoirs are keyed by (province, reservoir) above; a
# name-only filter would also pull in a same-named reservoir elsewhere.
in_selection = (
    (df_all_raw.province == province)
    & df_all_raw.reservoir.isin(capacities.reservoir.unique())
    & (df_all_raw.ds >= ds_start)
)
df_selected = df_all_raw[in_selection].copy()

capacities

Unnamed: 0,province,reservoir,last,nunique
63,malaga,la vinnuela,165.4,1
61,malaga,guadalteba,153.3,1
60,malaga,guadalhorce,125.7,1
58,malaga,conde de guadalhorce,66.5,1
62,malaga,la concepcion,61.9,1


In [7]:
# Invert the match table: dataset reservoir name -> official (full) name.
name_to_full = dict(zip(dict_matches.values(), dict_matches.keys()))
name_to_full

{'beninar': 'Embalse de Benínar',
 'cuevas de almanzora': 'Embalse de Cuevas de Almanzora',
 'almodovar': 'Embalse de Almodóvar',
 'arcos': 'Embalse de Arcos',
 'barbate': 'Embalse de Barbate',
 'bornos': 'Embalse de Bornos',
 'celemin': 'Embalse del Celemín',
 'charco redondo': 'Embalse de Charco Redondo',
 'guadalcacin': 'Embalse de Guadalcacín',
 'guadarranque': 'Embalse de Guadarranque',
 'los hurones': 'Embalse de los Hurones',
 'zahara': 'Embalse de Zahariche',
 'zahara - el gastor': 'Embalse de Zahara - El Gastor',
 'arenoso': 'Embalse del Arenoso',
 'bembezar': 'Embalse del Bembézar',
 'guadalmellato': 'Embalse de Guadalmellato',
 'guadanunno': 'Embalse de Guadalén',
 'iznajar': 'Embalse de Iznájar',
 'la brenna': 'Embalse de la Feda',
 'martin gonzalo': 'Embalse de Martín Gonzalo',
 'puente nuevo': 'Embalse de Puente Nuevo',
 'retortillo': 'Embalse del Retortillo',
 'san rafael de navallana': 'Embalse de San Rafael de Navallana',
 'sierra boyera': 'Embalse de Sierra Boyera',
 

In [8]:
from django.db import transaction

# Fail before touching the database if a selected reservoir has no official
# name, instead of hitting a bare KeyError halfway through the load.
missing_names = [name for name in capacities['reservoir'] if name not in name_to_full]
if missing_names:
    raise KeyError(f"No full name found for reservoirs: {missing_names}")

print(len(df_selected))

# Wipe and reload inside one transaction so a failure part-way through does
# not leave the tables empty or half-filled.
with transaction.atomic():
    ReservoirState.objects.all().delete()
    Reservoir.objects.all().delete()

    # objects.create() already saves the instance, so no extra save() is needed.
    # Keep the created objects by name to avoid one lookup query per state row.
    reservoirs_by_name = {}
    for _, row in capacities.iterrows():
        reservoirs_by_name[row['reservoir']] = Reservoir.objects.create(
            name=row['reservoir'],
            name_full=name_to_full[row['reservoir']],
            province=row['province'],
            capacity=row['last'],
        )

    # At most num_states rows are written.
    for _, row in df_selected.head(num_states).iterrows():
        ReservoirState.objects.create(
            reservoir=reservoirs_by_name[row['reservoir']],
            date=row['ds'],
            volume=row['stored_hm3'],
        )

ReservoirState.objects.all().count()

2859


2859

In [11]:
# Spot check: province stored on the first Reservoir returned by the query.
Reservoir.objects.all()[0].province

'malaga'

In [12]:
from django.db import transaction

df_all = p.add_cols(df_selected)

# Days elapsed since the previous observation of the same reservoir.
group_keys = ['province', 'reservoir']
df_all['date_lag'] = df_all.groupby(group_keys)['date'].shift(1)
df_all['date_diff'] = (df_all.date - df_all.date_lag).dt.days

cols = ['rainfallsince', 'stored_hm3', 'capacity_hm3']  # not used in this cell
# Row-to-row change of each variable within a reservoir, plus that change
# shifted by 1..9 rows (suffix _0 is the unshifted change).
for var in ['rainfallsince', 'stored_hm3']:
    df_all[f'{var}_diff'] = df_all.groupby(group_keys)[var].diff()
    df_all[f'{var}_diff_0'] = df_all[f'{var}_diff']
    for lags in range(1, 10):
        df_all[f'{var}_diff_{lags}'] = df_all.groupby(group_keys)[f'{var}_diff'].shift(lags)

# Reservoirs that have at least one stored state.
reservoirs = Reservoir.objects.annotate(
        num_states=Count("reservoir_reservoirstate")
    ).filter(num_states__gt=0)

reservoir_names = reservoirs.values_list('name', flat=True)
# Fetch the reservoirs once instead of issuing one query per rainfall row.
reservoirs_by_name = {reservoir.name: reservoir for reservoir in reservoirs}

# Replace all rainfall rows in one transaction so a failure part-way through
# does not leave the table empty or half-filled.
with transaction.atomic():
    RainFall.objects.all().delete()

    # NOTE(review): the first row of every reservoir has NaN in
    # 'rainfallsince_diff' (nothing to diff against); confirm that
    # RainFall.amount is meant to receive it.
    for _, row in df_all[df_all.reservoir.isin(reservoir_names)].head(num_states).iterrows():
        RainFall.objects.create(
            date=row['ds'],
            reservoir=reservoirs_by_name[row['reservoir']],
            amount=row['rainfallsince_diff'],
            amount_cumulative=row['rainfallsince'],
            amount_cumulative_historical=row['avgrainfall1971_2000'],
        )

len(df_all), RainFall.objects.all().count()

(2859, 2859)

In [13]:
# Number of rainfall rows and reservoir-state rows currently stored.
RainFall.objects.count(), ReservoirState.objects.count()

(2859, 2859)

# Reservoir Map

In [14]:
# Fetch the (name, uuid) pair of every reservoir in the database.
reservoirs = Reservoir.objects.all().values_list('name', 'uuid')
# Reservoir name -> uuid rendered as a string.
reservoirs_dict = dict((res_name, str(res_uuid)) for res_name, res_uuid in reservoirs)
reservoirs_dict

{'la vinnuela': 'a3122a79-764d-4ef3-87b4-5dda3ff8c033',
 'guadalteba': '64afab63-8e11-4512-aedc-37278da2d704',
 'guadalhorce': '530d3cd9-a6c2-4a46-8f10-7d868332015a',
 'conde de guadalhorce': '222a934c-b6b4-41bc-8b40-0812344d3f59',
 'la concepcion': '28bb8e8b-3dfc-47ed-bdec-87ae8fc0bb9d'}

In [15]:
filename = '../../../frontend/src/data/reservoirs.json'

# Geometries whose official name was matched to a dataset reservoir.
is_matched = gdf_raw['nombre'].isin(df_matches['name_geo_full'])
gdf = gdf_raw.loc[is_matched].copy()

# Attach the dataset name and, where the reservoir exists in the database,
# its uuid (None otherwise).
gdf['name_data'] = list(map(dict_matches.__getitem__, gdf['nombre']))
gdf['reservoir_uuid'] = list(map(reservoirs_dict.get, gdf['name_data']))

# Simplify before reprojecting, so the tolerance of 100 is expressed in the
# units of the source CRS.
gdf.geometry = gdf.geometry.simplify(100)
# Reproject to geographic coordinates and keep only reservoirs with a uuid.
gdf = (
    gdf
    .to_crs("EPSG:4326")
    .loc[lambda frame: frame['reservoir_uuid'].notnull()]
    .copy()
)

gdf.to_file(filename, driver='GeoJSON')
gdf.iloc[0]

cod_emb                                                         8.0
nombre                             Embalse del Conde de Guadalhorce
cod_est                                                         S31
cod_roea                                                        n/e
idemb_cbrh                                                4009302.0
cod_massup                                                     None
fuente_nom        Mapa Topográfico de Andalucía 1:10000. Restitu...
area                                                      4699644.0
perimetro                                                   30445.0
fecha_alta                                                     2010
fecha_baja                                                        0
geometry          POLYGON ((-4.839423060945539 36.89797341929856...
name_data                                      conde de guadalhorce
reservoir_uuid                 222a934c-b6b4-41bc-8b40-0812344d3f59
Name: 7, dtype: object