## Dati Satellite SRTM (dem)

In [1]:
from path import Path
import arrow
import json
import pytz
from pprint import pprint
from tqdm.notebook import tqdm
import re, os, collections, itertools, uuid, logging
import tempfile
import shapely

import zipfile
import urllib

import ee
import pyproj
import numpy as np
import scipy as sp
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns

plt.rcParams['figure.figsize'] = (15, 5) # bigger plots
plt.style.use('fivethirtyeight')
%matplotlib inline
%precision 4

'%.4f'

In [2]:
helper_dir = str(Path('..').abspath())
if helper_dir not in os.sys.path:
    os.sys.path.append(helper_dir)
    
from leak_helpers.earth_engine import display_ee, get_boundary, tifs2np, bands_srtm, download_image

In [3]:
# # Non voglio stampare i Warning
# import warnings
# warnings.filterwarnings("ignore")

# Load leaks

Load the leaks from a geojson file and make sure they have unique fields reportdate and workorderid (see asserts below)

In [3]:
# load 
root = "../../../ricerca_perdite"
leaks = gpd.read_file(root+"/data/leak_dataset/leaks.geojson")

leaks_datas = [leaks]

leaks_datas

[         id  anno civico      comune  comune_loc  \
 0         0  2015      0   PODENZANA   PODENZANA   
 1         1  2015      1     TRESANA     TRESANA   
 2         3  2015     00  pontremoli  PONTREMOLI   
 3         9  2016         pontremoli  PONTREMOLI   
 4        44  2021   None  PONTREMOLI  PONTREMOLI   
 ...     ...   ...    ...         ...         ...   
 1708  34754  2021     51    CAMAIORE    CAMAIORE   
 1709  34760  2021   None    CAMAIORE    CAMAIORE   
 1710  34873  2021   None   MASSAROSA   MASSAROSA   
 1711  34881  2021     24  FILATTIERA  FILATTIERA   
 1712  35025  2022   None   MASSAROSA   MASSAROSA   
 
                                             description  diametro  \
 0       Perdita acqua dalla rete montedivalli podenzana     110.0   
 1     perdita su adduttrice serbatoio giovagallo ese...      90.0   
 2       rotture idriche addutrice serbatoio di s. marco     125.0   
 3                                      PERDITA STRADALE     110.0   
 4     Manca

In [4]:
# join them all, with primary columns and random metadata
primary_cols = ['workorderid','reportdate','geometry']
leaks = gpd.GeoDataFrame(pd.concat([leaks_data[primary_cols] for leaks_data in leaks_datas]), crs='epsg:4326')
leaks['metadata'] = np.concatenate([leaks_data.drop(primary_cols,1).to_dict('records') for leaks_data in leaks_datas])
leaks.index = leaks.workorderid
leaks

  leaks['metadata'] = np.concatenate([leaks_data.drop(primary_cols,1).to_dict('records') for leaks_data in leaks_datas])


Unnamed: 0_level_0,workorderid,reportdate,geometry,metadata
workorderid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
434549,434549,2015-05-15T16:14:00+00:00,POINT (9.85871 44.19513),"{'id': '0', 'anno': 2015, 'civico': '0', 'comu..."
449895,449895,2015-05-30T17:14:00+00:00,POINT (9.87618 44.23608),"{'id': '1', 'anno': 2015, 'civico': '1', 'comu..."
526117,526117,2015-09-08T06:34:01.403000+00:00,POINT (9.84730 44.38188),"{'id': '3', 'anno': 2015, 'civico': '00', 'com..."
815762,815762,2016-09-05T11:59:11.637000+00:00,POINT (9.86374 44.36673),"{'id': '9', 'anno': 2016, 'civico': '', 'comun..."
2621776,2621776,2021-07-17T05:44:00+00:00,POINT (9.84420 44.38554),"{'id': '44', 'anno': 2021, 'civico': None, 'co..."
...,...,...,...,...
2621532,2621532,2021-07-16T05:40:11.870000+00:00,POINT (10.30386 43.93762),"{'id': '34754', 'anno': 2021, 'civico': '51', ..."
2626643,2626643,2021-07-19T04:50:00+00:00,POINT (10.30334 43.94759),"{'id': '34760', 'anno': 2021, 'civico': None, ..."
2674832,2674832,2021-09-04T15:13:31.657000+00:00,POINT (10.29917 43.88345),"{'id': '34873', 'anno': 2021, 'civico': None, ..."
2679778,2679778,2021-09-07T09:46:49.459999+00:00,POINT (9.93401 44.33066),"{'id': '34881', 'anno': 2021, 'civico': '24', ..."


## Params

Customise the values in the cell below

In [5]:
# params
bands = bands_srtm
satellite = 'USGS/SRTMGL1_003'
resolution_min = 10.0 # m

# since the lowest res band is 60m and I want to capture neighbours I should get 6+ pixels
pixel_length = 25.0

# you need to tweak this until you pass the "Test the distance need to get your rectangle" cell
fudge_distance_factor = -0.5

## Init

In [6]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "notebook_name = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [7]:
#notebook_name='scraping_earth_engine_srtm'
notebook_name

'scraping_earth_engine_srtm'

In [8]:
# constant params, probobly don't change
crs_grid = 3857 # keep this as auxilary sphere, this is the CRS the downloaded images will be in

# init
## init directories
ts=arrow.utcnow().format('YYYYMMDD-HH-mm-ss')
temp_dir = Path(root+'/data/scraped_satellite_images/'+satellite.replace("/","_")+'/tmp/')
output_dir = Path(root+'/data/scraped_satellite_images/'+satellite.replace("/","_"))
cache_dir = Path(output_dir+'/cache')
output_dir.makedirs_p()
temp_dir.makedirs_p()
cache_dir.makedirs_p()

## init logger
logger = logging.getLogger(notebook_name)
# logger.setLevel(logging.WARN)

temp_dir, output_dir, cache_dir

(Path('../../../ricerca_perdite/data/scraped_satellite_images/USGS_SRTMGL1_003/tmp/'),
 Path('../../../ricerca_perdite/data/scraped_satellite_images/USGS_SRTMGL1_003'),
 Path('../../../ricerca_perdite/data/scraped_satellite_images/USGS_SRTMGL1_003/cache'))

In [9]:
# record cofig in a json file
metadata = dict(
    notebook_name=notebook_name,
    satellite=satellite,
    pixel_length=pixel_length,
    resolution_min=resolution_min,
    bands=bands,
    ts=ts,
    crs_grid=crs_grid,
    cache_dir=str(cache_dir),
    temp_dir=str(temp_dir),
    output_dir=str(output_dir),
)
metadata_file = output_dir.joinpath('script_metadata.json')
json.dump(metadata, open(metadata_file,'w'))

# Earth Engine

In [10]:
# test earth-engine setup
from oauth2client import crypt # should have not error
import ee
ee.Initialize() # should give no errors, if so follow instructions

# test
image = ee.Image(satellite)

info = image.getInfo()
info

{'type': 'Image',
 'bands': [{'id': 'elevation',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': -32768,
    'max': 32767},
   'dimensions': [1296001, 417601],
   'crs': 'EPSG:4326',
   'crs_transform': [0.0003, 0, -180.0001, 0, -0.0003, 60.0001]}],
 'id': 'USGS/SRTMGL1_003',
 'version': 1641990767055141,
 'properties': {'system:visualization_0_min': '0.0',
  'type_name': 'Image',
  'keywords': ['dem',
   'elevation',
   'geophysical',
   'nasa',
   'srtm',
   'topography',
   'usgs'],
  'thumb': 'https://mw1.google.com/ges/dd/images/SRTM90_V4_thumb.png',
  'description': '<p>The Shuttle Radar Topography Mission (SRTM, see <a href="https://onlinelibrary.wiley.com/doi/10.1029/2005RG000183/full">Farr\net al. 2007</a>)\ndigital elevation data is an international research effort that\nobtained digital elevation models on a near-global scale. This\nSRTM V3 product (SRTM Plus) is provided by NASA JPL\nat a resolution of 1 arc-second (approximately 30m).</p><p>This da

In [14]:
# leak = leaks.iloc[[0]]
# boundary = get_boundary(leak, distance=distance)

# image = ee.Image(satellite)
# elevation = image.clip(boundary)
# slope = ee.Terrain.slope(image).clip(boundary)


# # download_image(
# #         slope, 
# #         scale=resolution_min, 
# #         crs=crs_grid, 
# #         name='prova',
# #         cache_dir=cache_dir
# #     )

# elevation.getInfo(), slope.getInfo()

# Fetching images

In [11]:
import dataset
cache_file = 'sqlite:///{}'.format(cache_dir.dirname().joinpath('cache.db'))
db = dataset.connect(cache_file)
cache_table = db.get_table('cached_ids', primary_id='workorderid')

def get_cached_ids():
    return set(row['workorderid'] for row in cache_table.distinct('workorderid'))

def init_cache(workorderid):
    """We will cache downloads in folders like 'id_after'"""
    if workorderid:
        try:
            cache_table.insert(dict(workorderid=workorderid))
        except:
            db.rollback()
        else:
            db.commit()
    return

#Aggiunge il workorderid per il quale ho già l'immagine
img_path = Path('../../data/scraped_satellite_images/'+satellite.replace("/","_")+'/cache/')
for i in os.listdir(img_path):
    init_cache(i.split('_')[0])

# Conta il set di workorderid già scaricati
len(get_cached_ids())

33184

In [12]:
# # # Cancella dati dalla tabella
# cache_table.delete()

# Conta il set di workorderid che mancano da provare a scaricare
leak_to_scrape = set(leaks.workorderid).difference(set(get_cached_ids()))

len(leak_to_scrape)
#leak_to_scrape

126

### Test the distance need to get your rectangle

Here we need to tweak `fudge_distance_factor` so that we get the image size of our choice. Start with zero and try -1, -0.5, -.25,0,0.25,0.5,0.75. This is to deal with rounding, projecting between CRS's etc. Don't worry the asserts below will yet you know when it's right.

Occasionaly the problem might be that the leak is at the edge of the image, giving a cropped image. Ignore these rare cases.

In [13]:
distance = resolution_min*(pixel_length/2.00+fudge_distance_factor)

In [14]:
import time
import traceback
cached_ids = get_cached_ids()


def get_image_for_leak(i, cached_ids=cached_ids):
    leak = leaks.loc[[i]]
    repo_date_ts = arrow.get(leak.reportdate.values[0]).timestamp()
    
    # crappy way or recording that we tried this one
    workorderid = leak.workorderid.values[0]
    if workorderid in cached_ids:
        logger.info('Skipping cached download for leak id %s ',workorderid)
        return
    
    boundary = get_boundary(leak, distance=distance)
    
    srtm_img = ee.Image(satellite)

    # download as save images    
    logger.info('results for %s', workorderid)
    
    elevation = srtm_img.clip(boundary)
    slope = ee.Terrain.slope(srtm_img).clip(boundary)
    info = elevation.getInfo()
    info['bands'].extend(slope.getInfo())
    
    name=str(workorderid)
    
    # scarico il DEM con le quote
    path,files=download_image(
        elevation, 
        scale=resolution_min, 
        crs=crs_grid, 
        name=name,
        cache_dir=cache_dir
    )
    
    # scarico le pendenze con le quote
    path,files=download_image(
        slope, 
        scale=resolution_min, 
        crs=crs_grid, 
        name=name,
        cache_dir=cache_dir
    )
    # also save metadata so we can filter by date
    with open(path.joinpath('metadata.json'), 'w') as fo:
        metadata = dict(
            image=info,
            scale=resolution_min,
            crs=crs_grid,
            name=name,
            distance=distance,
            leak=json.loads(leak.to_json())
        )
        json.dump(metadata, fo)

    cached_ids = init_cache(str(workorderid)) # so we know there where results
    return

leak_to_scrape = set(leaks.workorderid).difference(set(cached_ids))
for i in tqdm(leak_to_scrape):
    try:
        get_image_for_leak(i)
    except urllib.error.HTTPError as e:
        print(i,e) # "HTTP Error 429: unknown"
        traceback.print_stack()
        if e.code == 429:
            print('sleep for 13s')
            time.sleep(13);
    except ee.ee_exception.EEException as e:
        print(i,e) # "Earth Engine memory capacity exceeded."
        traceback.print_stack()
        ee.Initialize()
    except zipfile.BadZipFile as e:
        print(i,e) # "File is not a zip file"
        traceback.print_stack()
    except Exception as e:
        print(i,e)
        traceback.print_stack()

  0%|          | 0/126 [00:00<?, ?it/s]

In [15]:
#### Quality check dei dati

X = []
discarded = []

for path in tqdm(cache_dir.listdir()):
    files = [file.relpath(path) for file in path.listdir() if file.endswith('.tif')]
    if files:      
        # load data
        data = tifs2np(path,files,bands=bands)
        try:
            assert data[bands.index('elevation')].max() < 2000 , 'la quota deve essere inferiore a 2000 m'
        except Exception as exc:
            print(path, exc)
            discarded.append(data)
        else:
            X.append(data)

len(X), len(discarded)

  0%|          | 0/33310 [00:00<?, ?it/s]

(33310, 0)

In [None]:
# np.array(discarded,dtype=np.int16)[0][0]

# import cv2
# import imageio
# import tifffile

# def tifs2np_test(path, files, bands):
#     """Convert tifs to numpy array"""
#     tifs = [f for f in files if f.endswith('.tif')]
#     channels = {}
#     for tif in tifs:
#         band = tif.split('.')[-2]
#         # read tif as float32
#         #print(path.joinpath(tif))
# #         x = cv2.imread(path.joinpath(tif), cv2.IMREAD_UNCHANGED) 
# #         x = plt.imread(path.joinpath(tif))#, format='tiff-pil') #, pilmode="F") 
# #         x = imageio.imread(path.joinpath(tif), format='tiff-pil', pilmode="F") 
#         x = tifffile.imread(path.joinpath(tif))#, format='tiff-pil', pilmode="F") 
#         channels[band] = x

#     pixel_length = x.shape[1]
#     pixel_width = x.shape[0]
#     data = []
#     for band in bands:
#         if band not in channels:
#             channels[band] = np.zeros((pixel_width, pixel_length))
#         data.append(channels[band])
#     return np.array(data)


# path_test = Path('../../../ricerca_perdite/data/scraped_satellite_images/USGS_SRTMGL1_003/cache/1000409_3857_10.0')
# files_test = [file.relpath(path_test) for file in path_test.listdir() if file.endswith('.tif')]
# print(files_test)
# data_test = tifs2np_test(path_test, files_test, bands=bands)

# data_test