## Downloading Sentinel-2 imagery

This script can be used to download all sentinel-2 images inside a geographical area defined by a `.shp` file and certain time-range. For large areas, this process will take a long time since the bandwitdh for downloading from ESA is not too large, you can use this script to query only the tiles to be downloaded and save it as a `.pkl` file. Then you can use `download1C_df.sh` to load the `.pkl` file and download then.


In [4]:
from osgeo import gdal, osr, ogr, gdalconst
import os
import numpy as np
from shapely.geometry import mapping, shape
from shapely.wkt import loads
from shapely.geometry import Polygon
import json
import xml.etree.ElementTree as ET 
import glob
%matplotlib inline
import pandas as pd
from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt
from datetime import date


## Set parameters

Download the .shp files from [here](https://www.diva-gis.org/gdata) for each country of interest

## Load the country shape file

In [3]:
root_dir = '/scratch/andresro/leon_igp/barry_palm/data'
#root_dir = '/home/pf/pfstaff/projects/andresro/barry_palm/data'

country='Malaysia'

if country == 'Phillipines':
    loc='phillipines_2017'
    adm1_path = '/home/pf/pfstaff/projects/andresro/data/countries/phillipines/PHL_adm1.shp'
    NAME = 'all'
#     NAME = ['Palawan','Cebu','Davao del Norte','Davao del Sur','Davao Oriental','Batangas','Quezon','Rizal','Laguna']
elif country == 'Malaysia':
    loc='palmcountries_2017'
    NAME = 'all'
    adm1_path = '/home/pf/pfstaff/projects/andresro/data/countries/malaysia/MYS_adm1.shp'
#     NAME=['Sarawak']
elif country == 'Indonesia':
    loc='palmcountries_2017'
    adm1_path = '/home/pf/pfstaff/projects/andresro/data/countries/indonesia/IDN_adm1.shp'
    NAME = 'all'
#     NAME = ['Kalimantan Barat','Riau','Sulawesi Barat','Sulawesi Tengah','Sulawesi Utara','Gorontalo']


product_dir = os.path.join(root_dir,'1C',loc,'PRODUCT')
save_dir = os.path.join(root_dir,'1C','dataframes_download')
save_dir1 = os.path.join(save_dir,loc)
if not os.path.exists(save_dir1):
    os.makedirs(save_dir1)
    
if not os.path.exists(product_dir):
    os.makedirs(product_dir)
    
print('product_dir: ', product_dir)


product_dir:  /scratch/andresro/leon_igp/barry_palm/data/1C/palmcountries_2017/PRODUCT


In [5]:

fieldname = 'NAME_1'

shp = ogr.Open(adm1_path)
lyr = shp.GetLayer(0)
lyrdf =lyr.GetLayerDefn()
id_ = lyrdf.GetFieldIndex(fieldname)
    
print('Total features', lyr.GetFeatureCount())
features_extent = {}
features_polygones = {}
for i in range(lyr.GetFeatureCount()):
    feat = lyr.GetFeature(i)
    value =feat.GetField(id_)
#     if value == name_:
    geom=feat.GetGeometryRef()
    extent = geom.GetEnvelope()
    lon1,lat1 = extent[0],extent[2]
    lon2,lat2 = extent[1],extent[3]
    wkt_ext = f'POLYGON(({lon1} {lat1}, {lon1} {lat2}, {lon2} {lat2},  {lon2} {lat1},  {lon1} {lat1} ))'
    features_extent[value] = wkt_ext
    features_polygones[value]=loads(geom.ExportToWkt())

    

Total features 13


## Get Sentinel-2 tile names in Polygon

You can download the sentinel-2 tiles from [here](https://sentinel.esa.int/web/sentinel/missions/sentinel-2/data-products) as .xml and convert them as .shp with QGIS to create ´Features.shp´

In [6]:
sentinel2_tiles_path = '/home/pf/pfstaff/projects/nlang_HCS/data/Sentinel2_mission/sentinel2_tiles/Features.shp'
driver = ogr.GetDriverByName('ESRI Shapefile')
sentinel2_tiles = driver.Open(sentinel2_tiles_path, 0) # 0 means read-only. 1 means writeable.

print('Opened {}'.format(sentinel2_tiles_path))
layer = sentinel2_tiles.GetLayer()
featureCount = layer.GetFeatureCount()
print('Number of layers: ', sentinel2_tiles.GetLayerCount())
# print(layer.GetLayerDefn())
print("Number of features: ", featureCount)


def getGeom(Shapefile, shapely = True):
    feature_dict={}
    n_layers = Shapefile.GetLayerCount()
    wkt_list  = []
    for _ in range(n_layers):
        Shapefile_layer = Shapefile.GetLayer()

        n_points = Shapefile_layer.GetFeatureCount()

        for _ in range(n_points):
            feat = Shapefile_layer.GetNextFeature()
            if feat:
                name = feat.GetFieldAsString("Name")
                geom = feat.geometry().ExportToWkt()
                if shapely:
                    geom = loads(geom)
                wkt_list.append(geom)
                # save in dictionary
                feature_dict[name]=geom

    print('{} geometries loaded'.format(len(wkt_list)))

    return wkt_list, feature_dict
    
tiles_geometry, feature_dict = getGeom(Shapefile=sentinel2_tiles)


Opened /home/pf/pfstaff/projects/nlang_HCS/data/Sentinel2_mission/sentinel2_tiles/Features.shp
Number of layers:  1
Number of features:  56984
56984 geometries loaded


In [7]:

roi_tiles_per_feature = {}

for name, poly in features_polygones.items():

    for tile, tile_poly in feature_dict.items():
        if poly.intersects(tile_poly):
            if name in roi_tiles_per_feature.keys():                
                roi_tiles_per_feature[name].append(tile)
            else:
                roi_tiles_per_feature[name] = [tile]
#     print(name,len(roi_tiles_per_feature[name]))


In [8]:
df = pd.DataFrame.from_dict(roi_tiles_per_feature, orient='index')

df['count'] = df.shape[1]-df.isnull().sum(axis=1)
df = df.sort_values('count',ascending=False)
df['count'].sum()
# for key, value in roi_tiles_per_feature.items():
#     print(key,len(value))

122

In [9]:
a = set(np.array(df.drop('count', axis=1)).flatten())
len(a)
df['tiles_country'] = len(a)

In [10]:
filename = save_dir1+'/'+f'Tiles_per_region_{country}.csv'
df.to_csv(filename)
print(filename, 'saved!')

/scratch/andresro/leon_igp/barry_palm/data/1C/dataframes_download/palmcountries_2017/Tiles_per_region_Malaysia.csv saved!


In [11]:
df.index

Index(['Sarawak', 'Sabah', 'Pahang', 'Johor', 'Kedah', 'Perak', 'Kelantan',
       'Selangor', 'Trengganu', 'Negeri Sembilan', 'Melaka', 'Perlis',
       'Pulau Pinang'],
      dtype='object')

## Query tiles from SCIHUB server

If you do not have a scihub account create one [here](https://scihub.copernicus.eu/dhus/#/self-registration).

Before downloading we will just query all the sentinel tiles to create a database where we can track which tiles are available to download.

Now you can add your details to download the corresponding images. You can choose which images to download depending on the state name or 'all' for downloading all states inside the .shp file

In [3]:
# connect to the API

username =''
password=''
api = SentinelAPI(username, password, 'https://scihub.copernicus.eu/dhus')


In [13]:
is_all = False
is_load =True

if NAME == 'all':
    is_all = True
    

if is_load:
    name_ = '_'.join(NAME).replace(' ','_') if not is_all else 'all'
    file_ = glob.glob(f'{save_dir1}/{country}_{name_}_*.pkl')[0]
    
    df_download = pd.read_pickle(file_)
else:
    
    products_df = []
    # search by polygon, time, and SciHub query keywords
    NAME1 = set(df.index) if is_all else NAME
    for name_ in NAME1:
        print(name_)
        products = api.query(area=features_extent[name_],
                             # CHANGE desired time-frame here
                             date=('20170101', date(2017, 12,31)),
                             platformname='Sentinel-2')

        # convert to Pandas DataFrame
        products_df.append(api.to_dataframe(products))



    df_out = None
    for d_ in products_df:
        if df_out is None:
            df_out = d_
        else:
            df_out = df_out.append(d_)

    # df_out.shape

    df1 = df_out.copy()
    df1.drop_duplicates(subset=['title'], inplace =True)
    df1[df1.tileid.isna()].shape, df1[df1.relativeorbitnumber.isna()].shape

    df1['sizeMB'] = df1['size'].map(lambda x: float(x.replace(' MB','')))
    df1['tileid'] = df1['title'].map(lambda x: x.split('_')[5][1:])


#     Remove Tiles not in polygon

    index = None
    for name_ in NAME1:
        index_ = [x in roi_tiles_per_feature[name_] for x in df1.tileid]
        if index is None:
            index = index_
        else:
            index = np.logical_or(index_,index)


    # index
    df1 = df1[index]

    df_download = df1.sort_values(['cloudcoverpercentage', 'ingestiondate'], ascending=[True, True]).groupby(['tileid','relativeorbitnumber']).head(10)
    df_download = df_download.sort_values(['tileid','relativeorbitnumber','cloudcoverpercentage'])


    print(len(np.unique(df_download.tileid)),len(np.unique(df_download.title)))

    for counter, (id_, d) in enumerate(df_download.groupby(['tileid','relativeorbitnumber'])):
        print(counter, id_,f' N {d.shape[0]} mean cc {d.cloudcoverpercentage.mean():.2f}')

    df_download.sizeMB.sum()/(10*60*60)
    name_ = '_'.join(NAME).replace(' ','_') if not is_all else 'all'
#     name_ = '_'.join(NAME).replace(' ','_')
    file_=save_dir1+'/'+country+name_+f'{save_dir1}/{country}_{name_}_{df_download.shape[0]}.pkl'
    
    df_download.to_pickle(file_)

In [14]:
file1_ = file_.replace('.pkl','.txt')

with open(file1_, 'w') as f:
    for item in df_download.title:
        f.write("%s\n" % item)
print(file1_,'saved')        

/scratch/andresro/leon_igp/barry_palm/data/1C/dataframes_download/palmcountries_2017/Malaysia_all_1150.txt saved


In [15]:
df_download.title[0]

'S2B_MSIL1C_20171118T034019_N0206_R061_T47NNG_20171118T090542'

In [16]:
print('total size',np.sum(df_download.sizeMB))

total size 604831.73


This script could be used to download directly all the scrips but if we have too many tiles, this will usually take several days to complete.

In [17]:
# # download sorted and reduced products
# api.download_all(df.index,directory_path=product_dir)

'/scratch/andresro/leon_igp/barry_palm/data/1C/dataframes_download/palmcountries_2017'

## Check 1C downloads

In [None]:
tiles = df_download.title.map(lambda x: '_'.join(x.split('_')[4:6]))

In [None]:

base_path =root_dir+'/1C/{}/PRODUCT/'.format(loc) 
# base_path ='/home/pf/pfstaff/projects/andresro/barry_palm/data/1C/palm_2017/PRODUCT/'

filelist = glob.glob(base_path+'*.zip')

existing_ds = [os.path.split(x)[-1].replace('.zip','') for x in filelist]
# pending_ds = [x for x in df_download.title if x not in existing_ds]
pending_ds = [x not in existing_ds for x in df_download.title]

print('total',df_download.shape[0])
print(f'existing {len(existing_ds)} in {base_path}')
print('pending',np.sum(pending_ds))



In [None]:
df_existing = df_download[~np.array(pending_ds)]

file_existing_correct = base_path+'/correct_zip.txt'
lines = [line.rstrip('\n') for line in open(file_existing_correct)]

is_checked = [x not in lines for x in df_existing.title]
df_to_check = df_existing[is_checked]

print('1C ds pending checksum:',df_to_check.shape[0])

In [21]:
for id_, row in df_download.head(5).iterrows():
    print(row.link)
#     print(f'wget --content-disposition --continue --user={username} --password={password} "https://scihub.copernicus.eu/dhus/odata/v1/Products(\'{id_}\')/\$value" -P {save_dir}')
#     print(f'wget --content-disposition --continue --user={username} --password={password} "{row.link}"')

https://scihub.copernicus.eu/dhus/odata/v1/Products('3df8a4c9-11e7-4c1f-bc93-e7a5123581af')/$value
https://scihub.copernicus.eu/dhus/odata/v1/Products('150e0721-caa3-4e5a-a688-72e456f65d9f')/$value
https://scihub.copernicus.eu/dhus/odata/v1/Products('524e0315-a635-41ef-b506-b72cfacd325f')/$value
https://scihub.copernicus.eu/dhus/odata/v1/Products('7e2432d5-5454-4c1e-ab53-f974fa797906')/$value
https://scihub.copernicus.eu/dhus/odata/v1/Products('b4542a03-50de-40f2-a76f-e227308b119d')/$value


In [22]:
# df_download.inde
base_path

'/scratch/andresro/leon_igp/barry_palm/data/1C/palmcountries_2017/PRODUCT/'

## Check 2A files

In [23]:
path=root_dir+'/1C/'+loc+'/PRODUCT/correct_zip.txt'
lines1C = [line.rstrip('\n') for line in open(path)]
lines1C = [x for x in lines1C if '2017' in x]
print('1C: ',len(lines1C))


path=root_dir+'/2A/'+loc+'/correct_2A.txt'
lines2A = [line.rstrip('\n') for line in open(path)]
lines2A = [x for x in lines2A if '2017' in x]
print('2A: ',len(lines2A))

1C:  9351
2A:  8483


In [24]:
ds1C = pd.DataFrame({'title1C': lines1C})

ds1C['tile'] = ds1C.title1C.map(lambda x: x.split('_')[5])
ds1C['orbit'] = ds1C.title1C.map(lambda x: x.split('_')[4])

ds1C.head()

counts1C = ds1C.groupby(['tile','orbit']).count().rename({'title1C':'count1C'},axis=1)

ds1C = ds1C.set_index(['tile','orbit']).join(counts1C)
ds1C['title2A'] = ds1C.title1C.map(lambda x: x.replace('MSIL1C','MSIL2A'))
ds1C.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title1C,count1C,title2A
tile,orbit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T46NGH,R104,S2A_MSIL1C_20171017T034731_N0205_R104_T46NGH_2...,10,S2A_MSIL2A_20171017T034731_N0205_R104_T46NGH_2...
T46NGH,R104,S2B_MSIL1C_20170922T034519_N0205_R104_T46NGH_2...,10,S2B_MSIL2A_20170922T034519_N0205_R104_T46NGH_2...
T46NGH,R104,S2B_MSIL1C_20171221T035139_N0206_R104_T46NGH_2...,10,S2B_MSIL2A_20171221T035139_N0206_R104_T46NGH_2...
T46NGH,R104,S2A_MSIL1C_20170420T034541_N0204_R104_T46NGH_2...,10,S2A_MSIL2A_20170420T034541_N0204_R104_T46NGH_2...
T46NGH,R104,S2A_MSIL1C_20171216T035141_N0206_R104_T46NGH_2...,10,S2A_MSIL2A_20171216T035141_N0206_R104_T46NGH_2...


In [25]:
ds2A = pd.DataFrame({'title2A': lines2A})

ds2A['tile'] = ds2A.title2A.map(lambda x: x.split('_')[5])
ds2A['orbit'] = ds2A.title2A.map(lambda x: x.split('_')[4])

ds2A.head()

counts2A = ds2A.groupby(['tile','orbit']).count().rename({'title2A':'count2A'},axis=1)
ds2A = ds2A.set_index(['tile','orbit']).join(counts2A)
ds2A['correct2A'] = True

ds2A.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title2A,count2A,correct2A
tile,orbit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T46NGH,R104,S2A_MSIL2A_20170420T034541_N0204_R104_T46NGH_2...,8,True
T46NGH,R104,S2A_MSIL2A_20170619T034531_N0205_R104_T46NGH_2...,8,True
T46NGH,R104,S2A_MSIL2A_20170709T034531_N0205_R104_T46NGH_2...,8,True
T46NGH,R104,S2A_MSIL2A_20171017T034731_N0205_R104_T46NGH_2...,8,True
T46NGH,R104,S2B_MSIL2A_20170724T034539_N0205_R104_T46NGH_2...,8,True


In [26]:
dsAll = ds1C.set_index('title2A').join(ds2A.set_index('title2A'))

#dsAll[dsAll.correct2A != True].title1C
#dsAll[dsAll.title1C]
dsAll.reset_index().columns 

Index(['title2A', 'title1C', 'count1C', 'count2A', 'correct2A'], dtype='object')

In [31]:
dsAll[dsAll.correct2A != True].head()

Unnamed: 0_level_0,title1C,count1C,count2A,correct2A
title2A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S2A_MSIL2A_20170106T023101_N0204_R046_T50MMA_20170106T024308,S2A_MSIL1C_20170106T023101_N0204_R046_T50MMA_2...,10,,
S2A_MSIL2A_20170106T023101_N0204_R046_T50NNF_20170106T024308,S2A_MSIL1C_20170106T023101_N0204_R046_T50NNF_2...,10,,
S2A_MSIL2A_20170109T011711_N0204_R088_T53LRL_20170109T011728,S2A_MSIL1C_20170109T011711_N0204_R088_T53LRL_2...,10,,
S2A_MSIL2A_20170109T011711_N0204_R088_T53MNR_20170109T011728,S2A_MSIL1C_20170109T011711_N0204_R088_T53MNR_2...,10,,
S2A_MSIL2A_20170109T011711_N0204_R088_T53MPU_20170109T011710,S2A_MSIL1C_20170109T011711_N0204_R088_T53MPU_2...,10,,


In [28]:
# ds2A.groupby(['tile']).count()

counts2A.sort_index().sort_values(by='count2A',ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,count2A
tile,orbit,Unnamed: 2_level_1
T54MWU,R002,6
T53MKQ,R031,6
T54MTV,R088,6
T53MKP,R031,6
T54MTA,R088,6
T53MKU,R031,7
T53MMT,R131,7
T54MTV,R045,7
T52MCC,R117,7
T52MHE,R074,7


In [81]:
path='/scratch/andresro/leon_igp/barry_palm/data/2A/phillipines_2017/correct_2A.txt'
lines = [line.rstrip('\n') for line in open(path)]

In [42]:

len(lines),len(set(lines))

(571, 537)