# Notebook to generate a practice dataset

## Wind Data

Let's use the pre-downloaded wind data from the Arabian Peninsula and create a dataframe of useful values

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import os
import ee
import geemap
import cdsapi
import cv2
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [138]:
file_path = '../raw_data/namib_desert_winds.nc'

ds = xr.open_dataset(file_path)
raw_data = ds.to_dataframe()
clean_df = raw_data.dropna().reset_index()
clean_df

Unnamed: 0,latitude,longitude,time,u10,v10
0,-23.02,14.47,2011-01-01,1.894173,1.674366
1,-23.02,14.47,2011-02-01,2.082840,2.654056
2,-23.02,14.47,2011-03-01,1.951652,2.674042
3,-23.02,14.47,2011-04-01,1.037798,3.245492
4,-23.02,14.47,2011-05-01,0.488088,2.980637
...,...,...,...,...,...
118,-23.02,14.47,2020-11-01,1.770226,3.003730
119,-23.02,14.47,2020-12-01,1.937622,1.576097
120,-23.02,14.47,2021-01-01,1.781153,0.678924
121,-23.02,14.47,2021-02-01,1.917772,1.711177


### Adding wind strength feature

In [139]:
clean_df['wind_strength'] = (clean_df.u10**2 + clean_df.v10**2)**.5
clean_df

Unnamed: 0,latitude,longitude,time,u10,v10,wind_strength
0,-23.02,14.47,2011-01-01,1.894173,1.674366,2.528120
1,-23.02,14.47,2011-02-01,2.082840,2.654056,3.373757
2,-23.02,14.47,2011-03-01,1.951652,2.674042,3.310505
3,-23.02,14.47,2011-04-01,1.037798,3.245492,3.407380
4,-23.02,14.47,2011-05-01,0.488088,2.980637,3.020335
...,...,...,...,...,...,...
118,-23.02,14.47,2020-11-01,1.770226,3.003730,3.486560
119,-23.02,14.47,2020-12-01,1.937622,1.576097,2.497691
120,-23.02,14.47,2021-01-01,1.781153,0.678924,1.906160
121,-23.02,14.47,2021-02-01,1.917772,1.711177,2.570210


### Adding Sin and Cosin of wind direction

In [140]:
clean_df['sin'] = np.sin(np.arctan2(clean_df['u10'],clean_df['v10']))
clean_df['cosin'] = np.cos(np.arctan2(clean_df['u10'],clean_df['v10']))
clean_df

Unnamed: 0,latitude,longitude,time,u10,v10,wind_strength,sin,cosin
0,-23.02,14.47,2011-01-01,1.894173,1.674366,2.528120,0.749242,0.662297
1,-23.02,14.47,2011-02-01,2.082840,2.654056,3.373757,0.617365,0.786677
2,-23.02,14.47,2011-03-01,1.951652,2.674042,3.310505,0.589533,0.807744
3,-23.02,14.47,2011-04-01,1.037798,3.245492,3.407380,0.304574,0.952489
4,-23.02,14.47,2011-05-01,0.488088,2.980637,3.020335,0.161601,0.986856
...,...,...,...,...,...,...,...,...
118,-23.02,14.47,2020-11-01,1.770226,3.003730,3.486560,0.507729,0.861517
119,-23.02,14.47,2020-12-01,1.937622,1.576097,2.497691,0.775765,0.631022
120,-23.02,14.47,2021-01-01,1.781153,0.678924,1.906160,0.934420,0.356174
121,-23.02,14.47,2021-02-01,1.917772,1.711177,2.570210,0.746154,0.665773


### Aggregating final wind dataset by mean()

In [141]:
wind_data = clean_df.groupby(by=['latitude','longitude']).mean().reset_index()
wind_data

Unnamed: 0,latitude,longitude,u10,v10,wind_strength,sin,cosin
0,-23.02,14.47,1.097745,2.567064,2.985563,0.341605,0.853073


In [3]:
def prep_data(path):
    ds = xr.open_dataset(path)
    raw_data = ds.to_dataframe()
    clean_df = raw_data.dropna().reset_index()
    clean_df['wind_strength'] = (clean_df.u10**2 + clean_df.v10**2)**.5
    clean_df['sin'] = np.sin(np.arctan2(clean_df['u10'],clean_df['v10']))
    clean_df['cosin'] = np.cos(np.arctan2(clean_df['u10'],clean_df['v10']))
    wind_data = clean_df.groupby(by=['latitude','longitude']).mean().reset_index()
    return wind_data

## Aquiring satellite images

### Base function 

In [220]:

def get_four_image_dup(data:pd.Series, **kwargs) -> list:
    
    ee.Initialize()
    
    if 'resolution' in kwargs.keys():
        resolution=int(kwargs['resolution'])
    else:
        resolution = 512
    
    satellite_name='COPERNICUS/S2_SR'
    
    area = [(data.longitude-0.025,data.latitude-0.025),
            (data.longitude+0.025,data.latitude+0.025)]
        
    roi = ee.Geometry.Rectangle(coords=area)
    

           
    collection = ee.ImageCollection(satellite_name) \
                .filterBounds(roi) \
                .sort("CLOUD_COVER") \
                .filter('HIGH_PROBA_CLOUDS_PERCENTAGE < 10') \
                .limit(1)

    image = collection.first()
    
    #img = ee.Image(satellite_name).select('B4','B3','B2')
    
    
    vis_params = {
                  'bands': [ 'B4','B3','B2'],
                  'min': 0,
                  'max': 10000,
                  'gamma': 1.4}
    
    coordinates = [(data.longitude-0.025,data.latitude-0.025),
              (data.longitude-0.025,data.latitude+0.025),
              (data.longitude+0.025,data.latitude+0.025),
              (data.longitude+0.025,data.latitude-0.025)]
    
    images = []

    for quadrant,coordinate in enumerate(coordinates):
        
        long = coordinate[0]
        lat = coordinate[1]
        
        bounds = [(long-0.025,lat-0.025),
                (long+0.025,lat+0.025)]
        
        roi = ee.Geometry.Rectangle(coords=bounds)
        
        band_arrs = image.sampleRectangle(region=roi)
        
        band_arr_b4 = band_arrs.get('B4')
        band_arr_b3 = band_arrs.get('B3')
        band_arr_b2 = band_arrs.get('B2')
        np_arr_b4 = np.array(band_arr_b4.getInfo())
        np_arr_b3 = np.array(band_arr_b3.getInfo())
        np_arr_b2 = np.array(band_arr_b2.getInfo())
        
        print(np_arr_b4.shape)
        print(np_arr_b3.shape)
        print(np_arr_b2.shape)
    
        file_name = f"{lat}_{long}_0{quadrant}_CW000_{data.sin}_{data.cosin}_{data.wind_strength}"
        out_img = os.path.expanduser(f"../raw_data/practice_dataset/{file_name}.jpg")
        region = ee.Geometry.Rectangle(bounds)
        geemap.get_image_thumbnail(image, out_img,vis_params,dimensions=(resolution, resolution),region=region, format='jpg')
        
        image_grey = cv2.imread(out_img)
        print(image_grey.shape)
        #image_grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        #cv2.imshow(image_grey)
        #cv2.imwrite(file_name)
        geemap.show_image(out_img)
        images.append(out_img)
        
    return image.getInfo()


def get_image_per_coordinates(data:pd.Series, **kwargs) -> list:
    
        coordinates = [(data.longitude-0.025,data.latitude-0.025),
              (data.longitude-0.025,data.latitude+0.025),
              (data.longitude+0.025,data.latitude+0.025),
              (data.longitude+0.025,data.latitude-0.025)]
        
        for quadrant,coordinate in enumerate(coordinates):
            get_one_image(long=coordinate[0], lat=coordinate[1], data=data, quadrant=quadrant)
        
        return None

In [263]:
def get_one_image(long:float, lat:float,data:pd.Series, quadrant:int, **kwargs) -> list:
    
    ee.Initialize()
    
    if 'resolution' in kwargs.keys():
        resolution=int(kwargs['resolution'])
    else:
        resolution = 512
    
    satellite_name='COPERNICUS/S2_SR'
    
    area = [(long-0.025,lat-0.025),
            (long+0.025,lat+0.025)]
        
    roi = ee.Geometry.Rectangle(coords=area)
           
    collection = ee.ImageCollection(satellite_name) \
                .filterBounds(roi) \
                .sort("CLOUD_COVER") \
                .filter('HIGH_PROBA_CLOUDS_PERCENTAGE < 10') \
                .limit(1)

    image = collection.first()
    
    vis_params = {
                  'bands': [ 'B4','B3','B2'],
                  'min': 0,
                  'max': 10000,
                  'gamma': 1.4}

    file_name = f"{lat}_{long}_0{quadrant}_CW000_{data.sin}_{data.cosin}_{data.wind_strength}"
    out_img = os.path.expanduser(f"../raw_data/practice_dataset/{file_name}.jpg")

    geemap.get_image_thumbnail(image, out_img,vis_params,dimensions=(resolution, resolution),region=roi, format='jpg')
        
    image_grey = cv2.imread(out_img)
    image_grey = cv2.cvtColor(image_grey, cv2.COLOR_BGR2GRAY)
    #plt.imshow(image_grey, cmap='Greys')
    cv2.imshow(file_name,image_grey)
    cv2.imwrite(f"../raw_data/practice_dataset/{file_name}.jpg",image_grey)
    #geemap.show_image(out_img)
    #images.append(out_img)
        
    return None

In [None]:
LC8_BANDS = ['B2',   'B3',    'B4',  'B5',  'B6',    'B7',    'B10'];
STD_NAMES = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'temp'];

def cloudscore(img):
    
    def rescale(img1, exp, thresholds):
        return img1.expression(exp, {img1: img1}).subtract(thresholds[0]).divide(thresholds[1] - thresholds[0])
    
    score = ee.Image(1.0);
    score = score.min(rescale(img, 'img.blue', [0.1, 0.3]));
    score = score.min(rescale(img, 'img.red + img.green + img.blue', [0.2, 0.8]));
    score = score.min(rescale(img, 'img.nir + img.swir1 + img.swir2', [0.3, 0.8]));
    score = score.min(rescale(img, 'img.temp', [300, 290]));
    ndsi = img.normalizedDifference(['green', 'swir1']);
    return score.min(rescale(ndsi, 'img', [0.8, 0.6]));

def badSelect(img):
    # Invert the cloudscore so 1 is least cloudy, and rename the band.
    score = cloudscore(img.select(LC8_BANDS, STD_NAMES))
    score = ee.Image(1).subtract(score).select([0], ['cloudscore'])
    return img.addBands(score)

image = ee.ImageCollection('LANDSAT/LC08/C01/T1')\
        .filter(ee.Filter.calendarRange(2018,2018,'year'))\
        .filter(ee.Filter.calendarRange(3,3,'month'))\
        .map(badSelect)

image = image.qualityMosaic('cloudscore')
image_out = image.clip(fc.filterMetadata("Name","equals",kml))
vizParams = {'bands': ['B4', 'B3', 'B2'], 'max': 30000, 'gamma': 1.6}
Coordinate_List = fc.filterMetadata("Name","equals",kml).geometry().bounds().getInfo()['coordinates']
geo = ee.Geometry.Polygon(Coordinate_List)
task = ee.batch.Export.image.toDrive(  
                    image = image_out.visualize(vizParams),
                    description = FileName,
                    folder = folderName,
                    scale=30,
                    region=Coordinate_List,
                    fileFormat='GeoTIFF'
                    )
task.start()

In [264]:
file_path = '../raw_data/practice_wind_dataset.nc'
test_data = prep_data(file_path).iloc[1,:]
get_image_per_coordinates(test_data)

'Ok'

In [266]:
file_path = '../raw_data/white_sands_winds.nc'
test_data = prep_data(file_path).iloc[1,:]
get_image_per_coordinates(test_data)

'Ok'

In [203]:
file_path = '../raw_data/practice_wind_dataset.nc'
test_data = prep_data(file_path).iloc[0,:]
arabian_values = get_one_image(test_data)

(512, 512, 3)


Output()

(512, 512, 3)


Output()

(512, 512, 3)


Output()

(512, 512, 3)


Output()

In [267]:
file_path = '../raw_data/namib_desert_winds.nc'
test_data = prep_data(file_path).iloc[0,:]
get_image_per_coordinates(test_data)

'Ok'

In [183]:
test_data

latitude          40.910000
longitude        101.580002
u10                0.176066
v10                0.392398
wind_strength      1.111934
sin                0.151244
cosin              0.326632
Name: 0, dtype: float64

In [184]:
file_path = '../raw_data/gobi_winds.nc'
test_data = prep_data(file_path).iloc[0,:]
gobi_desert_values = get_one_image(test_data)

Output()

Output()

Output()

Output()

In [77]:
resolutions = [128, 256,512, 1024,2048]

image_collection = []

for resolution in resolutions:
    print(f"RESOLUTION: {resolution} x {resolution} pixels")
    image_collection.append(get_one_image(test_data,resolution=resolution))

RESOLUTION: 128 x 128 pixels


Output()

Output()

Output()

Output()

RESOLUTION: 256 x 256 pixels


Output()

Output()

Output()

Output()

RESOLUTION: 512 x 512 pixels


Output()

Output()

Output()

Output()

RESOLUTION: 1024 x 1024 pixels


Output()

Output()

Output()

Output()

RESOLUTION: 2048 x 2048 pixels


Output()

Output()

Output()

Output()

In [25]:
def train_test_geographic_split(data:pd.DataFrame) -> pd.DataFrame:
    
    wind_data = data.copy()
    
    nb_long = wind_data.longitude.unique().shape[0]
    nb_lat = wind_data.latitude.unique().shape[0]

    nb_test_long = int(nb_long*.4)
    nb_test_lat = int(nb_lat*.4)

    test_lat = np.sort(wind_data.latitude.unique())[:nb_test_lat]
    test_long = np.sort(wind_data.longitude.unique())[:nb_test_long]

    wind_data['folder']='training'

    for vlat in test_lat:
        for vlong in test_long:
            wind_data.loc[(wind_data.latitude == vlat) & (wind_data.longitude==vlong),'folder']='testing'
            
    val = wind_data[wind_data.folder=='testing'].sample(frac=0.3)
    wind_data.loc[~val,'folder'] = 'validating'
    
    return wind_data

In [26]:
df = prep_data('../raw_data/practice_wind_dataset.nc')
train_test_geographic_split(df)

TypeError: ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [22]:
df2 = train_test_geographic_split(df)
df2

Unnamed: 0,latitude,longitude,u10,v10,wind_strength,sin,cosin,folder
0,20.91,52.570000,-0.654569,-0.941064,1.674286,-0.401793,-0.504130,testing
1,20.91,52.669998,-0.617733,-0.908168,1.646193,-0.386248,-0.493999,testing
2,20.91,52.770000,-0.581895,-0.866254,1.615274,-0.377852,-0.467297,testing
3,20.91,52.869999,-0.550300,-0.815944,1.582373,-0.372668,-0.449101,testing
4,20.91,52.970001,-0.519233,-0.762639,1.549767,-0.367174,-0.428177,testing
...,...,...,...,...,...,...,...,...
427,22.41,54.770000,0.287349,-0.968053,1.496343,0.131434,-0.510543,training
428,22.41,54.869999,0.343487,-0.912141,1.490919,0.173079,-0.483219,training
429,22.41,54.970001,0.388347,-0.855774,1.486733,0.210044,-0.455772,training
430,22.41,55.070000,0.430046,-0.800121,1.486798,0.245079,-0.427435,training


In [23]:
df2.folder.unique()

array(['testing', 'training'], dtype=object)

In [51]:
nb_long = wind_data.longitude.unique().shape[0]
nb_lat = wind_data.latitude.unique().shape[0]

nb_test_long = int(nb_long*.4)
nb_test_lat = int(nb_lat*.4)

test_lat = np.sort(wind_data.latitude.unique())[:nb_test_lat]
test_long = np.sort(wind_data.longitude.unique())[:nb_test_long]

test_validate_df = pd.DataFrame()

for vlat in test_lat:
    for vlong in test_long:
        test_validate_df = pd.concat([test_validate_df,wind_data[(wind_data.latitude == vlat) & (wind_data.longitude==vlong)]])


In [69]:
test, validate = train_test_geographic_split(wind_data)

In [71]:
validate

Unnamed: 0,latitude,longitude,u10,v10,wind_strength,sin,cosin
35,21.01,53.369999,-0.418923,-0.629496,1.478506,-0.330199,-0.357673
36,21.01,53.470001,-0.388602,-0.587667,1.467832,-0.314558,-0.339529
108,21.309999,52.57,-0.611208,-1.135007,1.732329,-0.37271,-0.586616
116,21.309999,53.369999,-0.432021,-0.828298,1.52271,-0.326505,-0.454204
30,21.01,52.869999,-0.54543,-0.873532,1.596858,-0.363426,-0.477551
109,21.309999,52.669998,-0.578432,-1.108965,1.708075,-0.36092,-0.58171
84,21.209999,52.869999,-0.541218,-0.990874,1.637855,-0.354801,-0.545062
137,21.41,52.77,-0.546089,-1.1264,1.697951,-0.346065,-0.594248
34,21.01,53.27,-0.446262,-0.674574,1.493649,-0.343393,-0.378835
139,21.41,52.970001,-0.510685,-1.047559,1.638342,-0.340067,-0.574397


In [110]:
white_sand_values

{'type': 'Image',
 'bands': [{'id': 'B1',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 65535},
   'dimensions': [1830, 1830],
   'crs': 'EPSG:32613',
   'crs_transform': [60, 0, 300000, 0, -60, 3700020]},
  {'id': 'B2',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 65535},
   'dimensions': [10980, 10980],
   'crs': 'EPSG:32613',
   'crs_transform': [10, 0, 300000, 0, -10, 3700020]},
  {'id': 'B3',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 65535},
   'dimensions': [10980, 10980],
   'crs': 'EPSG:32613',
   'crs_transform': [10, 0, 300000, 0, -10, 3700020]},
  {'id': 'B4',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min': 0,
    'max': 65535},
   'dimensions': [10980, 10980],
   'crs': 'EPSG:32613',
   'crs_transform': [10, 0, 300000, 0, -10, 3700020]},
  {'id': 'B5',
   'data_type': {'type': 'PixelType',
    'precision': 'int',
    'min':

In [157]:
feature = 'CLOUDY_PIXEL_PERCENTAGE'

print(white_sand_values['properties'][feature])
print(arabian_sand_values['properties'][feature])
print(namib_desert_values['properties'][feature])

65.684506
0.017896
9.740144


In [158]:
feature = 'HIGH_PROBA_CLOUDS_PERCENTAGE'

print(white_sand_values['properties'][feature])
print(arabian_sand_values['properties'][feature])
print(namib_desert_values['properties'][feature])

16.185379
0
7.998335


In [None]:
'DARK_FEATURES_PERCENTAGE'
'SOLAR_IRRADIANCE_B9': 817.58,
'SOLAR_IRRADIANCE_B3': 1824.93,
    'SOLAR_IRRADIANCE_B12': 87.75,
  'SOLAR_IRRADIANCE_B10': 365.41,
  'SENSOR_QUALITY': 'PASSED',
  'SOLAR_IRRADIANCE_B11': 247.08,
  'GENERATION_TIME': 1544905108000,
  'SOLAR_IRRADIANCE_B8A': 953.93,
    'THIN_CIRRUS_PERCENTAGE': 25.026366,
        'SNOW_ICE_PERCENTAGE': 0.006193,
            'HIGH_PROBA_CLOUDS_PERCENTAGE': 16.185379,