# Create train and test set

- using the labels from [this](https://github.com/alexvmt/farm_plot_detection/blob/main/report.pdf) report
- the labels are there but the actual band values need to be collected using this notebook

In [1]:
from google.colab import drive
import ee
import numpy as np
import pandas as pd
import geopandas as gpd
import geemap

## Setup

In [None]:
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/farm_plot_detection/automl

/content/drive/MyDrive/farm_plot_detection/automl


In [2]:
!earthengine authenticate

W1215 10:47:57.749752 135065637183488 _default.py:640] No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable
W1215 10:47:58.077781 135065637183488 _default.py:640] No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable
I1215 10:47:58.154059 135065637183488 auth.py:126] Failure refreshing credentials: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google.auth.transport.requests._Response object at 0x7ad703362aa0>)
W1215 10:47:58.160408 135065637183488 _default.py:640] No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable
I1215 10:47:58.231316 135065637183488 auth.py:126] Failure refreshing cr

In [3]:
ee.Initialize()

RefreshError: ignored

## Load labels

In [None]:
gdf = gpd.read_file('kaza_bengo_crop_2020_random_2000.geojson')

In [None]:
gdf

Unnamed: 0,crop,subset,geometry
0,0,train,POINT (26.27313 -18.67769)
1,1,train,POINT (26.22911 -16.78453)
2,0,train,POINT (23.91473 -17.64923)
3,1,train,POINT (26.88519 -18.55946)
4,0,train,POINT (25.05245 -15.19111)
...,...,...,...
1995,0,test,POINT (26.22830 -18.89666)
1996,0,test,POINT (25.44007 -16.37118)
1997,1,test,POINT (27.43635 -19.18086)
1998,0,test,POINT (25.81123 -17.89990)


## Load and prepare Sentinel-2 imagery

In [None]:
start_date = ee.Date('2020-01-01')
end_date = start_date.advance(365, 'day')

In [None]:
bands = ['B2', 'B3', 'B4', 'B8']

In [None]:
s2_composite = ee.ImageCollection('COPERNICUS/S2').filterDate(start_date, end_date).select(bands).median()

## Collect band values and export data

In [None]:
def get_bands_from_points(gdf, image, chunks):

  df_with_bands = pd.DataFrame(columns=['B2', 'B3', 'B4', 'B8', 'crop', 'subset'])

  list_gdf = np.array_split(gdf, chunks)

  for i, gdf in enumerate(list_gdf):
    print('Processing chunk {}/{}...'.format(i+1, chunks))
    fc = geemap.geopandas_to_ee(gdf)
    fc_with_bands = image.sampleRegions(collection=fc, scale=10)
    df_with_bands_temp = geemap.ee_to_pandas(fc_with_bands)
    df_with_bands = pd.concat([df_with_bands, df_with_bands_temp])

  df_with_bands = df_with_bands.reset_index(drop=True)

  return(df_with_bands)

In [None]:
%time

df_with_bands = get_bands_from_points(gdf, s2_composite, 20)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs
Processing chunk 1/20...
Processing chunk 2/20...
Processing chunk 3/20...
Processing chunk 4/20...
Processing chunk 5/20...
Processing chunk 6/20...
Processing chunk 7/20...
Processing chunk 8/20...
Processing chunk 9/20...
Processing chunk 10/20...
Processing chunk 11/20...
Processing chunk 12/20...
Processing chunk 13/20...
Processing chunk 14/20...
Processing chunk 15/20...
Processing chunk 16/20...
Processing chunk 17/20...
Processing chunk 18/20...
Processing chunk 19/20...
Processing chunk 20/20...


In [None]:
df_with_bands

Unnamed: 0,B2,B3,B4,B8,crop,subset
0,1501.285714,1683.900000,2270.000000,3176.571429,0,train
1,1366.666667,1423.428571,1827.500000,2570.000000,1,train
2,1304.000000,1187.000000,1397.833333,2475.500000,0,train
3,1252.777778,1309.363636,1698.500000,2835.222222,1,train
4,1061.833333,893.125000,831.833333,1931.000000,0,train
...,...,...,...,...,...,...
95,1118.250000,1114.800000,1515.750000,2473.200000,0,test
96,1434.400000,1317.000000,1523.000000,2386.714286,0,test
97,1437.333333,1490.000000,1932.166667,2955.250000,1,test
98,1165.428571,1067.333333,1236.666667,2133.666667,0,test


In [None]:
df_with_bands.to_csv('train_test_set.csv', index=False)