# Train and test set

### Feature groups
- raw Sentinel-2 bands
- indices based on raw Sentinel-2 bands (e.g. NDVI, NDWI, SAVI, EVI)

### More advanced ideas
- seasonal, quarterly and monthly time series
- min, max, median indices

## Setup

In [1]:
from google.colab import drive
import pandas as pd

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/land_cover_classification_kaza

/content/drive/MyDrive/land_cover_classification_kaza


## Load and prepare raw data

We're using the labels provided by Nuno and the according Sentinel-2 bands.

In [4]:
raw_data = pd.read_csv('data/raw_data.csv')
raw_data.shape

(267442, 45)

In [5]:
raw_data

Unnamed: 0,system:index,B11_Q1,B11_Q2,B11_Q3,B11_Q4,B12_Q1,B12_Q2,B12_Q3,B12_Q4,B2_Q1,...,B8A_Q3,B8A_Q4,B8_Q1,B8_Q2,B8_Q3,B8_Q4,LC_Nr,LC_Out,Landcover,.geo
0,00000000000000000010_0,2406.0,2806.0,3126.0,3612.0,1597.0,1888.0,2266.5,3276.0,571.0,...,2509.5,2606.0,2770.0,2535.5,2494.0,2540.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,00000000000000000010_1,2521.0,2917.0,3280.5,3835.0,1558.0,1909.0,2320.5,3677.0,556.0,...,2570.0,2552.0,2684.0,2535.0,2601.0,2408.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,00000000000000000010_2,2521.0,2917.0,3280.5,3835.0,1558.0,1909.0,2320.5,3677.0,520.0,...,2570.0,2552.0,2732.0,2502.0,2616.5,2374.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
3,00000000000000000010_3,2381.0,2893.5,3291.0,3848.0,1407.0,1877.0,2218.0,3823.0,468.0,...,2506.5,2421.0,2680.0,2430.0,2503.0,2330.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
4,00000000000000000010_4,2381.0,2893.5,3291.0,3848.0,1407.0,1877.0,2218.0,3823.0,424.0,...,2506.5,2421.0,2602.0,2397.0,2457.0,2282.0,4,Active cro,Cropland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267437,00000000000000000561_158,1198.0,2491.5,1830.5,3264.0,664.0,1679.5,1945.0,3113.0,275.0,...,960.0,1655.0,1454.0,1940.5,848.5,1450.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267438,00000000000000000561_159,1216.0,2411.0,1741.0,3216.0,688.0,1587.0,1798.0,3039.0,273.0,...,912.5,1625.0,1448.0,1904.0,908.0,1456.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267439,00000000000000000561_160,1216.0,2411.0,1741.0,3216.0,688.0,1587.0,1798.0,3039.0,294.0,...,912.5,1625.0,1460.0,1926.5,940.0,1444.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
267440,00000000000000000561_161,1198.0,2491.5,1830.5,3264.0,664.0,1679.5,1945.0,3113.0,292.0,...,960.0,1655.0,1534.0,1931.5,764.5,1460.0,8,Wetland,Wetland,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [6]:
raw_data = raw_data.drop(['system:index', '.geo'], axis=1)

In [7]:
# remove land cover class deforestation as it is not needed in this use case
raw_data = raw_data[raw_data['Landcover'] != 'Deforestation']
raw_data.shape

(261066, 43)

In [8]:
raw_data['Landcover'] = raw_data['Landcover'].str.capitalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data['Landcover'] = raw_data['Landcover'].str.capitalize()


## Downsample and balance raw data

In [9]:
raw_data['Landcover'].value_counts()

Landcover
Forest      117277
Cropland     58759
Wetland      45501
Shrub        27552
Grass        10826
Bare           475
Water          376
Built up       300
Name: count, dtype: int64

In [10]:
raw_data['LC_Nr'].value_counts()

LC_Nr
7    117277
4     58759
8     45501
6     27552
5     10826
2       475
1       376
3       300
Name: count, dtype: int64

In [11]:
def sample_train_and_test_data(df, land_cover_class, train_fraction, desired_train_samples=None):

  random_state = 42

  # filter and shuffle df
  df = df[df['Landcover'] == land_cover_class]
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # calculate n train samples
  n_train_samples = int(round(df.shape[0] * train_fraction, 0))

  # sample train and test samples
  train_samples = df.iloc[:n_train_samples]
  test_samples = df.iloc[n_train_samples:]

  # downsample train samples if specified
  if desired_train_samples is not None:
    train_samples = train_samples.sample(n=desired_train_samples, random_state=random_state).reset_index(drop=True)

  print(f'land cover class: {land_cover_class}; train samples: {train_samples.shape[0]}; test samples: {test_samples.shape[0]}')

  return train_samples, test_samples

In [12]:
train_fraction = 0.7
train_forest, test_forest = sample_train_and_test_data(raw_data, 'Forest', train_fraction, desired_train_samples=500)
train_cropland, test_cropland = sample_train_and_test_data(raw_data, 'Cropland', train_fraction, desired_train_samples=500)
train_wetland, test_wetland = sample_train_and_test_data(raw_data, 'Wetland', train_fraction, desired_train_samples=500)
train_shrub, test_shrub = sample_train_and_test_data(raw_data, 'Shrub', train_fraction, desired_train_samples=500)
train_grass, test_grass = sample_train_and_test_data(raw_data, 'Grass', train_fraction, desired_train_samples=500)
train_bare, test_bare = sample_train_and_test_data(raw_data, 'Bare', train_fraction)
train_water, test_water = sample_train_and_test_data(raw_data, 'Water', train_fraction)
train_built_up, test_built_up = sample_train_and_test_data(raw_data, 'Built up', train_fraction)

land cover class: Forest; train samples: 500; test samples: 35183
land cover class: Cropland; train samples: 500; test samples: 17628
land cover class: Wetland; train samples: 500; test samples: 13650
land cover class: Shrub; train samples: 500; test samples: 8266
land cover class: Grass; train samples: 500; test samples: 3248
land cover class: Bare; train samples: 332; test samples: 143
land cover class: Water; train samples: 263; test samples: 113
land cover class: Built up; train samples: 210; test samples: 90


In [13]:
train = pd.concat([train_forest, train_cropland, train_wetland, train_shrub, train_grass, train_bare, train_water, train_built_up], ignore_index=True)
train

Unnamed: 0,B11_Q1,B11_Q2,B11_Q3,B11_Q4,B12_Q1,B12_Q2,B12_Q3,B12_Q4,B2_Q1,B2_Q2,...,B8A_Q2,B8A_Q3,B8A_Q4,B8_Q1,B8_Q2,B8_Q3,B8_Q4,LC_Nr,LC_Out,Landcover
0,1676.5,1855.5,2356.0,2994.0,843.5,1048.5,1518.0,2345.0,328.0,432.0,...,2232.0,2113.0,2507.0,2528.0,2307.0,2040.0,2296.0,7,Forest,Forest
1,1682.0,1729.5,2152.0,2983.0,836.0,994.5,1518.0,1958.0,263.0,320.0,...,2039.0,1869.0,2819.0,2348.0,1766.0,1676.0,2536.0,7,Forest,Forest
2,1927.5,2226.0,2146.0,3504.0,968.0,1262.5,1428.0,2341.0,293.0,420.0,...,2526.0,2038.0,3357.0,2818.0,2626.0,2100.0,3082.0,7,Forest,Forest
3,1901.0,1937.0,2234.0,2910.0,1017.5,1129.0,1385.0,1951.0,345.0,385.0,...,2201.0,2226.0,3085.0,2495.5,1986.0,2060.0,2708.0,7,Forest,Forest
4,1879.5,2026.5,2349.0,2888.0,919.5,1134.0,1435.0,2343.0,303.0,373.5,...,2395.5,2152.0,2756.0,2795.0,2179.0,2014.0,2476.0,7,Forest,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3300,2955.5,2885.5,3179.0,3912.0,2391.0,2294.0,2725.0,3652.0,626.0,692.0,...,2232.5,2363.0,2859.0,3005.0,2158.0,2340.0,2684.0,3,Built-up,Built up
3301,2932.0,3434.0,3763.0,4097.0,2127.0,2967.5,3378.0,3772.0,953.0,1104.0,...,2731.0,2780.0,3324.0,2819.0,2598.0,2600.0,2994.0,3,Built-up,Built up
3302,2685.0,3169.0,3626.0,3953.5,2078.0,2625.5,3170.0,3640.0,1102.0,1127.0,...,2727.5,2756.0,2872.0,2880.0,2786.5,2640.0,2758.0,3,Built-up,Built up
3303,1352.0,3222.0,3534.0,3774.0,704.0,3107.5,3353.0,3469.0,281.0,1115.0,...,2117.5,2460.0,2566.0,1950.0,2310.0,2626.0,2584.0,3,Built-up,Built up


In [14]:
test = pd.concat([test_forest, test_cropland, test_wetland, test_shrub, test_grass, test_bare, test_water, test_built_up], ignore_index=True)
test

Unnamed: 0,B11_Q1,B11_Q2,B11_Q3,B11_Q4,B12_Q1,B12_Q2,B12_Q3,B12_Q4,B2_Q1,B2_Q2,...,B8A_Q2,B8A_Q3,B8A_Q4,B8_Q1,B8_Q2,B8_Q3,B8_Q4,LC_Nr,LC_Out,Landcover
0,1549.0,1593.0,1749.666667,2217.0,716.0,807.0,1045.0,1556.0,272.0,306.0,...,1897.0,1836.666667,2022.0,2147.0,1792.0,1691.0,1982.0,7,Forest,Forest
1,1424.5,1320.0,1563.000000,1675.0,607.5,606.0,833.0,878.0,223.0,240.0,...,2024.0,1895.000000,2399.0,2236.5,1987.0,1838.0,2156.0,7,Forest,Forest
2,1717.0,1808.5,1957.000000,3563.0,873.5,1022.0,1439.0,2501.0,256.0,313.0,...,2017.5,1859.000000,3133.0,2027.0,1759.0,1624.0,2840.0,7,Forest,Forest
3,1606.0,1688.5,2052.000000,2454.0,749.0,873.0,1207.0,1798.0,266.5,298.0,...,2093.0,1966.000000,2698.0,2523.0,2068.0,1890.0,2620.0,7,Forest,Forest
4,1681.0,1837.5,1990.000000,2658.5,910.0,1039.5,1228.0,1887.5,305.0,343.0,...,2080.0,2060.000000,2702.5,2075.0,1753.0,1788.0,2296.0,7,Forest,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78316,3189.0,2890.5,2844.000000,3182.0,2972.5,2846.5,2848.0,3276.0,1312.0,976.0,...,2176.0,2073.000000,2316.0,2167.0,1917.0,1782.0,2022.0,3,Built-up,Built up
78317,1389.0,3557.5,3832.000000,3994.0,735.0,3635.5,3899.0,3955.0,285.0,1111.0,...,2483.0,2696.000000,2961.0,1954.0,2252.0,2532.0,2752.0,3,Built-up,Built up
78318,1376.0,3709.0,3940.500000,3963.0,707.0,3779.5,3886.5,3880.0,290.0,1274.0,...,2571.5,2709.000000,2693.0,1830.0,2556.0,2725.0,2554.0,3,Built-up,Built up
78319,3904.0,3587.5,4037.000000,4515.0,3472.0,3207.0,4012.0,4244.0,1668.0,1237.0,...,2812.5,2577.500000,3091.0,3208.0,2589.5,2501.0,2930.0,3,Built-up,Built up


## Compute indices

In [15]:
def ndvi(df, nir_band, red_band):
  ndvi = (df[nir_band] - df[red_band]) / (df[nir_band] + df[red_band])
  return ndvi

In [16]:
def ndwi_mcf(df, green_band, nir_band):
  ndwi_mcf = (df[green_band] - df[nir_band]) / (df[green_band] + df[nir_band])
  return ndwi_mcf

In [17]:
def ndwi_gao(df, nir_band, swir_band):
  ndwi_gao = (df[nir_band] - df[swir_band]) / (df[nir_band] + df[swir_band])
  return ndwi_gao

In [18]:
def savi(df, nir_band, red_band):
  savi = (df[nir_band] - df[red_band]) / (df[nir_band] + df[red_band] + 0.5) * (1.0 + 0.5)
  return savi

In [19]:
def evi(df, nir_band, red_band, blue_band):
  evi = 2.5 * ((df[nir_band] - df[red_band]) / (df[nir_band] + 6 * df[red_band] - 7.5 * df[blue_band] + 1))
  return evi

In [20]:
train['NDVI_Q1'] = ndvi(train, 'B8_Q1', 'B4_Q1')
train['NDWI_MCF_Q1'] = ndwi_mcf(train, 'B3_Q1', 'B8_Q1')
train['NDWI_GAO_Q1'] = ndwi_gao(train, 'B8_Q1', 'B12_Q1')
train['SAVI_Q1'] = savi(train, 'B8_Q1', 'B4_Q1')
train['EVI_Q1'] = evi(train, 'B8_Q1', 'B4_Q1', 'B2_Q1')
train['NDVI_Q2'] = ndvi(train, 'B8_Q2', 'B4_Q2')
train['NDWI_MCF_Q2'] = ndwi_mcf(train, 'B3_Q2', 'B8_Q2')
train['NDWI_GAO_Q2'] = ndwi_gao(train, 'B8_Q2', 'B12_Q2')
train['SAVI_Q2'] = savi(train, 'B8_Q2', 'B4_Q2')
train['EVI_Q2'] = evi(train, 'B8_Q2', 'B4_Q2', 'B2_Q2')
train['NDVI_Q3'] = ndvi(train, 'B8_Q3', 'B4_Q3')
train['NDWI_MCF_Q3'] = ndwi_mcf(train, 'B3_Q3', 'B8_Q3')
train['NDWI_GAO_Q3'] = ndwi_gao(train, 'B8_Q3', 'B12_Q3')
train['SAVI_Q3'] = savi(train, 'B8_Q3', 'B4_Q3')
train['EVI_Q3'] = evi(train, 'B8_Q3', 'B4_Q3', 'B2_Q3')
train['NDVI_Q4'] = ndvi(train, 'B8_Q4', 'B4_Q4')
train['NDWI_MCF_Q4'] = ndwi_mcf(train, 'B3_Q4', 'B8_Q4')
train['NDWI_GAO_Q4'] = ndwi_gao(train, 'B8_Q4', 'B12_Q4')
train['SAVI_Q4'] = savi(train, 'B8_Q4', 'B4_Q4')
train['EVI_Q4'] = evi(train, 'B8_Q4', 'B4_Q4', 'B2_Q4')
train

Unnamed: 0,B11_Q1,B11_Q2,B11_Q3,B11_Q4,B12_Q1,B12_Q2,B12_Q3,B12_Q4,B2_Q1,B2_Q2,...,NDVI_Q3,NDWI_MCF_Q3,NDWI_GAO_Q3,SAVI_Q3,EVI_Q3,NDVI_Q4,NDWI_MCF_Q4,NDWI_GAO_Q4,SAVI_Q4,EVI_Q4
0,1676.5,1855.5,2356.0,2994.0,843.5,1048.5,1518.0,2345.0,328.0,432.0,...,0.341223,-0.455064,0.146712,0.511750,0.721234,0.346628,-0.430530,-0.010558,0.519865,0.832160
1,1682.0,1729.5,2152.0,2983.0,836.0,994.5,1518.0,1958.0,263.0,320.0,...,0.427598,-0.466317,0.049468,0.641260,1.078874,0.525872,-0.533253,0.128616,0.788690,1.418831
2,1927.5,2226.0,2146.0,3504.0,968.0,1262.5,1428.0,2341.0,293.0,420.0,...,0.466993,-0.495726,0.190476,0.700367,1.177351,0.499757,-0.563277,0.136640,0.749544,1.123387
3,1901.0,1937.0,2234.0,2910.0,1017.5,1129.0,1385.0,1951.0,345.0,385.0,...,0.419711,-0.501458,0.195936,0.629457,0.952158,0.530376,-0.591069,0.162481,0.795451,1.236495
4,1879.5,2026.5,2349.0,2888.0,919.5,1134.0,1435.0,2343.0,303.0,373.5,...,0.442693,-0.499628,0.167875,0.663921,1.061673,0.387115,-0.467694,0.027599,0.580591,0.902442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3300,2955.5,2885.5,3179.0,3912.0,2391.0,2294.0,2725.0,3652.0,626.0,692.0,...,0.193269,-0.327283,-0.076012,0.289866,0.395575,0.176161,-0.269631,-0.152778,0.264213,0.494465
3301,2932.0,3434.0,3763.0,4097.0,2127.0,2967.5,3378.0,3772.0,953.0,1104.0,...,0.124567,-0.307847,-0.130144,0.186831,0.247423,0.137538,-0.347435,-0.114987,0.206287,0.250000
3302,2685.0,3169.0,3626.0,3953.5,2078.0,2625.5,3170.0,3640.0,1102.0,1127.0,...,0.172291,-0.296024,-0.091222,0.258408,0.343363,0.156879,-0.279814,-0.137856,0.235294,0.308250
3303,1352.0,3222.0,3534.0,3774.0,704.0,3107.5,3353.0,3469.0,281.0,1115.0,...,0.168149,-0.283480,-0.121592,0.252196,0.426926,0.160305,-0.268532,-0.146208,0.240431,0.388466


In [21]:
test['NDVI_Q1'] = ndvi(test, 'B8_Q1', 'B4_Q1')
test['NDWI_MCF_Q1'] = ndwi_mcf(test, 'B3_Q1', 'B8_Q1')
test['NDWI_GAO_Q1'] = ndwi_gao(test, 'B8_Q1', 'B12_Q1')
test['SAVI_Q1'] = savi(test, 'B8_Q1', 'B4_Q1')
test['EVI_Q1'] = evi(test, 'B8_Q1', 'B4_Q1', 'B2_Q1')
test['NDVI_Q2'] = ndvi(test, 'B8_Q2', 'B4_Q2')
test['NDWI_MCF_Q2'] = ndwi_mcf(test, 'B3_Q2', 'B8_Q2')
test['NDWI_GAO_Q2'] = ndwi_gao(test, 'B8_Q2', 'B12_Q2')
test['SAVI_Q2'] = savi(test, 'B8_Q2', 'B4_Q2')
test['EVI_Q2'] = evi(test, 'B8_Q2', 'B4_Q2', 'B2_Q2')
test['NDVI_Q3'] = ndvi(test, 'B8_Q3', 'B4_Q3')
test['NDWI_MCF_Q3'] = ndwi_mcf(test, 'B3_Q3', 'B8_Q3')
test['NDWI_GAO_Q3'] = ndwi_gao(test, 'B8_Q3', 'B12_Q3')
test['SAVI_Q3'] = savi(test, 'B8_Q3', 'B4_Q3')
test['EVI_Q3'] = evi(test, 'B8_Q3', 'B4_Q3', 'B2_Q3')
test['NDVI_Q4'] = ndvi(test, 'B8_Q4', 'B4_Q4')
test['NDWI_MCF_Q4'] = ndwi_mcf(test, 'B3_Q4', 'B8_Q4')
test['NDWI_GAO_Q4'] = ndwi_gao(test, 'B8_Q4', 'B12_Q4')
test['SAVI_Q4'] = savi(test, 'B8_Q4', 'B4_Q4')
test['EVI_Q4'] = evi(test, 'B8_Q4', 'B4_Q4', 'B2_Q4')
test

Unnamed: 0,B11_Q1,B11_Q2,B11_Q3,B11_Q4,B12_Q1,B12_Q2,B12_Q3,B12_Q4,B2_Q1,B2_Q2,...,NDVI_Q3,NDWI_MCF_Q3,NDWI_GAO_Q3,SAVI_Q3,EVI_Q3,NDVI_Q4,NDWI_MCF_Q4,NDWI_GAO_Q4,SAVI_Q4,EVI_Q4
0,1549.0,1593.0,1749.666667,2217.0,716.0,807.0,1045.0,1556.0,272.0,306.0,...,0.553990,-0.520683,0.236111,0.830794,1.976179,0.527553,-0.491347,0.120407,0.791177,1.819511
1,1424.5,1320.0,1563.000000,1675.0,607.5,606.0,833.0,878.0,223.0,240.0,...,0.594102,-0.544538,0.376264,0.890960,1.890177,0.688332,-0.592319,0.421226,1.032296,2.092857
2,1717.0,1808.5,1957.000000,3563.0,873.5,1022.0,1439.0,2501.0,256.0,313.0,...,0.392796,-0.475023,0.060398,0.589068,0.927877,0.434343,-0.537212,0.063471,0.651433,0.914699
3,1606.0,1688.5,2052.000000,2454.0,749.0,873.0,1207.0,1798.0,266.5,298.0,...,0.495253,-0.534091,0.220536,0.742733,1.207795,0.551214,-0.552133,0.186057,0.826698,1.485322
4,1681.0,1837.5,1990.000000,2658.5,910.0,1039.5,1228.0,1887.5,305.0,343.0,...,0.439034,-0.522998,0.185676,0.658419,1.026920,0.489942,-0.545347,0.097646,0.734793,1.091671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78316,3189.0,2890.5,2844.000000,3182.0,2972.5,2846.5,2848.0,3276.0,1312.0,976.0,...,0.073494,-0.195171,-0.230238,0.110224,0.150580,0.057531,-0.149517,-0.236693,0.086286,0.140127
78317,1389.0,3557.5,3832.000000,3994.0,735.0,3635.5,3899.0,3955.0,285.0,1111.0,...,0.163068,-0.261584,-0.212564,0.244575,0.435583,0.145235,-0.252047,-0.179365,0.217830,0.373902
78318,1376.0,3709.0,3940.500000,3963.0,707.0,3779.5,3886.5,3880.0,290.0,1274.0,...,0.149789,-0.257499,-0.175679,0.224660,0.388956,0.150969,-0.256890,-0.206093,0.226428,0.423087
78319,3904.0,3587.5,4037.000000,4515.0,3472.0,3207.0,4012.0,4244.0,1668.0,1237.0,...,0.220893,-0.313895,-0.231998,0.331300,0.762940,0.273359,-0.357739,-0.183161,0.409995,4.095052


## Save train and test set

In [22]:
train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)