# Create train and test sets

## Feature groups
- Sentinel-2 bands
- Indices based on Sentinel-2 bands (e.g. NDVI, NDWI, SAVI, EVI)
- DEM
- GLCM

## Setup

In [1]:
from google.colab import drive
import pandas as pd

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder used below is reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch the working directory to the project folder; the relative file names
# used for reading and writing in the cells below resolve against it.
%cd /content/drive/MyDrive/land_cover_classification_kaza/automl

/content/drive/MyDrive/land_cover_classification_kaza/automl


## Load raw data

We're using the labels provided by Nuno and the corresponding Sentinel-2 bands.

In [4]:
# Read the labelled samples into a DataFrame. The file name is relative, so it
# is looked up in the current working directory.
file_name = 'RND_Binga_1000.csv'
df = pd.read_csv(file_name)

In [5]:
# Preview the loaded frame: Sentinel-2 band columns plus the label columns.
df

Unnamed: 0,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12,class,LC2023
0,542.294118,686.076923,869.000000,1314.500000,1749.250000,1914.444444,1948.250000,2277.333333,2616.000000,1867.000000,5,Shrub
1,566.842105,808.500000,1061.200000,1638.714286,2009.000000,2412.250000,2648.750000,2837.666667,3107.000000,1995.800000,3,Forest
2,1250.285714,1635.000000,2214.000000,2658.333333,2989.333333,3210.000000,3216.000000,3572.000000,4027.000000,2747.000000,2,Crop
3,673.200000,861.000000,1196.000000,1561.400000,1857.892857,2120.166667,2334.400000,2563.433333,2975.250000,2087.666667,5,Shrub
4,795.500000,991.000000,1188.125000,1569.285714,2205.368421,2464.100000,2594.357143,2784.600000,2832.400000,2037.833333,3,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...
1258,921.880000,1497.058824,2719.764706,3164.312500,3425.750000,3591.891667,3683.636364,3777.244444,5072.571429,4572.187500,2,Crop
1259,795.681818,1210.539130,2206.444444,2655.533333,2907.400000,3107.285714,3170.500000,3291.800000,4558.333333,3892.666667,2,Crop
1260,854.928571,1364.608696,2593.454545,3039.000000,3304.100000,3490.909091,3549.142857,3736.571429,4980.000000,4364.250000,2,Crop
1261,796.954545,1239.200000,2445.200000,2837.000000,2992.666667,3182.000000,3275.200000,3427.000000,4977.400000,4601.750000,2,Crop


### Add indices

In [6]:
# Index features are currently disabled, so only the raw band columns are
# exported. Uncomment the lines below to add NDVI and NDWI columns computed
# from the B8, B4 and B3 bands.
#df['NDVI'] = (df['B8'] - df['B4']) / (df['B8'] + df['B4'])
#df['NDWI'] = (df['B3'] - df['B8']) / (df['B3'] + df['B8'])

In [7]:
#df

### Shuffle data and split into train and test sets

In [8]:
# Put all rows into a random order (fixed seed for reproducibility) and
# renumber the index from 0 so the positional split below is easy to follow.
# `df` is overwritten, so re-running this cell without reloading the data
# shuffles the already shuffled frame again.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [9]:
# (rows, columns) of the shuffled frame; the row count drives the split sizes
# computed below.
df.shape

(1263, 12)

In [10]:
# Size of the training set: 70 % of all rows, rounded to a whole number.
train_fraction = 0.7
n_train_samples = int(round(len(df) * train_fraction, 0))
n_train_samples

884

In [11]:
# The test set receives every row that is not assigned to training.
n_rows_total = len(df)
n_test_samples = n_rows_total - n_train_samples
n_test_samples

379

In [12]:
# The leading n_train_samples rows of the shuffled frame form the training set.
train = df.head(n_train_samples)
train

Unnamed: 0,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12,class,LC2023
0,550.333333,603.500000,543.777778,478.333333,278.666667,283.000000,221.500000,218.307692,210.333333,177.142857,6,Water
1,761.728571,969.202381,1498.000000,1952.222222,2275.200000,2539.833333,2663.555556,2903.750000,3354.500000,2217.666667,3,Forest
2,686.571429,861.375000,1446.666667,1899.500000,2319.500000,2613.333333,2893.600000,2994.000000,2897.333333,1894.500000,5,Shrub
3,636.750000,802.666667,1104.800000,1396.250000,1764.500000,2053.300000,2140.200000,2365.333333,2637.000000,1938.500000,2,Crop
4,664.000000,796.375000,1057.000000,1485.000000,1940.142857,2274.666667,2472.000000,2657.428571,2638.500000,1746.000000,3,Forest
...,...,...,...,...,...,...,...,...,...,...,...,...
879,661.500000,858.166667,969.666667,1338.000000,2023.500000,2213.000000,2264.000000,2474.000000,2632.750000,1977.500000,2,Crop
880,543.100000,551.000000,480.222222,400.555556,288.846154,290.066667,294.250000,282.250000,150.727273,110.857143,6,Water
881,623.000000,803.666667,1112.000000,1648.250000,2064.333333,2273.666667,2465.500000,2719.666667,3167.000000,1991.000000,5,Shrub
882,591.833333,787.166667,1300.666667,1807.571429,2147.750000,2498.375000,2636.500000,3046.200000,3105.500000,2122.000000,3,Forest


In [13]:
# Write the training split to the current working directory; index=False
# leaves the row index out of the file.
# NOTE(review): both `class` and `LC2023` are exported. They look like two
# encodings of the same label, so confirm that downstream training uses only
# one of them as the target and drops the other.
train.to_csv('train.csv', index=False)

In [14]:
# All rows after the training rows become the hold-out set, so the two splits
# are disjoint and together cover the whole shuffled frame.
test_start = n_train_samples
test = df.iloc[test_start:, :]
test

Unnamed: 0,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12,class,LC2023
884,666.571429,862.400000,1200.400000,1496.200000,1816.000000,2071.750000,2276.250000,2499.875000,2972.500000,2059.25,3,Forest
885,666.285714,841.000000,1120.800000,1576.333333,2086.500000,2424.500000,2607.000000,2770.666667,3045.500000,1971.00,5,Shrub
886,670.888889,928.600000,1438.000000,1837.750000,2140.500000,2412.857143,2576.750000,2857.500000,3065.000000,2187.50,7,Wetlands
887,1061.428571,1372.200000,1891.714286,2184.375000,2435.000000,2624.000000,2871.375000,2978.500000,4128.333333,3130.00,2,Crop
888,1180.750000,1704.666667,2536.000000,2854.600000,3177.333333,3459.500000,3719.000000,3864.500000,4646.000000,3445.00,0,Bare
...,...,...,...,...,...,...,...,...,...,...,...,...
1258,611.000000,842.000000,1128.000000,1422.125000,1962.166667,2207.000000,2335.000000,2558.083333,2698.666667,2215.50,7,Wetlands
1259,1561.733333,1830.400000,2213.214286,2686.550000,2806.925000,2906.000000,2595.444444,3103.500000,4402.714286,4194.00,1,BuiltUp
1260,974.904762,1326.777778,1890.933333,2079.600000,2272.375000,2462.818182,2603.764706,2719.615385,3952.000000,3292.65,1,BuiltUp
1261,594.162791,803.700000,1273.809524,1526.625000,1827.533333,2076.500000,2265.157895,2410.000000,3054.625000,2175.75,5,Shrub


In [15]:
# Write the test split to the current working directory; index=False leaves
# the row index out of the file.
test.to_csv('test.csv', index=False)