In [1]:
# load the sklearn helper used to build the train, validation and test splits,
# plus the libraries needed for bootstrapping
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [2]:
# Directory holding the input CSVs and receiving the generated split files.
# The '~' is expanded here, once, so the value is a usable path for any consumer
# rather than depending on each reader/writer to expand it.
path = os.path.expanduser('~/Documents/GitHub/S2-classification/Data')

In [17]:
def _load_normalized(data_dir, name, keep_specie=True):
    """Read `<name>.csv` from `data_dir` and min-max scale its `<name>_pen` column.

    Steps, in order:
      1. read the CSV with the 'ID' column as index;
      2. drop the 'Unnamed: 0' column;
      3. divide `<name>_pen` by the 'n' column;
      4. rescale the result to [0, 1] using its own min and max;
      5. drop 'n' (and 'specie' too when `keep_specie` is False).

    Parameters
    ----------
    data_dir : str
        Directory containing the CSV file.
    name : str
        File stem; also the prefix of the value column (`<name>_pen`).
    keep_specie : bool, default True
        Whether the 'specie' column is kept in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, indexed by 'ID'.
    """
    value_col = name + '_pen'
    raw = pd.read_csv(os.path.join(data_dir, name + '.csv'), index_col='ID')
    raw = raw.drop(columns=['Unnamed: 0'])

    per_n = raw[value_col] / raw['n']
    # Min and max are taken over every row of the file, so the same scaling
    # constants are shared by rows that later end up in different splits.
    # NOTE(review): confirm that scaling before the split is intended.
    lowest, highest = per_n.min(), per_n.max()
    scaled = (per_n - lowest) / (highest - lowest)

    unwanted = ['n'] if keep_specie else ['n', 'specie']
    return raw.assign(**{value_col: scaled}).drop(columns=unwanted)


ndvi = _load_normalized(path, 'ndvi').sort_values(by=['specie'])

# Only b08 keeps its 'specie' column so the merged band table carries it once.
b02 = _load_normalized(path, 'b02', keep_specie=False)
b03 = _load_normalized(path, 'b03', keep_specie=False)
b04 = _load_normalized(path, 'b04', keep_specie=False)
b08 = _load_normalized(path, 'b08')

# Merge all bands (column-wise concat aligns rows on the ID index)
bands = pd.concat([b02, b03, b04, b08], axis=1).sort_values(by=['specie'])

# Concat bands and ndvi; 'specie' is dropped from ndvi because bands already has it
both = pd.concat([ndvi.drop(columns=['specie']), bands], axis=1)


In [19]:
# Tally how many rows each specie contributes to the merged table.
res = both.loc[:, 'specie'].value_counts()

In [26]:
# Move the ID index (set by index_col='ID' when the CSVs were read) into a
# regular column. Guarded so that re-running this cell does not stack an extra
# 'index' column on top of a frame that was already reset.
if 'ID' not in both.columns:
    both = both.reset_index()

In [27]:
# Split into 60% train, 20% validation and 20% test.
# First hold out 20% for test, then take 25% of the remaining 80% (= 20% of the
# total) for validation. Both splits are stratified on 'specie' so each subset
# keeps the same proportion of samples per specie.
# random_state is fixed so the split is reproducible across kernel restarts,
# matching the fixed seed already used by the per-specie sampling.
train, test = train_test_split(both, test_size=0.2, stratify=both['specie'], random_state=1)
train, val = train_test_split(train, test_size=0.25, stratify=train['specie'], random_state=1)

In [28]:
def sample_or_boostrap(df, limit, random_state=1):
    """Return exactly `limit` rows built from `df`.

    * More rows than `limit`: draw `limit` rows without replacement.
    * Exactly `limit` rows: return a copy of `df`.
    * Fewer rows than `limit`: keep every original row and append
      `limit - len(df)` extra rows drawn with replacement (bootstrap).

    The original index labels are kept; callers that need a clean index
    should reset it themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to sample from. An empty frame cannot be bootstrapped;
        `DataFrame.sample` is expected to raise in that case.
    limit : int
        Number of rows in the returned frame.
    random_state : int, default 1
        Seed forwarded to `DataFrame.sample`; the default reproduces the
        previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        A new frame with `limit` rows.
    """
    n_rows = len(df)
    if n_rows > limit:
        return df.sample(n=limit, replace=False, random_state=random_state)
    if n_rows == limit:
        # Nothing to draw: skip the concat with an empty sample.
        return df.copy()
    extra = df.sample(n=limit - n_rows, replace=True, random_state=random_state)
    return pd.concat([df, extra])


In [29]:
# For train, fix the number of samples per specie at 100: larger groups are
# subsampled and smaller ones are padded by bootstrapping.
# Groups are iterated explicitly instead of going through groupby.apply, so each
# sub-frame keeps its 'specie' column no matter how apply treats grouping
# columns (operating on them is deprecated since pandas 2.2).
train = pd.concat(
    [sample_or_boostrap(group, limit=100) for _, group in train.groupby('specie')]
).reset_index(drop=True)

In [30]:
# For validation and test, fix the number of samples per specie at 30.
# Groups are iterated explicitly instead of going through groupby.apply, so each
# sub-frame keeps its 'specie' column no matter how apply treats grouping
# columns (operating on them is deprecated since pandas 2.2).
val = pd.concat(
    [sample_or_boostrap(group, limit=30) for _, group in val.groupby('specie')]
).reset_index(drop=True)

test = pd.concat(
    [sample_or_boostrap(group, limit=30) for _, group in test.groupby('specie')]
).reset_index(drop=True)

In [32]:
# Write each split next to the input data, without the row index.
for _filename, _frame in (('train.csv', train), ('val.csv', val), ('test.csv', test)):
    _frame.to_csv(os.path.join(path, _filename), index=False)