In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

from exp_main.settings import class_names

### load raw data

In [5]:
# Folder holding the raw input arrays; later cells reuse this path.
s2_data_folder_path = 's2_data'

# Resolve both input paths up front, then load the two arrays.
data_npy_path, gt_npy_path = (
    os.path.join(s2_data_folder_path, npy_name)
    for npy_name in ('data.npy', 'gt.npy')
)
s2_data_raw = np.load(data_npy_path)
gt_raw = np.load(gt_npy_path)

# Show both shapes as a quick sanity check of what was loaded.
(s2_data_raw.shape, gt_raw.shape)

((10000, 13, 64, 64), (10000, 10))

### create the initial DataFrame

In [3]:
# Enumerate every (class name, per-class sample number) pair once; both
# columns below are derived from this single list so they stay aligned.
class_file_pairs = [
    (class_name, file)
    for class_name in class_names
    for file in range(1000)
]

# File name pattern: <class name>_<sample number>.npy
s2_file_names = [
    ''.join((class_name, '_', str(file), '.npy'))
    for class_name, file in class_file_pairs
]
gt_name = [class_name for class_name, _ in class_file_pairs]

# One row per sample: its file name and its class label.
df_raw = pd.DataFrame({'s2_file_names': s2_file_names, 'class_name': gt_name})
df_raw

Unnamed: 0,s2_file_names,class_name
0,AnnualCrop_0.npy,AnnualCrop
1,AnnualCrop_1.npy,AnnualCrop
2,AnnualCrop_2.npy,AnnualCrop
3,AnnualCrop_3.npy,AnnualCrop
4,AnnualCrop_4.npy,AnnualCrop
...,...,...
9995,SeaLake_995.npy,SeaLake
9996,SeaLake_996.npy,SeaLake
9997,SeaLake_997.npy,SeaLake
9998,SeaLake_998.npy,SeaLake


### split the raw data array into separate .npy files (one per sample)

In [11]:
# Write each entry of the raw array to its own .npy file. The file name comes
# from the df_raw row at the same position, so row order must match array order.
for row_pos, sample_file_name in enumerate(df_raw['s2_file_names']):
    sample_path = os.path.join(s2_data_folder_path, sample_file_name)
    np.save(sample_path, s2_data_raw[row_pos])

### add class encoding

In [6]:
# Wrap the ground-truth array in a frame whose columns are labelled with the
# class names (the displayed rows show a single 1.0 per row).
encoded_gt_df = pd.DataFrame(data=gt_raw, columns=class_names)

# Column-wise concat aligns the two frames on their row index.
df = pd.concat(objs=[df_raw, encoded_gt_df], axis='columns')
df

Unnamed: 0,s2_file_names,class_name,AnnualCrop,Forest,HerbaceousVegetation,Highway,Industrial,Pasture,PermanentCrop,Residential,River,SeaLake
0,AnnualCrop_0.npy,AnnualCrop,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AnnualCrop_1.npy,AnnualCrop,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AnnualCrop_2.npy,AnnualCrop,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AnnualCrop_3.npy,AnnualCrop,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AnnualCrop_4.npy,AnnualCrop,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,SeaLake_995.npy,SeaLake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9996,SeaLake_996.npy,SeaLake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9997,SeaLake_997.npy,SeaLake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,SeaLake_998.npy,SeaLake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### split dataset into train/val/test

#### split off val (15%) from the train+test pool

In [7]:
# Seeded stratified shuffle producing a single split that holds out 15% of the rows.
sss = StratifiedShuffleSplit(test_size=0.15, n_splits=1, random_state=7)

# File names stand in for the samples; class names are the stratification labels.
X_values = df['s2_file_names'].to_numpy()
y_values = df['class_name'].to_numpy()

# split() is a generator of (remaining indices, held-out indices); take the only pair.
train_test_index, val_index = next(sss.split(X_values, y_values))

# The indices are matched against df's index labels, which works because df
# carries the default 0..n-1 index. Boolean masks keep rows in df order.
in_train_test = df.index.isin(train_test_index)
in_val = df.index.isin(val_index)
train_test_df = df.loc[in_train_test]
val_df = df.loc[in_val]

#### split the train+test pool into train and test (100 test samples per class)

In [8]:
# Number of rows per class moved from the train+test pool into the test split.
test_samples_per_class = 100

# One seeded generator shared by all classes, so the per-class shuffles are
# reproducible on a fresh kernel (same seed value as the train_test/val split).
split_rng = np.random.RandomState(7)

# Collect the per-class pieces and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
train_parts = []
test_parts = []

for class_name in train_test_df['class_name'].unique():
    class_name_df = train_test_df[train_test_df['class_name'] == class_name]
    # Shuffle the class rows, then cut: first rows -> test, the rest -> train.
    class_name_df = class_name_df.sample(frac=1, random_state=split_rng)
    test_parts.append(class_name_df.iloc[:test_samples_per_class])
    train_parts.append(class_name_df.iloc[test_samples_per_class:])

# pd.concat raises on an empty list, so fall back to an empty frame.
train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

#### check splitting

In [9]:
# Guard against leakage: the three splits must be pairwise disjoint and
# together account for every row of df.
assert train_df.index.intersection(val_df.index).empty, 'train/val overlap'
assert train_df.index.intersection(test_df.index).empty, 'train/test overlap'
assert val_df.index.intersection(test_df.index).empty, 'val/test overlap'
assert len(train_df) + len(val_df) + len(test_df) == len(df), 'rows lost or duplicated'

# Display the (rows, columns) shape of each split.
train_df.shape, val_df.shape, test_df.shape

((7500, 12), (1500, 12), (1000, 12))

In [10]:
# Per-class row counts for each split, in train / val / test order.
tuple(
    split_df['class_name'].value_counts()
    for split_df in (train_df, val_df, test_df)
)

(HerbaceousVegetation    750
 Residential             750
 PermanentCrop           750
 River                   750
 Highway                 750
 SeaLake                 750
 Pasture                 750
 AnnualCrop              750
 Industrial              750
 Forest                  750
 Name: class_name, dtype: int64,
 HerbaceousVegetation    150
 Industrial              150
 Forest                  150
 Residential             150
 Pasture                 150
 River                   150
 Highway                 150
 PermanentCrop           150
 SeaLake                 150
 AnnualCrop              150
 Name: class_name, dtype: int64,
 River                   100
 Highway                 100
 PermanentCrop           100
 SeaLake                 100
 HerbaceousVegetation    100
 Industrial              100
 Forest                  100
 Residential             100
 AnnualCrop              100
 Pasture                 100
 Name: class_name, dtype: int64)

In [11]:
csv_folder_path = 'csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(csv_folder_path, exist_ok=True)

# Write each split to its own CSV; index=False leaves the row index out of the file.
for csv_name, split_df in (
    ('train.csv', train_df),
    ('val.csv', val_df),
    ('test.csv', test_df),
):
    split_df.to_csv(os.path.join(csv_folder_path, csv_name), index=False)