In [87]:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  7 16:15:55 2022

@author: SkyMap
"""

import numpy as np
import pandas as pd
import os, glob

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
    
import rasterio
import xgboost as xgb


np.random.seed()

In [88]:
def get_index_and_mask_train(fp_mask, nodata_value=0):
    src = rasterio.open(fp_mask)
    mask = src.read()[0].flatten()
    index_nodata = np.where(mask == nodata_value)
    mask_train = np.delete(mask, index_nodata)
    return mask_train, index_nodata

In [89]:
def get_df_flatten_train(fp_img, list_number_band, index_nodata):
    src = rasterio.open(fp_img)
    # return to img train
    list_band_have = list(range(1,src.count+1))
    dfObj = pd.DataFrame()
    if set(list_number_band).issubset(list_band_have):
        img = src.read(list_number_band)
        i = 0
        for band in img:
            band = band.flatten()
            band = np.delete(band, index_nodata)
            name_band = f"band {list_number_band[i]}"
            dfObj[name_band] = band
            i+=1
        return dfObj
    else:
        miss = np.setdiff1d(list_number_band, list_band_have)
        print("*"*15, "ERROR", "*"*15)
        print(f"Image dont have band : {miss.tolist()}")

In [90]:
def create_data_train_from_ones_img(fp_img, fp_mask, list_band_to_train, size_get=None):
    mask_train, index_nodata = get_index_and_mask_train(fp_mask)
    df_dataset = get_df_flatten_train(fp_img, list_band_to_train, index_nodata)
    df_dataset['label'] = mask_train - 1
    print(np.unique(mask_train - 1),'zeeeeeeeee')


    g = df_dataset.groupby('label', group_keys=False)
    g = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
    g = pd.DataFrame(g)
    # size_get = 3000       # sample size
    if size_get:
        replace = False  # with replacement
        fn = lambda obj: obj.loc[np.random.choice(obj.index, size_get, replace),:]
        a= g.groupby('label', as_index=False).apply(fn)
        # print(a)
        return pd.DataFrame(a)
    else:
        return g

In [91]:
def create_data_train_all_img(list_fp_img, list_fp_mask, list_band_to_train, out_fp_csv_train, size_get=None):
    print(list_fp_img)
    dir_name_img = os.path.dirname(list_fp_img[0])
    # list_df_all = []
    for fp_mask in list_fp_mask:
        base_name = os.path.basename(fp_mask)
        fp_img = os.path.join(dir_name_img, base_name)
        df_tmp = create_data_train_from_ones_img(fp_img, fp_mask, list_band_to_train, size_get)
        # print(df_tmp)
        # list_df_all.append(df_tmp)
        # result = pd.concat(list_df_all)
        df_tmp.to_csv(out_fp_csv_train, mode='a', index=False, header=False)
        print('done ', base_name)
        # print(result)
    # print(np.unique(result['label'].to_numpy()))
    # result.to_csv(out_fp_csv_train)

In [92]:
def create_data_train(csv_training):
    datasets = pd.read_csv(csv_training).iloc[:, 2:]
    print(datasets.shape)
    X = datasets.iloc[:, :-1]
    Y = datasets.iloc[:, -1]
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    Y = np_utils.to_categorical(encoded_Y)
    return X,Y

In [93]:
def create_data_train_Xgboost(csv_training, training_per = 0.8):
    datasets = pd.read_csv(csv_training)#.iloc[:, 2:]
    ds_train = datasets.sample(frac=training_per)
    ds_test = datasets[~datasets.isin(ds_train)].dropna()

    X_train = ds_train.iloc[:, :-1]
    Y_train = ds_train.iloc[:, -1]
    X_test = ds_test.iloc[:, :-1]
    Y_test = ds_test.iloc[:, -1]

    return X_train, Y_train, X_test, Y_test

In [94]:
list_fp_img = glob.glob(r'E:\WORK\Mongodia\pixel_base\img\*.tif')
list_fp_mask = glob.glob(r'E:\WORK\Mongodia\pixel_base\mask\*.tif')
list_band_to_train = [1,2,3,4,5,6,7]
out_fp_csv_train = r'E:\WORK\Mongodia\pixel_base\training.csv'
fp_model_save = r"E:\WORK\Mongodia\pixel_base\model_5000_v2_num_round_100_max_depth7_7Band.model"
if not os.path.exists(out_fp_csv_train):
    create_data_train_all_img(list_fp_img, list_fp_mask, list_band_to_train, out_fp_csv_train)
X_train, Y_train, X_test, Y_test = create_data_train_Xgboost(out_fp_csv_train, training_per = 0.8)
print('Training ...')
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test, label=Y_test)
num_round = 100
param = {'max_depth': 7, 'eta': 1, 'objective': 'multi:softmax'}
param['nthread'] = 5
param['eval_metric'] = 'auc'
param['num_class'] = 6
param['gpu_id'] = 0
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)
bst.save_model(fp_model_save)


['E:\\WORK\\Mongodia\\pixel_base\\img\\LC08_L2SP_133026_20210609_20210615_02_T1_0.tif', 'E:\\WORK\\Mongodia\\pixel_base\\img\\LC08_L2SP_133026_20210913_20210924_02_T1_0.tif']
[0 1 2 3 4 5] zeeeeeeeee
done  LC08_L2SP_133026_20210609_20210615_02_T1_0.tif
[0 1 2 3 4 5] zeeeeeeeee
done  LC08_L2SP_133026_20210913_20210924_02_T1_0.tif
Training ...
[0]	eval-auc:0.98988	train-auc:0.99353
[1]	eval-auc:0.99544	train-auc:0.99831
[2]	eval-auc:0.99723	train-auc:0.99929
[3]	eval-auc:0.99791	train-auc:0.99968
[4]	eval-auc:0.99845	train-auc:0.99985
[5]	eval-auc:0.99856	train-auc:0.99993
[6]	eval-auc:0.99870	train-auc:0.99996
[7]	eval-auc:0.99885	train-auc:0.99998
[8]	eval-auc:0.99889	train-auc:0.99999
[9]	eval-auc:0.99894	train-auc:1.00000
[10]	eval-auc:0.99893	train-auc:1.00000
[11]	eval-auc:0.99895	train-auc:1.00000
[12]	eval-auc:0.99901	train-auc:1.00000
[13]	eval-auc:0.99903	train-auc:1.00000
[14]	eval-auc:0.99908	train-auc:1.00000
[15]	eval-auc:0.99910	train-auc:1.00000
[16]	eval-auc:0.99909	trai

In [96]:
X_train.shape

(10655, 7)