In [None]:
import pandas as pd
import rasterio
import matplotlib.pyplot as plt
import numpy as np
import cudf
import cuml
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.model_selection import train_test_split
from scipy.stats import randint
import os
import pickle

In [None]:
# ---- Run configuration -------------------------------------------------------
retrain = False      # True: fit and pickle a new model; False: load the pickled model named by rf_file
rf_file = 'cu_rf5'   # file name of the pickled model inside rf_path
reclassify = True    # Reclassify previously classified images
show_image = True    # plot a preview of every classified scene
num_classify = 50    # bound used by the classification loop to stop early

# ---- Directories -------------------------------------------------------------
classified_path = r'/mnt/c/users/attic/hls_kelp/imagery/rf_classified_S30'
unclassified_path = r'/mnt/c/users/attic/hls_kelp/imagery/rf_prepped_v2'
rf_path = r'/mnt/c/users/attic/hls_kelp/random_forest/'
training_path = r'/mnt/c/users/attic/hls_kelp/imagery/rf_training_v3_cleaned'

# os.listdir returns entries in arbitrary order; sort them so the subset of
# files selected by the num_classify cutoff is the same on every run.
unclassified_files = sorted(os.listdir(unclassified_path))


In [None]:
if retrain:
    # Build the training table: every raster in training_path contributes its
    # pixels as columns of a (bands, pixels) matrix.
    # Sorted because os.listdir order is arbitrary and the seeded train/test
    # split below depends on row order.
    training_files = sorted(os.listdir(training_path))
    training_data = []
    for file in training_files:
        with rasterio.open(os.path.join(training_path, file)) as src:
            training_img = src.read()  # (bands, rows, cols)
        # Keep only rows 2600-3599 of each scene.
        training_img = training_img[:, 2600:3600, :]
        # Flatten the two spatial axes so each column is one pixel.
        file_data = training_img.reshape(training_img.shape[0], -1)
        training_data.append(file_data)

    if not training_data:
        # np.hstack would fail with a less helpful message on an empty list.
        raise ValueError(f"No training rasters found in {training_path}")

    combined_training_data = np.hstack(training_data)

    # Transpose to one row per pixel; the last column (last raster band) is the
    # label and all preceding columns are the features.
    df = cudf.DataFrame(combined_training_data.T)
    X = df.iloc[:, :-1].astype('float32')
    # cuML's RandomForestClassifier documents int32 labels; cast explicitly
    # instead of relying on an implicit conversion inside fit().
    y = df.iloc[:, -1].astype('int32')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.95, random_state=42)

    cu_rf_params = {
        'n_estimators': 150,  # number of trees
        'max_depth': 300,     # maximum tree depth
        'n_bins': 50,         # bins used when searching for splits
        'n_streams': 16       # parallel streams used while building the forest
    }

    cu_rf = cuRF(**cu_rf_params)
    cu_rf.fit(X_train, y_train)

    # Report accuracy on the held-out 5 % of pixels.
    y_pred = cu_rf.predict(X_test)
    accuracy = cuml.metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
else:
    print("Retrain set to False")

In [None]:
if retrain:
    # Persist the freshly trained model so later runs can load it instead of
    # retraining. makedirs creates any missing parent directories and does not
    # fail when the directory already exists.
    os.makedirs(rf_path, exist_ok=True)
    with open(os.path.join(rf_path, rf_file), 'wb') as f:
        pickle.dump(cu_rf, f)

In [None]:
if not retrain:
    # NOTE: pickle.load can execute arbitrary code stored in the file; only
    # load model files that were produced by this notebook.
    with open(os.path.join(rf_path, rf_file), 'rb') as f:
        cu_rf = pickle.load(f)

# Create the output directory once, up front (parents included).
os.makedirs(classified_path, exist_ok=True)

# Window (rows, cols) shown in the preview plots.
preview_rows = slice(2700, 3400)
preview_cols = slice(600, 2000)

for i, file in enumerate(unclassified_files):
    # Visit at most num_classify entries of the listing (skipped files count).
    if i >= num_classify:
        break

    file_name = file.split('_')
    out_path = os.path.join(classified_path, f'{file_name[0]}_kelp_classified.tif')
    if not reclassify and os.path.isfile(out_path):
        print(f"{file} already classified")
        continue

    # Read the six feature bands and the georeferencing, then release the file.
    with rasterio.open(os.path.join(unclassified_path, file)) as src:
        file_img = src.read(indexes=[1, 2, 3, 4, 5, 6])  # (bands, rows, cols)
        src_crs = src.crs
        src_transform = src.transform

    img = file_img
    n_bands, height, width = img.shape
    # The classifier takes a 2D array: one row per pixel, one column per band.
    img_2D = img.reshape(n_bands, -1).T

    ##========== Normalize multi-spectral data ==========##
    # Each band value becomes its share of the pixel's band sum, scaled to 0-255.
    img_sum = img_2D.sum(axis=1)
    epsilon = 1e-10
    denominator = img_sum[:, None] + epsilon
    # np.divide only writes where the mask is True, so the output buffer must be
    # pre-filled; without `out`, zero-sum pixels would hold uninitialized memory.
    img_2D_nor = np.zeros(img_2D.shape, dtype=np.result_type(img_2D.dtype, denominator.dtype))
    np.divide(img_2D, denominator, out=img_2D_nor, where=(img_sum[:, None] != 0))
    img_2D_nor = (img_2D_nor * 255).astype(np.uint8)

    ##========== Classify ==========##
    img_data = cudf.DataFrame(img_2D_nor)
    img_data = img_data.astype(np.float32)
    kelp_pred = cu_rf.predict(img_data)
    # Pixels were flattened row-major from (height, width); restore that shape.
    kelp_img = kelp_pred.values_host.reshape(height, width)

    if show_image:
        print(file)
        fig, (ax_pred, ax_rgb) = plt.subplots(2, 1, figsize=(25, 25))
        ax_pred.imshow(kelp_img[preview_rows, preview_cols])
        ax_pred.set_title(file)
        # Columns 2, 1, 0 of the normalized data are displayed as R, G, B.
        r_nor = img_2D_nor[:, 2].reshape((height, width))
        g_nor = img_2D_nor[:, 1].reshape((height, width))
        b_nor = img_2D_nor[:, 0].reshape((height, width))
        rgb_nor = np.stack([r_nor, g_nor, b_nor], axis=-1)
        rgb_cropped = rgb_nor[preview_rows, preview_cols]
        ax_rgb.imshow(rgb_cropped)
        ax_rgb.set_title("RGB Cropped Image")
        plt.show()

    ##========== Write result ==========##
    data_type = rasterio.int16
    profile = {
        'driver': 'GTiff',
        'width': width,
        'height': height,
        'count': 5,  # first four input bands as read, plus the classification as band 5
        'dtype': data_type,
        'crs': src_crs,
        'transform': src_transform,
        'nodata': 0  # assuming no data is 0
    }
    with rasterio.open(out_path, 'w', **profile) as dst:
        for band_idx in range(4):
            dst.write(file_img[band_idx].astype(data_type), band_idx + 1)
        dst.write(kelp_img.astype(rasterio.uint8), 5)
    print(f'{i+1} / {len(unclassified_files)}')