In [None]:
# pip install imbalanced-learn

from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

2024-07-11 09:56:20.207473: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-11 09:56:20.264438: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import os
import numpy as np
import rasterio
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_dataset(data_dir, img_size=(256, 256)):
    """
    Read the train, val and test splits stored under ``data_dir``.

    Every split lives in ``<data_dir>/<split>`` and holds an ``input``
    sub-directory (feature rasters) and an ``output`` sub-directory (label
    rasters). After all three splits are loaded, NaN values in the feature
    arrays are replaced with 0; the label arrays are returned as loaded.

    Parameters:
    - data_dir (str): Directory containing train, val, and test subdirectories.
    - img_size (tuple): Desired size of the image (height, width).

    Returns:
    - train_data (tuple): Tuple containing (X_train, y_train).
    - val_data (tuple): Tuple containing (X_val, y_val).
    - test_data (tuple): Tuple containing (X_test, y_test).
    """
    # Phase 1: read every split from disk, in train -> val -> test order.
    loaded_splits = [
        load_data_from_dir(
            os.path.join(data_dir, split_name, 'input'),
            os.path.join(data_dir, split_name, 'output'),
            img_size,
        )
        for split_name in ('train', 'val', 'test')
    ]

    # Phase 2: clean the feature arrays only; labels pass through untouched.
    return tuple(
        (preprocess_data(features), targets)
        for features, targets in loaded_splits
    )

def preprocess_data(images):
    """
    Zero out NaN entries of an image array.

    The array is modified in place and the same object is returned.

    Parameters:
    - images (numpy.ndarray): Array of input images.

    Returns:
    - images (numpy.ndarray): The input array with every NaN set to 0.
    """
    nan_positions = np.isnan(images)
    # Overwrite only the flagged entries; all other values stay untouched.
    np.putmask(images, nan_positions, 0)
    return images

def load_data_from_dir(input_dir, output_dir, img_size):
    """
    Load data (images and labels) from input and output directories.

    Every ``<name>.tif`` file in ``input_dir`` is paired with
    ``<name>_cl.tif`` in ``output_dir``. Files are visited in sorted name
    order so that the sample order is the same on every run and platform.

    Parameters:
    - input_dir (str): Directory containing input images.
    - output_dir (str): Directory containing output images.
    - img_size (tuple): Desired size of the image (height, width).

    Returns:
    - images (numpy.ndarray): Array of loaded input images.
    - labels (numpy.ndarray): Array of corresponding output images.
      Both are empty arrays (shape (0,)) when no '.tif' file is found.
    """
    extension = '.tif'
    images = []
    labels = []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(extension):
            continue

        # Load input image (X)
        input_path = os.path.join(input_dir, filename)
        images.append(load_tiff_image(input_path, img_size))

        # Load corresponding output image (y). Only the trailing extension is
        # rewritten, so a '.tif' appearing earlier in the name is left alone.
        output_filename = filename[:-len(extension)] + '_cl' + extension
        output_path = os.path.join(output_dir, output_filename)
        # With is_label=True the loader returns a uint8 0/1 mask, which cannot
        # contain NaN, so no NaN clean-up is needed here.
        labels.append(load_tiff_image(output_path, img_size, is_label=True))

    # Always hand back ndarrays, even when nothing matched, so callers can
    # rely on array operations without checking the type first.
    return np.array(images), np.array(labels)

def load_tiff_image(path, img_size, is_label=False):
    """
    Read all bands of a TIFF raster into a channel-last array.

    The raster is cropped (not resampled) to its top-left
    ``img_size[0]`` x ``img_size[1]`` window; a raster smaller than that
    window is returned at its own size.

    Parameters:
    - path (str): Path to the TIFF image.
    - img_size (tuple): Desired size of the image (height, width).
    - is_label (bool): Whether the image is a label image.

    Returns:
    - img (numpy.ndarray): Array laid out as height x width x bands. For a
      label image, a uint8 mask that is 1 where the raster equals 1 and 0
      elsewhere.
    """
    with rasterio.open(path) as dataset:
        bands_first = dataset.read()

    # bands x height x width -> height x width x bands
    channel_last = np.transpose(bands_first, (1, 2, 0))
    # Slicing keeps the top-left corner; nothing is interpolated.
    cropped = channel_last[:img_size[0], :img_size[1], :]

    if not is_label:
        return cropped

    # Binary target: 1 for Marine Debris, 0 for everything else
    # (assuming the Marine Debris class is encoded as 1 in the label raster).
    return (cropped == 1).astype(np.uint8)

# Example usage: load the three splits from the dataset root directory.
data_dir = 'MARIDA'
train_split, val_split, test_split = load_dataset(data_dir)
X_train, y_train = train_split
X_val, y_val = val_split
X_test, y_test = test_split

# One summary line per split, printed in train -> val -> test order.
shape_summary = (
    f"Training data: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}",
    f"Validation data: X_val shape = {X_val.shape}, y_val shape = {y_val.shape}",
    f"Testing data: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}",
)
print(*shape_summary, sep="\n")


Training data: X_train shape = (694, 256, 256, 11), y_train shape = (694, 256, 256, 1)
Validation data: X_val shape = (328, 256, 256, 11), y_val shape = (328, 256, 256, 1)
Testing data: X_test shape = (359, 256, 256, 11), y_test shape = (359, 256, 256, 1)


In [None]:
# Sanity check: dimensions of the training label stack.
print(y_train.shape)

(694, 256, 256, 1)


In [None]:
# Sanity check: dimensions of the training feature stack (last axis = bands).
print(X_train.shape)

(694, 256, 256, 11)


In [None]:
from imblearn.over_sampling import SMOTE
# Flatten the training stacks to one row per pixel: 11 band values in X and a
# single label column in Y. The row count is inferred with -1 rather than
# hard-coded, so the cell keeps working when the number or size of the
# training tiles changes.
X = np.reshape(X_train, (-1, 11))
Y = np.reshape(y_train, (-1, 1))
print(Y.shape)

(45481984, 1)


In [None]:
# Balance the two classes by oversampling with SMOTE; the fixed random_state
# makes the resampling repeatable.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, Y)
# Report the resampled feature and label shapes, one per line.
print(X_res.shape, y_res.shape, sep="\n")

(90960082, 11)
(90960082,)


Band 1 - Coastal Aerosol: 443.9 nm
Band 2 - Blue: 496.6 nm
Band 3 - Green: 560.0 nm
Band 4 - Red: 664.5 nm
Band 5 - Vegetation Red Edge: 703.9 nm
Band 6 - Vegetation Red Edge: 740.2 nm
Band 7 - Vegetation Red Edge: 782.5 nm
Band 8 - NIR (Near Infrared): 835.1 nm
Band 8A - Narrow NIR: 864.8 nm
Band 9 - Water Vapor: 945.0 nm
Band 10 - SWIR - Cirrus: 1373.5 nm
Band 11 - SWIR: 1613.7 nm
Band 12 - SWIR: 2202.4 nm

In [None]:
import pandas as pd
# Wrap the resampled pixel matrix in a DataFrame once and pull out one Series
# per band column, instead of building a new DataFrame for every band.
# Band meanings follow the variable names (column order is assumed to match).
train_bands = pd.DataFrame(X_res)
Coastal = train_bands.iloc[:, 0]      # coastal aerosol
Blue = train_bands.iloc[:, 1]         # blue
Green = train_bands.iloc[:, 2]        # green
RED = train_bands.iloc[:, 3]          # red
RedEdge1 = train_bands.iloc[:, 4]
RedEdge2 = train_bands.iloc[:, 5]
RedEdge3 = train_bands.iloc[:, 6]
NIR = train_bands.iloc[:, 7]
Narrow_NIR = train_bands.iloc[:, 8]
SWIR1 = train_bands.iloc[:, 9]
SWIR2 = train_bands.iloc[:, 10]

# print(first_column.tolist())


In [None]:
# Each band Series holds one value per resampled pixel.
Blue.shape

(90960082,)

# NDVI

In [None]:
# Normalised difference of the NIR and red bands.
NDVI = (NIR - RED) / (NIR + RED)
# np.isinf() is True for both +inf and -inf, so one count covers both signs.
print(np.isinf(NDVI).sum())
# A 0/0 division (NIR == RED == 0) yields NaN, which isinf() does not report,
# so count those separately.
print(np.isnan(NDVI).sum())

0
0


# Plastic Index

In [None]:
# NOTE(review): the Plastic Index is usually defined as NIR / (NIR + RED).
# The NIR - RED denominator used here is zero whenever NIR == RED, which is
# where the infinite values counted below come from. Confirm the intended
# formula, and keep the training and test cells identical if it changes.
Plastic = NIR / (NIR - RED)
# Count infinite entries for the index and for its negation.
for signed_index in (Plastic, -Plastic):
    print(np.isinf(signed_index).sum())

13
13


# Floating Debris Index (FDI)

In [None]:
# Wavelengths (nm) used for the interpolation factor; presumably the nominal
# band centres of the NIR, red and SWIR1 bands.
NIR_NM, RED_NM, SWIR1_NM = 842, 665, 1610

# FDI = NIR minus a baseline interpolated between RedEdge2 and SWIR1.
FDI = NIR - (RedEdge2 + (SWIR1 - RedEdge2) * ((NIR_NM - RED_NM) / (SWIR1_NM - RED_NM)) * 10)
# Only the last expression of a cell is displayed, and the infinite-value
# count is the same for FDI and -FDI, so a single check is shown.
np.isinf(FDI).sum()

0

# Water Index

In [None]:
# Normalised difference of the green and NIR bands.
WaterIndex = (Green - NIR) / (Green + NIR)
# Only the last expression of a cell is displayed, and the infinite-value
# count is the same for WaterIndex and -WaterIndex, so one check is shown.
np.isinf(WaterIndex).sum()

0

In [None]:
# Wrap the resampled labels in a DataFrame so they can be concatenated
# column-wise with the feature Series.
y_res= pd.DataFrame(y_res)

In [None]:
# Feature columns, in the order the model will see them; the label goes last.
train_columns = [FDI, WaterIndex, Plastic, NDVI, SWIR1, NIR, RED]
train_columns.append(y_res)

# NOTE(review): this rebinds X_train, which previously held the image stack,
# so the earlier cells that reshape X_train cannot be re-run after this one.
# NOTE(review): the resulting column labels are not unique (the recorded
# output shows 3 and 0 twice), so columns must be selected by position.
X_train = pd.concat(train_columns, axis=1)
X_train.shape

(90960082, 8)

In [None]:
# Number of infinite values in each column of the training table.
np.isinf(X_train).sum()

0     0
1     0
2    13
3     0
9     0
7     0
3     0
0     0
dtype: int64

In [None]:
# Flag every row that contains +inf or -inf in any column, then keep the rest.
# NaN values are not flagged by this test.
mask = np.isinf(X_train).any(axis=1)
data_filtered = X_train.loc[~mask]

In [None]:
# Row count after removing the rows that contained infinite values.
data_filtered.shape

(90960069, 8)

In [None]:
# Display the filtered table (features first, label in the last column).
data_filtered

Unnamed: 0,0,1,2,3,9,7,3.1,0.1
0,0.011884,0.353668,-3.827362,-0.115544,0.001662,0.006761,0.008528,0
1,0.011987,0.350520,-2.183913,-0.186295,0.001662,0.006863,0.010006,0
2,0.010403,0.339595,-1.956158,-0.203570,0.002163,0.006761,0.010217,0
3,0.010403,0.373687,-2.488416,-0.167313,0.002163,0.006761,0.009478,0
4,0.009423,0.298356,-3.601909,-0.121894,0.002664,0.007170,0.009161,0
...,...,...,...,...,...,...,...,...
90960077,0.032894,0.125777,-7.195486,-0.064973,0.022103,0.042570,0.048486,1
90960078,0.057127,0.128533,-5.269717,-0.086659,0.017067,0.055453,0.065976,1
90960079,0.024138,0.167718,-6.122776,-0.075497,0.004966,0.020601,0.023966,1
90960080,0.055560,-0.085162,18.468830,0.027826,0.014891,0.058933,0.055742,1


In [None]:
# Verify that no infinite values remain in any column after filtering.
np.isinf(data_filtered).sum()

0    0
1    0
2    0
3    0
9    0
7    0
3    0
0    0
dtype: int64

In [None]:
# Split the table into features and target. The row selector is ':' (all
# rows); the previous ':-1' row slice silently dropped the last sample.
X = data_filtered.iloc[:, :-1]  # Features (all rows, every column except the last one)
Y = data_filtered.iloc[:, -1]   # Target variable (all rows, last column)
print(X.shape, Y.shape)

(90960068, 7) (90960068,)


In [None]:
# Distinct label values present in the training target.
Y.unique()

array([0, 1], dtype=uint8)

In [None]:
# Flatten the test stacks to one row per pixel: a single label column in
# test_y and 11 band values in test_x. The row count is inferred with -1
# rather than hard-coded, so the cell keeps working when the number or size
# of the test tiles changes.
test_y = np.reshape(y_test, (-1, 1))
test_x = np.reshape(X_test, (-1, 11))
test_y.shape

(23527424, 1)

In [None]:
# Wrap the flattened test pixels in a DataFrame once and pull out one Series
# per band column, instead of building a new DataFrame for every band.
# These names overwrite the Series extracted earlier from the training data.
# Band meanings follow the variable names (column order is assumed to match).
test_bands = pd.DataFrame(test_x)
Coastal = test_bands.iloc[:, 0]      # coastal aerosol
Blue = test_bands.iloc[:, 1]         # blue
Green = test_bands.iloc[:, 2]        # green
RED = test_bands.iloc[:, 3]          # red
RedEdge1 = test_bands.iloc[:, 4]
RedEdge2 = test_bands.iloc[:, 5]
RedEdge3 = test_bands.iloc[:, 6]
NIR = test_bands.iloc[:, 7]
Narrow_NIR = test_bands.iloc[:, 8]
SWIR1 = test_bands.iloc[:, 9]
SWIR2 = test_bands.iloc[:, 10]

# print(first_column.tolist())


In [None]:
# Normalised difference of the NIR and red bands (test pixels).
NDVI = (NIR - RED) / (NIR + RED)
# np.isinf() is True for both +inf and -inf, so one count covers both signs.
print(np.isinf(NDVI).sum())
# A 0/0 division (NIR == RED == 0) yields NaN, which isinf() does not report,
# so count those separately.
print(np.isnan(NDVI).sum())

0
0


In [None]:
# NOTE(review): the Plastic Index is usually defined as NIR / (NIR + RED).
# The NIR - RED denominator used here is zero whenever NIR == RED. Confirm
# the intended formula, and keep the training and test cells identical if it
# changes.
Plastic = NIR / (NIR - RED)
# Count infinite entries for the index and for its negation.
for signed_index in (Plastic, -Plastic):
    print(np.isinf(signed_index).sum())

0
0


In [None]:
# Wavelengths (nm) used for the interpolation factor; presumably the nominal
# band centres of the NIR, red and SWIR1 bands.
NIR_NM, RED_NM, SWIR1_NM = 842, 665, 1610

# FDI = NIR minus a baseline interpolated between RedEdge2 and SWIR1.
FDI = NIR - (RedEdge2 + (SWIR1 - RedEdge2) * ((NIR_NM - RED_NM) / (SWIR1_NM - RED_NM)) * 10)
# Only the last expression of a cell is displayed, and the infinite-value
# count is the same for FDI and -FDI, so a single check is shown.
np.isinf(FDI).sum()

0

In [None]:
# Normalised difference of the green and NIR bands (test pixels).
WaterIndex = (Green - NIR) / (Green + NIR)
# Only the last expression of a cell is displayed, and the infinite-value
# count is the same for WaterIndex and -WaterIndex, so one check is shown.
np.isinf(WaterIndex).sum()

0

In [None]:
# Feature columns in the same order as the training table (no label column).
test_columns = [FDI, WaterIndex, Plastic, NDVI, SWIR1, NIR, RED]

# NOTE(review): this rebinds X_test, which previously held the image stack,
# so the earlier cell that reshapes X_test cannot be re-run after this one.
# NOTE(review): the resulting column labels are not unique (the recorded
# output shows 3 twice), so columns must be selected by position.
X_test = pd.concat(test_columns, axis=1)
X_test.shape

(23527424, 7)

In [None]:
# Number of infinite values in each column of the test feature table.
np.isinf(X_test).sum()

0    0
1    0
2    0
3    0
9    0
7    0
3    0
dtype: int64

In [None]:
# Configure the forest: 3500 trees, each limited to depth 11, built on two
# worker processes, with a fixed seed so the fit is repeatable.
rf_classifier = RandomForestClassifier(
    n_estimators=3500,
    max_depth=11,
    random_state=42,
    n_jobs=2,
)
# Train on the per-pixel feature table and binary target.
rf_classifier.fit(X, Y)

In [None]:
# Predict a class for every pixel row of the test feature table.
result = rf_classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and the support column counts predicted
# pixels instead of true pixels.
report = classification_report(test_y, result)
report

'              precision    recall  f1-score   support\n\n           0       0.99      1.00      1.00  23405068\n           1       0.91      0.00      0.01    122356\n\n    accuracy                           0.99  23527424\n   macro avg       0.95      0.50      0.50  23527424\nweighted avg       0.99      0.99      0.99  23527424\n'

In [None]:
# Print the report string so its embedded newlines render as a table.
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00  23405068
           1       0.91      0.00      0.01    122356

    accuracy                           0.99  23527424
   macro avg       0.95      0.50      0.50  23527424
weighted avg       0.99      0.99      0.99  23527424



In [None]:
def write_to_file(filename, content):
    """
    Write text to a file, replacing any existing contents.

    The file is always encoded as UTF-8 so the result does not depend on the
    platform's default text encoding.

    Parameters:
    - filename (str): Path of the file to create or overwrite.
    - content (str): Text to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
# Example usage: save the classification report text next to the notebook.
# NOTE(review): the file name says 1000, but the classifier cell in this
# notebook sets n_estimators=3500 — confirm which is intended.
filename = '1000_decision_tree.txt'

write_to_file(filename, report)

In [None]:
import joblib

# Persist the fitted forest to disk.
# NOTE(review): the file name says 1000, but the classifier cell in this
# notebook sets n_estimators=3500 — confirm which is intended.
joblib.dump(rf_classifier, "random_forest_1000_decision_tree.joblib")

['random_forest_1000_decision_tree.joblib']

In [None]:
# Calculate confusion matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
# Keyword arguments make the argument roles explicit: ground truth first,
# model predictions second.
cm = confusion_matrix(y_true=test_y, y_pred=result)
cm

array([[23405032,   122011],
       [      36,      345]])

In [None]:
# test_y is a single-column array with one row per test pixel.
test_y.shape

(23527424, 1)

In [None]:
import os

def shutdown(dry_run=False):
    """
    Power off the machine that is running this notebook.

    Parameters:
    - dry_run (bool): When True, nothing is executed and the command that
      would be run is only returned. Defaults to False (really shut down).

    Returns:
    - command (str): The shell command selected for the current OS.

    Raises:
    - OSError: If os.name is neither 'posix' nor 'nt'.
    """
    commands = {
        'posix': 'shutdown -h now',   # For UNIX/Linux/MacOS
        'nt': 'shutdown /s /t 1',     # For Windows
    }
    if os.name not in commands:
        raise OSError(f"Unsupported operating system: {os.name}")

    command = commands[os.name]
    if not dry_run:
        os.system(command)
    return command

# Calling the shutdown function
# WARNING: this call is unconditional, so executing this cell (including via
# "Run All") powers off the host machine.
shutdown()


In [None]:
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

# # Define ImageDataGenerator with rotation augmentation
# datagen = ImageDataGenerator(
#     rotation_range=45,  # Rotate images randomly up to 45 degrees
#     rescale=1./255  # Normalize pixel values (assuming pixel range 0-255)
# )

# # Example usage:
# batch_size = 32
# # Create generators for training and validation data
# train_generator = datagen.flow(X_train, y_train, batch_size=batch_size)
# val_generator = datagen.flow(X_val, y_val, batch_size=batch_size)

# # Note: No need to augment validation data, so we only apply rotation augmentation to training data


In [None]:
# import tensorflow as tf
# from tensorflow.keras.losses import SparseCategoricalCrossentropy

# # Compute class weights based on frequency
# def compute_class_weights(y_train):
#     class_weights = {}
#     total_samples = len(y_train)
#     unique_classes = np.unique(y_train)
#     class_counts = np.bincount(y_train.flatten())

#     for i, count in enumerate(class_counts):
#         class_weights[i] = (1 / count) * (total_samples / len(unique_classes))

#     return class_weights

# # Example usage
# # Assuming y_train is your training labels (shape: (694, 256, 256))
# y_train_flat = y_train.flatten()
# class_weights = compute_class_weights(y_train_flat)

# # Define weighted loss function
# loss_function = SparseCategoricalCrossentropy(from_logits=True, weight=class_weights)

# # Compile your model with this loss function
# model.compile(optimizer='adam', loss=loss_function, metrics=['accuracy'])

# # Train your model using the generators
# history = model.fit(train_generator, epochs=num_epochs, validation_data=val_generator)
