# BASELINE NOTEBOOK 

## Metrics

The following metrics will be used to evaluate the model:

`auc` = approximates the AUC (area under the curve) of the ROC or PR (precision-recall) curve.

`Precision` 

`Recall`

`F1`

## General `Baseline` for the training data

In [55]:
import os, folium, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imageio.v3 as iio
import geopandas as gpd
from IPython.display import Image, display
import tensorflow as tf
from tensorflow.keras import layers, callbacks, backend, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.applications import EfficientNetB0, DenseNet121
from tensorflow.keras.layers import Input, Dense, Conv2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras import layers

In [6]:
# Load the labelled sites and keep only the rows whose Type is one of the
# seven classes used in this notebook.
labels = gpd.read_file('labels.geojson')
labels_unique = labels[labels.Type.isin([
    'Negative',
    'CAFOs',
    'WWTreatment',
    'Landfills',
    'RefineriesAndTerminals',
    'ProcessingPlants',
    'Mines',
])]

In [10]:
# One-hot encode the Type column. Each label is placed in its own length-1 row
# so that the binarizer receives one collection of labels per sample.
mlb = MultiLabelBinarizer()
type_column = labels_unique.Type.values.reshape(-1, 1)
y = mlb.fit_transform(type_column)
mlb.classes_

array(['CAFOs', 'Landfills', 'Mines', 'Negative', 'ProcessingPlants',
       'RefineriesAndTerminals', 'WWTreatment'], dtype=object)

In [13]:
# Stratified split of the one-hot labels. A fixed random_state makes the split,
# and therefore the baseline metrics computed from y_test, reproducible across
# runs; 42 matches the seed passed to the image generators.
# NOTE(review): train_size=0.1 leaves 90% of the rows in y_test - confirm this
# is intended rather than test_size=0.1.
y_train, y_test = train_test_split(y, train_size=0.1, stratify=y, random_state=42)
def invert_ecoding(encoded_labels, categories):
    """Map one-hot encoded rows back to category names.

    Args:
        encoded_labels: 2-D array; every entry equal to 1.0 marks an active
            class for that row.
        categories: Sequence of category names (list, tuple or NumPy array).

    Returns:
        1-D NumPy array holding one name per active entry, in row-major order.

    Column ``k`` is decoded as ``categories[k - 1]`` because ``'[unk]'`` is
    placed at index 0 of the lookup table.
    NOTE(review): presumably the encoding reserves column 0 for an unknown
    token; MultiLabelBinarizer output has no such column - confirm before
    applying this function to it.
    """
    # Unpack instead of `['[unk]'] + categories`: with a NumPy array the `+`
    # would broadcast element-wise (or fail) rather than prepend, and with a
    # tuple it would raise TypeError.
    vocabulary = ['[unk]', *categories]
    active_columns = np.argwhere(encoded_labels == 1.0)[:, 1]
    return np.take(vocabulary, active_columns)
# Integer class index per active entry: the column position of every 1 in the
# one-hot matrices, listed in row order.
y_tra_orig = np.nonzero(y_train == 1.0)[1]
y_test_orig = np.nonzero(y_test == 1.0)[1]

In [21]:
y_test_orig

array([3, 6, 3, ..., 0, 3, 3])

In [44]:
# Naive baseline: every instance gets the same probability for every class.
# The array is built with an explicit float dtype: np.full_like(y_test, ...)
# inherits y_test's dtype, so if the one-hot labels are integers (presumably
# the case for MultiLabelBinarizer output - verify) the probability would be
# truncated to 0.
n_classes = y_test.shape[1]
constant_probability = np.full(y_test.shape, 1 / n_classes, dtype=float)

# All scores tie, so argmax returns the first column (class index 0) for every row.
y_pred_baseline = np.argmax(constant_probability, axis=1)
y_true_baseline = np.argmax(y_test, axis=1)

# Calculate metrics. Classes that are never predicted have undefined precision;
# zero_division=0 scores them as 0 (the value the default setting also uses)
# without emitting the undefined-metric warning.
accuracy = accuracy_score(y_true_baseline, y_pred_baseline)
precision = precision_score(y_true_baseline, y_pred_baseline,
                            average='weighted', zero_division=0)
recall = recall_score(y_true_baseline, y_pred_baseline,
                      average='weighted', zero_division=0)
f1 = f1_score(y_true_baseline, y_pred_baseline,
              average='weighted', zero_division=0)
auc = roc_auc_score(y_test, constant_probability, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# Collect the baseline scores into a two-column table (metric name, value).
baseline_scores = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1,
    'AUC': auc,
}
results_df = pd.DataFrame({
    'Metric': list(baseline_scores),
    'value': list(baseline_scores.values()),
})

In [47]:
results_df

Unnamed: 0,Metric,value
0,Accuracy,0.289995
1,Precision,0.084097
2,Recall,0.289995
3,F1-score,0.130384
4,AUC,0.5


## `Baseline` CNN

In [94]:
# Sequential 80/20 split of labels_unique: the first 80% of the rows go to the
# training frame and the remaining rows to the test frame.
# Disabled step kept for reference: appending "/naip.png" to Image_Folder.
n_rows = labels_unique.shape[0]
train = int(n_rows * 0.8)
test = int(n_rows * 0.2)  # not used for slicing; the test frame is everything after `train`
labels_unique_train = labels_unique.iloc[:train]
labels_unique_test = labels_unique.iloc[train:]

In [95]:
# Input pipeline settings.
image_size = (720, 720)
batch_size = 32
validation_split = 0.2
test_split = 0.1

from keras.preprocessing import image

# Directory handed to the generators as the base for the image paths.
dir_path = r"/root/code/dvictoria2/methane_source/"

# Both data generators rescale pixel values by 1/255; only the training one
# reserves a validation share.
test_datagen = image.ImageDataGenerator(rescale=1./255)
train_datagen = image.ImageDataGenerator(
    rescale=1./255,
    validation_split=validation_split,
)

In [99]:
# Create the generators for train, validation and test.
# The arguments common to all three are declared once; image size and batch
# size come from the settings (image_size, batch_size) instead of repeated literals.
shared_flow_args = dict(
    directory=dir_path,
    x_col="Image_Folder",
    y_col="Type",
    seed=42,
    class_mode="categorical",
    target_size=image_size,
    batch_size=batch_size,
    color_mode='rgb',
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=labels_unique_train, subset="training", **shared_flow_args)
validation_generator = train_datagen.flow_from_dataframe(
    dataframe=labels_unique_train, subset="validation", **shared_flow_args)
# shuffle=False keeps the test batches in dataframe order, so the evaluated
# samples are deterministic and predictions can be aligned with the labels.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=labels_unique_test, shuffle=False, **shared_flow_args)

Found 9040 validated image filenames belonging to 7 classes.
Found 2260 validated image filenames belonging to 7 classes.
Found 2826 validated image filenames belonging to 7 classes.


In [100]:
# Number of batches drawn from each generator per pass.
# Training keeps floor division: only full batches are counted per epoch.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
# Validation and test round up (ceiling division). Floor division would leave
# the trailing partial batch out of the reported metrics, e.g. 2826 test images
# at batch size 32 give 88 steps that cover only 2816 images.
STEP_SIZE_VALID = -(-validation_generator.n // validation_generator.batch_size)
STEP_SIZE_TEST = -(-test_generator.n // test_generator.batch_size)

In [101]:
## Create the build_model function
def build_model(input_shape=(720, 720, 3), num_classes=7):
    """Build and compile the baseline CNN classifier.

    The network is two Conv2D + MaxPooling2D stages (32 then 64 filters,
    3x3 kernels, ReLU), followed by Flatten, a 32-unit ReLU Dense layer and a
    softmax output layer.

    Args:
        input_shape: Shape of one input image as (height, width, channels).
            Defaults to the 720x720 RGB images used in this notebook.
        num_classes: Number of units in the softmax output layer. Defaults
            to 7.

    Returns:
        A compiled Keras ``Model`` using categorical cross-entropy, the Adam
        optimizer, and ROC-AUC, PR-AUC, precision, recall, F1 and accuracy
        metrics.
    """
    inputs = layers.Input(shape=input_shape)

    # Feature extractor: two convolution / pooling stages.
    features = layers.Conv2D(32, (3, 3), activation='relu')(inputs)
    features = layers.MaxPooling2D((2, 2))(features)
    features = layers.Conv2D(64, (3, 3), activation='relu')(features)
    features = layers.MaxPooling2D((2, 2))(features)

    # Classifier head.
    flat = layers.Flatten()(features)
    hidden = layers.Dense(32, activation='relu')(flat)
    outputs = layers.Dense(num_classes, activation='softmax')(hidden)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=[
            tf.keras.metrics.AUC(name='auc'),
            tf.keras.metrics.AUC(name='pr_auc', curve='PR'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            # NOTE(review): whether the string 'f1_score' is recognised
            # depends on the installed Keras version - confirm it resolves.
            'f1_score',
            'accuracy',
        ],
    )
    return model


In [64]:
# Instantiate the compiled baseline CNN with its default settings.
model = build_model()

In [None]:
# Training callbacks. EarlyStopping and ReduceLROnPlateau are left on their
# default monitored quantity (validation loss); the checkpoint monitors
# 'val_loss' too, so the saved "best" model is chosen on held-out data rather
# than on the training loss.
cb = [
    callbacks.EarlyStopping(patience=5),
    callbacks.ReduceLROnPlateau(patience=3),
    callbacks.ModelCheckpoint('methane_V1.hdf5', monitor='val_loss',
                              verbose=1, save_best_only=True),
]
epochs = 10

# Fit the model on batches streamed from the generators. The images are only
# rescaled; no augmentation is configured on the data generators.
# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=validation_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=epochs,
    callbacks=cb,
)

# Evaluate on the test set. evaluate() returns the loss followed by one value
# per compiled metric, so the metric values are collected with star-unpacking
# (a plain two-name unpack raises "too many values to unpack").
test_loss, *test_metrics = model.evaluate(test_generator, steps=STEP_SIZE_TEST)
print("Test loss:", test_loss)
print("Test metrics:", test_metrics)