In [None]:
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import imageio
import geopandas as gpd
from IPython.display import Image, display
import tensorflow as tf
from tensorflow.keras import layers, callbacks, backend, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing import image
from tensorflow.keras.optimizers import Adam
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt
from keras.preprocessing import image

In [None]:
# CSV with one row per image (columns used below: Type, dataset, img_dir).
# The location can be overridden without editing the notebook by setting the
# METHANE_CSV environment variable; when it is unset, the original workstation
# path is used.
CSV_PATH = os.environ.get(
    "METHANE_CSV",
    "F:\\CNOOC_testing\\Methane_dataset\\METHANE_PROJECT\\smallsize224_all.csv",
)
df = pd.read_csv(CSV_PATH)

In [None]:
# (rows, columns) of the table as loaded, before any cleaning.
df.shape

In [None]:
# Number of rows per Type label in the raw table.
df["Type"].value_counts()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Peek at the first two rows.
df.head(n=2)

In [None]:
# Store the labels as plain strings.
df["Type"] = df["Type"].astype(str)

In [None]:
# Rows flagged as belonging to the training partition.
df_train = df[df["dataset"].eq("train")]

In [None]:
# Rows flagged as belonging to the test partition.
df_test = df[df["dataset"].eq("test")]

In [None]:
# (rows, columns) of the training partition before the class filter below.
df_train.shape

In [None]:
# Restrict the training rows to the seven Type values used in this experiment;
# rows carrying any other label are dropped.
train_types_kept = [
    'Negative', 'CAFOs', 'WWTreatment', 'Landfills',
    'RefineriesAndTerminals', 'ProcessingPlants', 'Mines',
]
df_train = df_train[df_train['Type'].isin(train_types_kept)]

In [None]:
# Restrict the test rows to the same seven Type values as the training rows;
# rows carrying any other label are dropped.
test_types_kept = [
    'Negative', 'CAFOs', 'WWTreatment', 'Landfills',
    'RefineriesAndTerminals', 'ProcessingPlants', 'Mines',
]
df_test = df_test[df_test['Type'].isin(test_types_kept)]

In [None]:
# (rows, columns) of the training partition after the class filter.
df_train.shape

In [None]:
# (rows, columns) of the test partition after the class filter.
df_test.shape

In [None]:
# Class balance of the filtered training partition.
df_train["Type"].value_counts()

In [None]:
# Disabled experiment, kept for reference: cap every Type group of df_train at
# k rows (groups with fewer than k rows are kept whole) and collect the result
# in a `balanced` frame. Nothing below uses `balanced`.
#def sampling_k_elements(group, k=1600):
#    if len(group) < k:
#        return group
#    return group.sample(k)
#balanced = df_train.groupby('Type').apply(sampling_k_elements).reset_index(drop=True)

In [None]:
# Random seed for the run.
SEED = 42
# Number of model output units: the seven kept Type values minus 'Negative',
# whose label column is removed by MyDataFrameIterator.
CATEGORIES = 6
# Images per batch for the training and validation generators.
TRAIN_BATCH_SIZE = 32
# Validation fraction; referenced by the commented-out `validation_split`
# option of the data generator.
VAL_SPLIT = 0.2

# Size the images are loaded at (passed to the generators as target_size).
IMG_HEIGHT = 224
IMG_WIDTH = 224
# Number of colour channels (the generators load images with color_mode='rgb').
IMG_CHANNEL = 3

# Suffix used in the names of the saved prediction/label .npy files.
RUN_NUMBER = 0

In [None]:
class MyDataFrameIterator(image.DataFrameIterator):
    """DataFrame iterator whose label batches omit the 'Negative' column.

    The parent iterator yields one-hot labels with one column per class.
    Dropping the 'Negative' column turns a negative sample into an all-zero
    row, so the remaining columns can be trained as independent sigmoid
    outputs.

    Raises:
        KeyError: when a batch is requested and 'Negative' is not one of the
            iterator's classes.
    """

    def _drop_negative(self, y_):
        """Return ``y_`` without the column that encodes the 'Negative' class."""
        return np.delete(y_, self.class_indices['Negative'], axis=1)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(images, labels_without_negative)``."""
        X_, y_ = super().__getitem__(idx)
        return X_, self._drop_negative(y_)

    def next(self):
        """Return the next batch as ``(images, labels_without_negative)``."""
        X_, y_ = super().next()
        return X_, self._drop_negative(y_)


class MyImageDataGenerator(image.ImageDataGenerator):
    """ImageDataGenerator whose dataframe flow yields MyDataFrameIterator batches."""

    def __init__(self, *args, **kwargs):
        # `super()` already binds `self`; passing it again would push the
        # generator object into the first positional option
        # (`featurewise_center`) and switch it on unintentionally.
        super().__init__(*args, **kwargs)

    def flow_from_dataframe(self, df, directory, *args, **kwargs):
        """Create a MyDataFrameIterator bound to this generator.

        Args:
            df: DataFrame describing the images and their labels.
            directory: Base directory prepended to the file names in ``df``.
            *args, **kwargs: Forwarded unchanged to the iterator.

        Returns:
            A MyDataFrameIterator over ``df``.
        """
        if not args:
            # The iterator only applies rescaling/augmentation when it is
            # given the generator; without this link `rescale` and every
            # other option set on the generator is silently ignored.
            kwargs.setdefault('image_data_generator', self)
            kwargs.setdefault('data_format', self.data_format)
            kwargs.setdefault('dtype', self.dtype)
        return MyDataFrameIterator(df, directory, *args, **kwargs)

In [None]:
# Generator configured with a 1/255 rescale factor.
# Options kept for reference, currently switched off:
#   validation_split=VAL_SPLIT, rotation_range=20, width_shift_range=0.2,
#   height_shift_range=0.2, horizontal_flip=True
datagen = MyImageDataGenerator(rescale=1.0 / 255)

In [None]:
train_generator=datagen.flow_from_dataframe(
    df_train,
    "",
    x_col="img_dir",
    y_col="Type",
    color_mode='rgb',
    seed = 42,
    class_mode="categorical",
    validate_filenames=False,
    target_size=(IMG_HEIGHT,IMG_WIDTH),
    batch_size=TRAIN_BATCH_SIZE)

In [None]:
val_generator = datagen.flow_from_dataframe(
    df_train,
    "",
    x_col="img_dir",
    y_col="Type",
    color_mode='rgb',
    seed = 42,
    class_mode="categorical",
    target_size=(IMG_HEIGHT,IMG_WIDTH),
    validate_filenames=False,
    batch_size=TRAIN_BATCH_SIZE)

In [None]:
# Whole batches available per epoch for each generator (a trailing partial
# batch is not counted).
STEP_SIZE_TRAIN, STEP_SIZE_VALID = (
    gen.n // gen.batch_size for gen in (train_generator, val_generator)
)

In [None]:
def build_model(input_shape=None, n_outputs=None, freeze_backbone=False,
                learning_rate=0.001):
    """Build and compile the DenseNet121-based multi-label classifier.

    Architecture: Input -> DenseNet121 (ImageNet weights, no top, global max
    pooling) -> Dense(16, relu) -> Dense(n_outputs, sigmoid), trained with
    binary cross-entropy and Adam.

    Args:
        input_shape: (height, width, channels) of the input images. Defaults
            to (IMG_HEIGHT, IMG_WIDTH, IMG_CHANNEL).
        n_outputs: Number of sigmoid output units. Defaults to CATEGORIES.
        freeze_backbone: When True the DenseNet121 weights are marked
            non-trainable before the model is compiled.
        learning_rate: Learning rate passed to Adam.

    Returns:
        The compiled Keras Model. The DenseNet121 backbone is layer index 1.
    """
    if input_shape is None:
        input_shape = (IMG_HEIGHT, IMG_WIDTH, IMG_CHANNEL)
    if n_outputs is None:
        n_outputs = CATEGORIES

    inputs = layers.Input(shape=input_shape)

    # Pre-trained feature extractor; pooling='max' already yields a flat
    # feature vector, so no Flatten layer is needed.
    backbone = tf.keras.applications.DenseNet121(include_top=False,
                                                 weights='imagenet',
                                                 pooling='max')
    if freeze_backbone:
        # Set before compile() so the compiled model honours the freeze.
        backbone.trainable = False
    x = backbone(inputs)

    x = layers.Dense(16, activation='relu')(x)
    out = layers.Dense(n_outputs, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=out)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss, optimizer=Adam(learning_rate=learning_rate), metrics=[
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='pr_auc', curve='PR'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        'accuracy'
    ])
    return model

In [None]:
# Reset Keras' global state, then build a fresh compiled model.
backend.clear_session()
model = build_model()

# Layer 0 is the Input layer, layer 1 the DenseNet121 backbone: freeze it so
# only the dense head is trained.
# NOTE(review): Keras guidance is to call compile() again after changing
# `trainable`; build_model() compiles before this line - confirm the freeze is
# honoured during training.
densenet_backbone = model.layers[1]
densenet_backbone.trainable = False
model.summary()

In [None]:
# All three callbacks watch the validation loss so that early stopping,
# learning-rate reduction and checkpointing agree on what "best" means
# (the checkpoint previously tracked the training loss).
monitored = 'val_loss'
cb = [callbacks.EarlyStopping(monitor=monitored, patience=5),
      callbacks.ReduceLROnPlateau(monitor=monitored, patience=3),
      callbacks.ModelCheckpoint('carla_v0.hdf5',
                                monitor=monitored,
                                verbose=1,
                                save_best_only=True)]
# NOTE(review): with only 3 epochs neither patience window above (5 and 3
# epochs) can elapse, so just the checkpoint callback has any effect; raise
# `epochs` for the other two to matter.
epochs = 3

# Train from the generators. Model.fit accepts generators directly;
# fit_generator is deprecated.
history = model.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=val_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=epochs,
                    callbacks=cb)

In [None]:
# Plot the per-epoch curves recorded in `history` (plot_keras_history helper).
show_history(history)
# Presumably renders the same curves into "standard.png" - confirm against the
# plot_keras_history documentation.
plot_history(history, path="standard.png")
# Close the current matplotlib figure.
plt.close()

In [None]:
TEST_BATCH_SIZE = 16

# Generator over the held-out test rows.
# `classes` pins the label columns to the training label set: without it the
# columns are inferred from df_test alone, and a class missing from the test
# split would shift every later column relative to the model's outputs.
test_gen = datagen.flow_from_dataframe(
    df_test,
    "",
    x_col="img_dir",
    y_col="Type",
    color_mode='rgb',
    seed = 42,
    classes=sorted(df_train["Type"].unique()),
    class_mode="categorical",
    validate_filenames=False,
    target_size=(IMG_HEIGHT,IMG_WIDTH),
    batch_size=TEST_BATCH_SIZE)

In [None]:
# Class names in class_indices order, without 'Negative' (its label column is
# dropped by the iterator).
cat_names = list(test_gen.class_indices)
cat_names.remove('Negative')
cat_names

In [None]:
# Draw a single batch of TEST_BATCH_SIZE images from the test generator and
# score it with the trained model.
# NOTE(review): every metric below is computed on this one batch only, not on
# the whole test partition.
test_batch = next(test_gen)
X_test, y_test = test_batch
y_pred_proba = model.predict(X_test)

# SAVING THE ARRAY, PLEASE UPDATE THE NAME (RUN_NUMBER) SO IT DOES NOT OVERWRITE

# :)

And keep track of what you sensitized on so we can compare metrics :)

In [None]:
# Persist the predicted probabilities and the matching labels for this run.
# The file names carry RUN_NUMBER, so saving again with the same RUN_NUMBER
# replaces the earlier files.
np.save(f'y_pred_proba_{RUN_NUMBER}.npy', y_pred_proba)
np.save(f'y_test_{RUN_NUMBER}.npy', y_test)

In [None]:
# Heat map of the predicted probabilities: one row per category, one column
# per test image in the batch.
fig, ax = plt.subplots()
heat = ax.imshow(y_pred_proba.T)
ax.set_title('Predicted probabilities')
ax.set_xlabel('Test sample')
ax.set_ylabel('Category')
ax.set_yticks(range(len(cat_names)))
ax.set_yticklabels(cat_names)
fig.colorbar(heat, ax=ax)
plt.show()

In [None]:
# Heat map of the ground-truth labels: one row per category, one column per
# test image in the batch (an all-zero column is a 'Negative' sample).
fig, ax = plt.subplots()
heat = ax.imshow(y_test.T)
ax.set_title('Ground-truth labels')
ax.set_xlabel('Test sample')
ax.set_ylabel('Category')
ax.set_yticks(range(len(cat_names)))
ax.set_yticklabels(cat_names)
fig.colorbar(heat, ax=ax)
plt.show()

In [None]:
# Plot a sample image from the test set.
sample_img = np.asarray(X_test[0], dtype=np.float32)
# The batch may hold pixels in 0..255 or, when the generator's 1/255 rescale
# is in effect, in 0..1. Casting 0..1 floats straight to uint8 would give an
# all-black image, so bring that case back to 0..255 first.
if sample_img.max() <= 1.0:
    sample_img = sample_img * 255.0
abc = np.clip(sample_img, 0, 255).astype(np.uint8)
plt.imshow(abc)
plt.colorbar()
plt.show()

In [None]:
# One instance of each Keras metric; the scoring loop reuses these objects
# for every category.
km = tf.keras.metrics
precision, recall = km.Precision(), km.Recall()
auc = km.AUC()
pr_auc = km.AUC(name='pr_auc', curve='PR')
accuracy = km.Accuracy()

In [None]:
# Turn probabilities into hard 0/1 decisions: strictly above 0.5 counts as 1.
y_pred = np.greater(y_pred_proba, 0.5).astype(int)

In [None]:
# Per-category results, keyed by category name.
prec_cat = {}
recall_cat = {}
auc_cat = {}
pr_auc_cat = {}
accuracy_cat = {}
overall_cat = {}

def calc_metric(y_test, y_pred, m_dict, m_fun, idx, name):
    """Score one category with one metric and store the result.

    Args:
        y_test: 2-D array of true labels, one column per category.
        y_pred: 2-D array of predictions with the same layout (hard labels or
            probabilities, whichever ``m_fun`` expects).
        m_dict: Dict that receives the result under ``name``.
        m_fun: Keras metric object, called as ``m_fun(y_true, y_pred)``.
        idx: Column index of the category in both arrays.
        name: Category name used as the key in ``m_dict``.
    """
    # Keras metric objects accumulate state across calls; clear it so this
    # category is scored on its own rather than together with every category
    # scored before it.
    reset = getattr(m_fun, 'reset_state', None) or getattr(m_fun, 'reset_states', None)
    if reset is not None:
        reset()
    # reshape(-1, 1) keeps this working for any number of rows, not only a
    # full batch of TEST_BATCH_SIZE.
    y_true_col = y_test[:, idx].reshape(-1, 1)
    y_pred_col = y_pred[:, idx].reshape(-1, 1)
    m_dict[name] = m_fun(y_true_col, y_pred_col).numpy()

dicts = [prec_cat, recall_cat, auc_cat, pr_auc_cat, accuracy_cat]
funcs = [precision, recall, auc, pr_auc, accuracy]
# The AUC metrics sweep over thresholds and therefore need the raw
# probabilities; precision, recall and accuracy use the hard 0/1 decisions.
scores = [y_pred, y_pred, y_pred_proba, y_pred_proba, y_pred]

for idx, name in enumerate(cat_names):
    for m_dict, m_fun, m_scores in zip(dicts, funcs, scores):
        calc_metric(y_test, m_scores, m_dict, m_fun, idx, name)
    # Unweighted mean of the five metrics for this category.
    overall_cat[name] = np.mean([d[name] for d in dicts])


In [None]:
# Assemble the per-category results into one table: a 'Category' column
# followed by one column per metric, in the order listed here.
metric_columns = {
    'Precision': prec_cat,
    'Recall': recall_cat,
    'AUC': auc_cat,
    'PR AUC': pr_auc_cat,
    'Accuracy': accuracy_cat,
    'Overall': overall_cat,
}
table = {'Category': cat_names}
for column_label, per_category in metric_columns.items():
    table[column_label] = list(per_category.values())
metrics_df = pd.DataFrame(table)

print(metrics_df)