# Stack image and geodata models (50:50)

In [None]:
import os
import random
from contextlib import redirect_stdout
import pandas as pd
import descartes
import geopandas as gpd
import shapely
from shapely.geometry import Point, Polygon
from shapely import geometry

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.models import Sequential
import tensorflow.keras.layers as layers
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.utils import plot_model 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn import preprocessing

## Getting the data

In [None]:
def get_local_repository_path(repository_name):
    """
    Return the absolute path of the repository folder that contains the
    current working directory (the folder itself is included).

    Arg.: Name of the repository folder to look for.
    Returns: The working directory path, cut right after the first path
             component that equals the repository name.
    Raises: ValueError if no component of the working directory equals
            the repository name.
    """
    wd_path = os.getcwd()
    # Split on the platform separator so the lookup is not tied to "/".
    split_wd_path = wd_path.split(os.sep)
    try:
        repo_position = split_wd_path.index(repository_name)
    except ValueError:
        raise ValueError(
            f"Repository folder '{repository_name}' not found in "
            f"working directory '{wd_path}'."
        ) from None
    return os.sep.join(split_wd_path[:repo_position + 1])

In [None]:
# CONFIGURE
# Name of this run: used for the outputs sub-folder and in the plot titles.
model_name = "stack_example"
# Absolute path of the local "tfm" repository folder.
local_path = get_local_repository_path("tfm")

In [None]:
# OUTPUTS
# Everything this notebook saves goes to <working directory>/outputs/<model_name>.
save_dir = os.path.join(os.path.abspath(os.getcwd()), "outputs", model_name)
# Create outputs folder. exist_ok=True already makes this a no-op when the
# folder is there, so no separate existence check is needed.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# INPUTS (for the weights)
# File names of the weight files for the geodata and the image model.
geo_weights = "geo_weights_1D_1000.h5"
img_weights = "img_weights_VGG16_b1b2b3PT_500_30P.h5"
# Folder the weight files are read from: <working directory>/inputs.
input_dir = os.path.join(os.path.abspath(os.getcwd()), "inputs")

In [None]:
# GEODATA SET DIRECTORY
geo_source_dir = "data/geodata/preprocessing/outputs"
# Importing the dataset
full_dataset = pd.read_csv(
    os.path.join(local_path, geo_source_dir, "filtered_coordinates.csv")
)
# Working copy without the coordinate uncertainty column
dataset = full_dataset.drop("coordinate_uncertainty", axis=1)

In [None]:
# DATA SET DIRECTORIES
img_source_dir = "data/images/image_preprocessing/processed_images_train_val_test/"
# One sub-folder per split below the image source directory
train_dir, val_dir, test_dir = (
    os.path.join(local_path, img_source_dir, split_name)
    for split_name in ("train", "val", "test")
)

In [None]:
# LABELS
# Entries of the train folder, in sorted order
img_class_names = os.listdir(train_dir)
img_class_names.sort()
print(f"{len(img_class_names)} classes in dataset.")
print(f"Classes names: {img_class_names}")

In [None]:
# LABELS
# Unique species names in sorted order. A bare set has no stable order, so
# sorting here keeps the class list (and every index derived from positions
# in it) the same from run to run.
geo_class_names = sorted(set(full_dataset["species_name"].tolist()))
print(f"{len(geo_class_names)} classes in dataset.")
print(f"Classes names: {geo_class_names}")

In [None]:
# EDIT FOR EACH MODEL
# Model description
model_description = f"""
{model_name}

"""

# Save model description
with open(os.path.join(save_dir, "model_description.txt"), "w") as file:
    # print adds the trailing newline after the description text
    print(model_description, file=file)

In [None]:
# GEODATA SAMPLE
full_dataset = pd.read_csv(os.path.join(local_path, geo_source_dir, "filtered_coordinates.csv"))
dataset = full_dataset.drop(columns="coordinate_uncertainty")

# Encode class names: every species gets its position in the sorted class list
geo_class_names.sort()
species_to_number = {species_name: number for number, species_name in enumerate(geo_class_names)}
dataset['target'] = dataset["species_name"].map(species_to_number)
dataset_num = dataset.drop(columns="species_name")

# Split in train, val and test subsets
train, test = train_test_split(dataset_num, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')
# Extract predictors (latitude and longitude)
# NOTE(review): columns 1:3 are taken by position. If `dataset_num` has only
# three columns, this slice includes `target` itself — confirm the CSV layout.
X_train = train.iloc[:, 1:3].values
X_val = val.iloc[:, 1:3].values
X_test = test.iloc[:, 1:3].values
print(f"Train predictor shape: {X_train.shape}")
print(f"Validation predictor shape: {X_val.shape}")
print(f"Test predictor shape: {X_test.shape}")
print(X_train)

# Encode labels
# Read the label by column name: positional column 2 lies inside the
# predictor slice above, so it cannot be relied on to hold the class number.
y_train = train["target"].values
y_val = val["target"].values
test_labels = test["target"].values

# Fix the number of classes so the three one-hot matrices have the same
# width even when a subset does not contain the highest class number.
n_geo_classes = len(geo_class_names)
y_train = to_categorical(y_train, num_classes=n_geo_classes)
y_val = to_categorical(y_val, num_classes=n_geo_classes)
y_test = to_categorical(test_labels, num_classes=n_geo_classes)
print(f"Train target shape: {y_train.shape}")
print(f"Validation target shape: {y_val.shape}")
print(f"Test target shape: {y_test.shape}")
print(y_train)

# Normalize train and val predictors
# NOTE(review): X_test is not normalized in this cell — confirm whether the
# test predictors need the same treatment before prediction.
X_train_norm = normalize(X_train)
X_val_norm = normalize(X_val)
X_train_norm

In [None]:
# CONFIGURATION ImageDataGenerator
# Side lengths, in pixels, used as the target size of the loaded images
img_height, img_width = 224, 224
color_mode = "rgb"
class_mode = "categorical"
shuffle = True
seed = 1234
def plot_images(images_arr):
    """
    Show the given images side by side in a single row, with the axes hidden.
    Arg.: Sequence of images that matplotlib's imshow can draw.
    Nothing is drawn when the sequence is empty.
    """
    n_images = len(images_arr)
    if n_images == 0:
        return
    # One column per image; squeeze=False always yields a 2D array of axes,
    # so the flatten below also works for a single image.
    fig, axes = plt.subplots(1, n_images, figsize=(15,15), squeeze=False)
    for img, ax in zip(images_arr, axes.flatten()):
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()
# Generator without augmentation: pixel values are only multiplied by 1/255
train_datagen_no_aug = ImageDataGenerator(rescale=1./255)
train_array_no_aug = train_datagen_no_aug.flow_from_directory(
    directory=train_dir,
    # Keras reads target_size as (height, width)
    target_size=(img_height, img_width),
    # Pass the options configured above instead of leaving them unused
    color_mode=color_mode,
    class_mode=class_mode,
    shuffle=shuffle,
    seed=seed,
)
# Take one batch and display its first six images
sample_training_images, _ = next(train_array_no_aug)
plot_images(sample_training_images[:6])

## Load models

In [None]:
# GEODATA
# Architecture: 2 inputs -> 64 ReLU units -> softmax over the geodata classes
geo_model = Sequential()
geo_model.add(layers.Dense(64, input_dim=2, activation='relu'))
# The geodata classes are listed in `geo_class_names`; no `class_names`
# variable is defined in the visible notebook.
geo_model.add(layers.Dense(len(geo_class_names), activation='softmax'))
# Load weights
# NOTE(review): loading is disabled, so this model keeps its initial
# weights — confirm that this is intended.
#geo_model.load_weights(os.path.join(input_dir, geo_weights))

In [None]:
# IMAGES
# Architecture: every VGG16 layer except the last one, followed by a new
# softmax layer over the image classes
loaded_model = tf.keras.applications.VGG16()
img_model = Sequential()
for layer in loaded_model.layers[:-1]:
    img_model.add(layer)
# The image classes are listed in `img_class_names`; no `class_names`
# variable is defined in the visible notebook.
img_model.add(Dense(len(img_class_names), activation = "softmax"))
# Load weights
img_model.load_weights(os.path.join(input_dir, img_weights))

In [None]:
# Define inputs
# The names match the "image" / "geodata" labels used for the two branches.
img_input = keras.Input(shape=(224,224,3), name="image")
# shape must be a tuple: (2) is just the integer 2, (2,) is a 1-tuple
geo_input = keras.Input(shape=(2,), name="geodata")

y1 = img_model(img_input)
y2 = geo_model(geo_input)
# 50:50 stack: element-wise mean of the two softmax outputs.
# NOTE(review): averaging requires both models to output the same number of
# classes in the same order — confirm img_class_names and geo_class_names agree.
output = layers.average([y1,y2])
ensemble_model = keras.Model(inputs=[img_input, geo_input], outputs = output)
keras.utils.plot_model(ensemble_model)

In [None]:
# Categorical cross-entropy on the softmax output; accuracy is tracked so it
# is recorded in the training history.
ensemble_model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
epochs = 5
# NOTE(review): `train_array` and `val_array` are not created anywhere in the
# visible notebook. They need to be image arrays whose rows correspond
# one-to-one to X_train_norm / y_train and X_val_norm / y_val — confirm how
# images and coordinates are paired before running this cell.
history = ensemble_model.fit(
    # Inputs in the order the ensemble declares them: image first, geodata second
    [train_array, X_train_norm],
    y_train,
    # The validation data must feed both inputs as well
    validation_data=([val_array, X_val_norm], y_val),
    batch_size=10,
    epochs=epochs,
)

### Evaluating the model training

#### Accuracy and loss during training

In [None]:
# Parameters measured during model training
# `history.history` is the dictionary of per-epoch values recorded by fit;
# its keys are printed to see which metric names are available.
history_dict = history.history
print(history_dict.keys())

In [None]:
# The accuracy entries are stored under "acc" or "accuracy" depending on the
# TensorFlow version. Pick the name that is present; a missing key then
# raises a KeyError here instead of leaving the variables silently undefined.
acc_key = "acc" if "acc" in history_dict else "accuracy"
acc = history_dict[acc_key]
val_acc = history_dict["val_" + acc_key]
loss = history_dict["loss"]
val_loss = history_dict["val_loss"]

In [None]:
def plot_acc_loss(acc,val_acc,loss,val_loss,epochs):
    """
    Draw training/validation accuracy and loss side by side, save the
    figure as "acc_loss_plot.png" in the outputs folder and show it.
    Args.: - Training and validation accuracy values.
           - Training and validation loss values.
           - Number of epochs, used for the x axis (0 .. epochs-1).
    """
    x_values = range(epochs)
    plt.figure(figsize=(8, 8))
    plt.suptitle(model_name)
    # (subplot position, training curve, validation curve, metric, legend corner)
    panels = (
        (1, acc, val_acc, "Accuracy", "lower right"),
        (2, loss, val_loss, "Loss", "upper right"),
    )
    for position, train_curve, val_curve, metric, legend_corner in panels:
        plt.subplot(1, 2, position)
        plt.plot(x_values, train_curve, label=f"Training {metric}")
        plt.plot(x_values, val_curve, label=f"Validation {metric}")
        plt.legend(loc=legend_corner)
        plt.title(f"Training and Validation {metric}")
        plt.xlabel("Epochs")
        plt.ylabel(metric)
    plt.savefig(os.path.join(save_dir,"acc_loss_plot.png"))
    plt.show()

In [None]:
# Plot exactly as many epochs as were recorded, so the x axis always has the
# same length as the metric lists.
epochs = len(acc)
plot_acc_loss(acc,val_acc,loss,val_loss,epochs)

#### Overfitting

When the model predicts the training set significantly better than the validation set, it is a sign of overfitting.

### Reuse model outputs

To load the model again, it has to be built and then the weights added:

```
# load pre-trained model with the weights
vgg16_model = tf.keras.applications.VGG16()
# Add the layers of vgg16 model to a new sequential model 
model = Sequential()
for layer in vgg16_model.layers[:-1]: # remove last layer
    model.add(layer)
# Freeze the weights in the layers
for layer in model.layers:
    layer.trainable = False
# Add last layer for categories
model.add(Dense(len(class_names), activation = "softmax"))  

model.load_weights(os.path.join(save_dir, "model.h5"))
```

To get back the accuracy and loss data:

- Open the CSV with the model history.
- Save it to a dictionary.

```
history_df = pd.read_csv(os.path.join(save_dir, "model_history.csv"))
history_dict = history_df.to_dict()
try: # the key names vary across tf versions
    acc = np.array(list(history_dict["acc"].values()))
    val_acc = np.array(list(history_dict["val_acc"].values()))
    loss = np.array(list(history_dict["loss"].values()))
    val_loss = np.array(list(history_dict["val_loss"].values()))
    epochs_range = np.array(range(epochs))
except:
    try:
        acc = np.array(list(history_dict["accuracy"].values()))
        val_acc = np.array(list(history_dict["val_accuracy"].values()))
        loss = np.array(list(history_dict["loss"].values()))
        val_loss = np.array(list(history_dict["val_loss"].values()))
        epochs_range = np.array(range(epochs))
    except:
        pass
plot_acc_loss(acc,val_acc,loss,val_loss,epochs)
```

## Evaluating the model

### Get the test dataset

In [None]:
def test_labels_to_index(test_labels, class_names):
    """
    Return a 1D array of integers with the corresponding
    number for a class.
    Args.: - A list with the class name of each item in 
          the test data set.
           - A sorted list with the possible class names. 
    Eg.: test_labels[1] = "Buxus_sempervirens" corresponds to index 4
         in the list of class names.
    """
    test_labels_index = []
    for i in range(len(test_labels)):
        ind = class_names.index(test_labels[i])
        test_labels_index.append(ind)
    return np.array(test_labels_index)

In [None]:
# List the entries of the test folder and build the label list from them.
# NOTE(review): `get_test_labels` is not defined in the visible notebook —
# confirm where it comes from.
# NOTE(review): this rebinds `test_labels`, which the geodata cell also assigns.
test_files = os.listdir(test_dir)
test_labels = get_test_labels(test_files)
test_labels[:5]

In [None]:
# The test labels come from the image folders, so they are indexed against
# the image class list (no `class_names` variable is defined in the visible
# notebook).
test_labels_index = test_labels_to_index(test_labels, img_class_names)
test_labels_index[:5]

### Predict the probability of classifying each class

In [None]:
# Get the probability of predicting each class for each image
# `predict` replaces the deprecated `predict_generator`, and no `steps` limit
# is set, so the prediction is not cut off after the first batch.
# NOTE(review): `test_array` is not created in the visible notebook; it has
# to supply both model inputs (images and geodata) — confirm.
predictions = ensemble_model.predict(test_array, verbose=1)

Predictions is a 2D array with a shape: (number of examples in test, number of classes)

In [None]:
# Display the shape of the predictions array
predictions.shape

In [None]:
# Get predicted class for each example
def predicted_class(predictions):
    """
    Return a 1D array with the predicted class for each example.
    Arg.: 2D array predictions of shape (number of examples, number of classes)
    Returns: 1D integer array; entry i is the column index of the highest
             value in row i (the first one when the maximum is repeated).
             An empty input gives an empty array.
    """
    predictions = np.asarray(predictions)
    if predictions.size == 0:
        # Nothing to rank; also avoids asking argmax for a missing axis
        return np.array([], dtype=int)
    # argmax picks the exact row maximum in one vectorized call
    return np.argmax(predictions, axis=1)

# Predicted class index for every row of `predictions`
pred_class = predicted_class(predictions)

### Plot the confusion matrix

In [None]:
# Display the class indices of the test labels
test_labels_index

In [None]:
# Display the predicted class indices
pred_class

In [None]:
# Build the confusion matrix
cm = tf.math.confusion_matrix(test_labels_index, pred_class)
# Convert from tensor to array
if tf.executing_eagerly():
    # Eager tensors expose their values directly
    conf_mat = cm.numpy()
else:
    # Graph mode: evaluate the tensor in a session that is closed afterwards
    with tf.compat.v1.Session() as sess:
        conf_mat = sess.run(cm)
conf_mat

In [None]:
def plot_confusion_matrix(cm, class_names, model_name):
    """
    Plot a confusion matrix, save it as "conf_matrix.png" in the outputs
    folder and return the matplotlib figure containing it.

    Args:
    cm (array, shape = [n, n]): a confusion matrix of integer classes
    class_names (array, shape = [n]): String names of the integer classes
    model_name (str): name appended to the plot title

    Returns:
    The matplotlib figure that holds the plotted confusion matrix.
    """
    figure = plt.figure(figsize=(20, 20))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.BuGn)
    plt.title("Confusion matrix - "+ model_name, fontsize = 22)
    plt.colorbar()
    # One tick per class on both axes, labelled with the class names
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90)
    plt.yticks(tick_marks, class_names)
    plt.savefig(os.path.join(save_dir,"conf_matrix.png"))
    # Hand the figure back, as the documented contract promises
    return figure


In [None]:
# The matrix rows and columns follow the image class list (no `class_names`
# variable is defined in the visible notebook). The result is bound to a
# name so the cell does not echo a return value.
conf_fig = plot_confusion_matrix(conf_mat, np.array(img_class_names), model_name)