# Load Modules
The data is available in this Google Drive folder:
https://drive.google.com/drive/folders/1SvfyjqJLUp6ma2q-xGjmC1gqt1fzhNGW?usp=sharing

In [2]:


# Colab-only setup: mount Google Drive so the data and the GLC helper package are reachable.
from google.colab import drive
drive.mount('/content/gdrive/')
import sys
# Make packages stored in the "Colab Notebooks" Drive folder importable (the GLC package is imported below).
sys.path.append('/content/gdrive/My Drive/Colab Notebooks')
# Install the dependencies listed in the GLC package's requirements file.
!pip3 install -r /content/gdrive/My\ Drive/Colab\ Notebooks/GLC/requirements.txt


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import copy
from pathlib import Path
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from GLC.data_loading.environmental_raster import PatchExtractor
from GLC.data_loading.common import load_patch
from GLC.metrics import predict_top_30_set
from GLC.submission import generate_submission_file
from GLC.metrics import top_k_error_rate_from_sets
from GLC.metrics import top_30_error_rate

In [5]:
# Prefer a TPU when one can be resolved; otherwise keep TensorFlow's current default strategy.
# A model has to be built inside `strategy.scope()` for the chosen strategy to apply to it.
try:
  # Locate the TPU cluster, connect to it and initialise the TPU system.
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  strategy = tf.distribute.experimental.TPUStrategy(resolver)
except ValueError:
  # Presumably raised when no TPU is attached to the runtime -- TODO confirm against the TF docs.
  strategy = tf.distribute.get_strategy() 

# **Load Training Data**


### **Load Complete Dataset**

We can use the whole dataset to train on the environmental vectors, since doing so is not too computationally expensive.

In [6]:

# Submission output is currently disabled.
# SUBMISSION_PATH = Path("submissions")
# os.makedirs(SUBMISSION_PATH, exist_ok=True)

DATA_PATH = Path("/content/gdrive/My Drive/Colab Notebooks/input/")

# Training observations for France and the US, stacked into one frame indexed by observation_id.
df_obs_fr = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_train.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_us = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_train.csv",
    sep=";",
    index_col="observation_id",
)
df_obs = pd.concat((df_obs_fr, df_obs_us))

# The "subset" column marks each observation as belonging to the train or the validation split.
obs_id_train = df_obs.index[df_obs["subset"] == "train"].values
obs_id_val = df_obs.index[df_obs["subset"] == "val"].values

# Species labels of each split, in the same order as the id arrays above.
y_train = df_obs.loc[obs_id_train, "species_id"].values
y_val = df_obs.loc[obs_id_val, "species_id"].values

n_val = len(obs_id_val)
print("Validation set size: {} ({:.1%} of train observations)".format(n_val, n_val / len(df_obs)))

# Test observations are not loaded for now.
# df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
# df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")
# df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))
# obs_id_test = df_obs_test.index.values
# print("Number of observations for testing: {}".format(len(df_obs_test)))
# print(df_obs_test.head())

# Pre-extracted environmental feature vectors, one row per observation_id.
df_env = pd.read_csv(
    DATA_PATH / "pre-extracted" / "environmental_vectors.csv",
    sep=";",
    index_col="observation_id",
)

Validation set size: 40080 (2.5% of train observations)


### **Load a subset of the data**

We use a subset of the dataset to train the CNNs and InceptionV3, since training them on the complete dataset is too computationally expensive.


In [7]:
# Size of the pre-built subset: number of observations and number of distinct species.
n_observations = 100000
n_species = 500

# The subset file name encodes both sizes; build it from the two variables so that
# changing `n_species` above selects the matching file (the patch files are named the same way).
path_temp = "data" + str(n_observations) + "-" + str(n_species) + ".csv"
df_obs_subset = pd.read_csv(DATA_PATH / "data-subset" / path_temp, sep=",", index_col="observation_id")
print("Number of observations for training: {}".format(len(df_obs_subset)))

# Keep the original species ids; copied so the relabelling below cannot alter them.
y_true = df_obs_subset['species_id'].copy()

# Relabel the species ids to consecutive class indices 0..n_classes-1.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df_obs_subset['species_id'] = le.fit_transform(df_obs_subset.species_id)

# `le.classes_` holds the distinct species ids seen by the encoder.
number_of_unique_species = len(le.classes_)
print("Number of unique species: "+str(number_of_unique_species))

Number of observations for training: 100000
Number of unique species: 500


# Random Forest on Environmental Vectors

In [9]:
# Environmental feature vectors of the train / validation observations.
X_train = df_env.loc[obs_id_train].values

X_val = df_env.loc[obs_id_val].values
# X_test = df_env.loc[obs_id_test].values

# print(y_train)
# Missing values are replaced by the most negative finite float32 value.
imp = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value=np.finfo(np.float32).min,
)
imp.fit(X_train)

X_train = imp.transform(X_train)
X_val = imp.transform(X_val)
# X_test = imp.transform(X_test)
n_features = X_train.shape[1]
# print(X_train)
# print("Rescaling")
#X_train = X_train / X_train.max(axis=0)
#X_val = X_val / X_val.max(axis=0)
#X_test = X_test / X_test.max(axis=0)

# Disabled dense-network experiment, kept as an inert string literal.
"""
model = Sequential()
model.add(Dense(32, activation='relu',
                kernel_initializer='he_normal', input_shape=(n_features,)))
model.add(Dense(16, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=1, batch_size=32, verbose=1)

"""
filename = "/content/gdrive/My Drive/Colab Notebooks/rf.sav"

rf_model = RandomForestClassifier(n_estimators=64, max_depth=10,  verbose=0, n_jobs=-1)

rf_model.fit(X_train, y_train)

# Persist the fitted forest. The original dumped the undefined name `est`, which raises
# NameError; the context manager also guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# To reuse a previously saved model instead of refitting (only unpickle files you trust):
# rf_model = pickle.load(open(filename, 'rb'))

def batch_predict(predict_func, X, batch_size=1024):
    """Apply ``predict_func`` to ``X`` in consecutive row slices and gather the outputs.

    ``predict_func`` is first called on the leading row of ``X`` to learn the
    output width and dtype, which are used to pre-allocate the result array.

    Args:
        predict_func: callable mapping a slice of ``X`` to a 2-D array with
            one row per input row.
        X: array of inputs; it is sliced along its first axis.
        batch_size: number of rows passed to ``predict_func`` per call.

    Returns:
        Array of shape ``(X.shape[0], n_outputs)`` with the probe's dtype.
    """
    probe = predict_func(X[:1])
    preds = np.empty((X.shape[0], probe.shape[1]), dtype=probe.dtype)

    for offset in range(0, len(X), batch_size):
        window = slice(offset, offset + batch_size)
        preds[window] = predict_func(X[window])

    return preds


def predict_func(X):
    """Score ``X`` with the fitted random forest and return its top-30 prediction sets.

    The class probabilities from ``rf_model.predict_proba`` are passed through
    ``predict_top_30_set`` and its result is returned as is.
    """
    return predict_top_30_set(rf_model.predict_proba(X))


# Top-30 prediction sets for the validation features, computed 1024 rows at a time.
s_val = batch_predict(predict_func, X_val, 1024)

# Compare the predicted sets against the validation labels.
score_val = top_k_error_rate_from_sets(y_val, s_val)

print("Top-30 error rate: {:.1%}".format(score_val))

# Test-set prediction and submission generation are currently disabled.
# s_pred = batch_predict(predict_func, X_test, batch_size=1024)
# print("Generate the submission file")
# generate_submission_file(SUBMISSION_PATH / "random_forest_on_environmental_vectors.csv", df_obs_test.index, s_pred)


EOFError: ignored

# Deep Neural Network (Inception v3) on image patches

In [10]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array, load_img
from keras.applications.inception_v3 import preprocess_input
import numpy as np
import cv2

# Build and compile inside the distribution strategy's scope so that the model's
# variables are created under the strategy selected earlier (TPU when available,
# otherwise the default strategy, for which the scope changes nothing).
with strategy.scope():
    inception_model = Sequential()
    # Learnable 3x3 convolution that maps the 6-channel patches to the 3 channels
    # the ImageNet-pretrained backbone expects.
    inception_model.add(Conv2D(3, kernel_size=3, padding="same", input_shape=(256, 256, 6), activation='relu'))
    # Headless, globally average-pooled InceptionV3. The `classes` argument is dropped:
    # it only applies when include_top=True, so it had no effect here.
    inception_model.add(InceptionV3(weights='imagenet', include_top=False, input_shape=(256, 256, 3), pooling="avg"))
    # Classification head: one softmax unit per label-encoded species.
    inception_model.add(Dense(number_of_unique_species, activation='softmax'))
    inception_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
inception_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 3)       165       
                                                                 
 inception_v3 (Functional)   (None, 2048)              21802784  
                                                                 
 dense (Dense)               (None, 500)               1024500   
                                                                 
Total params: 22,827,449
Trainable params: 22,793,017
Non-trainable params: 34,432
_________________________________________________________________


In [None]:
# Rows of [number of observations seen, top-30 error rate], one per processed patch file.
plotdata = []
from sklearn.model_selection import train_test_split

# Number of patches per "patches<n_observations>-<n_species>-<i>.npz" file.
# NOTE(review): taken from the original `i*1000:(i+1)*1000` label slicing -- confirm.
patches_per_file = 1000

# One-hot targets built from the label-encoded subset (classes 0..number_of_unique_species-1),
# so their width always matches the model's softmax layer. The original used the full
# `df_obs`, whose species set does not match the 500-way output.
# NOTE(review): assumes the patch files follow the row order of `df_obs_subset` -- confirm.
subset_labels = df_obs_subset['species_id'].values
y_all = np.eye(number_of_unique_species, dtype=np.float32)[subset_labels]

start = 0
end = 1
for i in range(start, end):
  path_temp = "patches" + str(n_observations) + "-" + str(n_species) + '-' + str(i) + ".npz"
  # Close the archive as soon as the patches are materialised in memory.
  with np.load(DATA_PATH  / "data-subset" / "patches" / path_temp) as archive:
    X = np.array(archive['arr_0'])
  y = y_all[i*patches_per_file:(i+1)*patches_per_file]

  # 67% train; the remaining 33% is halved. Note the naming: (X_test, y_test) is used as
  # the validation data during fitting, (X_val, y_val) is the held-out part used for the
  # top-30 error below. These assignments overwrite the random-forest arrays of the same name.
  X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(X, y, test_size=0.33, random_state=42)
  X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, test_size=0.5, random_state=42)

  print("\n\nTraining observations " + str(i*patches_per_file+1) + " to " + str((i+1)*patches_per_file))
  inception_model.fit(X_train, y_train, epochs = 50, validation_data=(X_test, y_test), batch_size = 32, verbose=1)
  # inception_model.save_weights('/content/gdrive/My Drive/Colab Notebooks/checkpoint500-100000-'+str(i)+'.h5')
  (loss, accuracy) = inception_model.evaluate(x = X_test, y = y_test)
  print('Loss: {} Accuracy: {}'.format(loss, accuracy * 100))

  # Top-30 error on the held-out part: a sample is wrong when its true class is not among
  # the 30 highest-scoring classes. The one-hot column index already is the class index,
  # so no `le.transform` is needed.
  predictions = inception_model.predict(X_val)
  true_classes = np.argmax(y_val, axis=1)
  top_30_preds = np.argsort(predictions, axis=1)[:, -30:]
  hits = (top_30_preds == true_classes[:, None]).any(axis=1)
  wrong = int(np.count_nonzero(~hits))
  # Normalise by the set that was actually scored (the original divided by len(y_test)).
  top_30_error = wrong / len(y_val)
  print("Top 30 Error Rate with "+str((i+1)*patches_per_file)+" observations used for training: "+str(top_30_error))
  plotdata.append([(i+1)*patches_per_file, top_30_error])

inception_model.save_weights('/content/gdrive/My Drive/Colab Notebooks/checkpoint500-100000.h5')
np.savetxt('/content/gdrive/My Drive/Colab Notebooks/plotdata_'+ str(n_species) + '_'  + str(start) + '-'+str(end) + '.csv', np.array(plotdata), delimiter=',')

# Ensemble Learning using RL and DNN

In [None]:
# Base models of the ensemble; at the moment only the fitted random forest.
models_list = [rf_model, ] 

def stacked_dataset(members, inputX):
    """Build the meta-learner's feature matrix from the ensemble members' predictions.

    Each member is asked for its prediction on ``inputX``. Keras-style members are
    called as ``predict(inputX, verbose=0)``; members whose ``predict`` rejects the
    ``verbose`` keyword (e.g. scikit-learn estimators) fall back to ``predict_proba``
    when available, otherwise to plain ``predict``. One-dimensional outputs are
    treated as a single column.

    Args:
        members: iterable of fitted models.
        inputX: input samples handed unchanged to every member.

    Returns:
        2-D array of shape ``(rows, outputs * n_members)``; for every output column
        the members' values are adjacent.

    Raises:
        ValueError: if ``members`` is empty.
    """
    member_outputs = []
    for model in members:
        try:
            yhat = model.predict(inputX, verbose=0)
        except TypeError:
            # `predict` does not accept `verbose`: use the estimator-style API instead.
            if hasattr(model, "predict_proba"):
                yhat = model.predict_proba(inputX)
            else:
                yhat = model.predict(inputX)
        yhat = np.asarray(yhat)
        if yhat.ndim == 1:
            yhat = yhat.reshape(-1, 1)
        member_outputs.append(yhat)

    if not member_outputs:
        raise ValueError("stacked_dataset needs at least one ensemble member")

    # Stack into [rows, outputs, members]. Stacking the whole list at once always yields
    # a 3-D array, so a single member no longer breaks the reshape below.
    stackX = np.dstack(member_outputs)
    # Flatten to [rows, outputs x members].
    return stackX.reshape((stackX.shape[0], stackX.shape[1] * stackX.shape[2]))

def fit_stacked_model(members, inputX, inputy):
    """Fit the meta-learner on the stacked predictions of the ensemble members.

    Args:
        members: the ensemble's base models, passed on to ``stacked_dataset``.
        inputX: inputs the base models predict on.
        inputy: targets the meta-learner is fitted against.

    Returns:
        The fitted ``LogisticRegression`` meta-learner.
    """
    meta_features = stacked_dataset(members, inputX)
    # `fit` returns the estimator itself, so the fitted meta-learner is returned directly.
    return LogisticRegression().fit(meta_features, inputy)
# Fit the meta-learner on the base models collected in `models_list`
# (the original passed `members`, a name that is never defined in this notebook).
# LogisticRegression needs 1-D class labels, so one-hot targets are reduced to class indices.
# NOTE(review): at this point `X_test` holds the image patches produced by the Inception
# training cell, while `rf_model` was fitted on environmental vectors -- confirm that every
# member receives inputs it was trained on.
meta_targets = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else y_test
model = fit_stacked_model(models_list, X_test, meta_targets)

def stacked_prediction(members, model, inputX):
    """Predict with the stacked ensemble.

    The base models' predictions on ``inputX`` are assembled with
    ``stacked_dataset`` and handed to the meta-learner ``model``.

    Returns:
        Whatever ``model.predict`` returns for the stacked features.
    """
    return model.predict(stacked_dataset(members, inputX))

from sklearn.metrics import f1_score

# Predict with the stacked ensemble built from `models_list`
# (the original passed `members`, a name that is never defined in this notebook).
yhat = stacked_prediction(models_list, model, X_test)

# `f1_m` is not defined anywhere in this notebook, so scikit-learn's F1 score is used.
# One-hot targets are reduced to class indices so they are comparable with `yhat`.
# NOTE(review): macro averaging weights every class equally -- confirm the intended averaging.
y_test_labels = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else y_test
score = f1_score(y_test_labels, yhat, average="macro")
print('Stacked F Score:', score)