A small reproducible example to demonstrate the modelling pipeline. 

In [1]:
import sys
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, concatenate, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Import relevant functions from other modules
sys.path.append(os.path.abspath(".."))
from Data.ImageData.select_tifs import copy_tif_files
from Modelling.TrainTestSplit.TrainTestSplitNew import process_text_data, oversample_classes, split_data, generate_and_save_augmented_images
from Data.ImageData.SplitImagesNew import extract_vignettes
from Modelling.CNN.CNNnew import preprocess_data, create_tf_datasets, build_model, cnn_evaluate_model, plot_hist
from Modelling.MLP.MLPnew import load_and_preprocess, perform_grid_search, mlp_evaluate_model
from Modelling.CollaborativeModel.CollabModelNew import prepare_data, train_collaborative_model

### The datasets

Create the subsampled and merged dataset (the full CSV data is too large to host on GitHub).

In [None]:
# Read the mini example's merged table from the current working directory and preview it
cleaned_merged = pd.read_csv(filepath_or_buffer="cleaned_merged.csv")
cleaned_merged

In [None]:
# Number of rows per value of the "Class" column, most frequent first
class_counts = cleaned_merged.loc[:, "Class"].value_counts()
class_counts

### Preprocess Data

#### Image Data

In [14]:
# Inputs for selecting the relevant tif files
# NOTE(review): absolute, machine-specific locations — adjust before running elsewhere.
_course_dir = "/Users/adelelauzon/Desktop/MSc/STA5243"
_mini_example_dir = f"{_course_dir}/2453Github/MiniExample"

source_dir = f"{_course_dir}/HURON_OverlapTiffsWithPP"
output_dir = f"{_mini_example_dir}/tifs_mini"
data_cleaned_path = f"{_mini_example_dir}/cleaned_merged.csv"


In [None]:
# Call the project helper with the raw tif folder, the mini-example tif folder and the
# cleaned CSV path. Presumably it copies only the tif files referenced by the CSV into
# output_dir — confirm in Data/ImageData/select_tifs.py.
copy_tif_files(source_dir, output_dir, data_cleaned_path)


In [16]:
# Destinations used when splitting the tif mosaics into particles
# NOTE(review): absolute, machine-specific locations — adjust before running elsewhere.
_mini_example_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"

vignettes_output = f"{_mini_example_dir}/vignettes_mini"
extracted_particles_csv_path = f"{_mini_example_dir}/extracted_particles.csv"


In [None]:
# Call the project helper with the cleaned CSV path, the folder of selected tifs
# (output_dir), the vignette output folder and the path of the particle CSV.
# Presumably it cuts each mosaic into per-particle vignettes and writes the particle
# table to extracted_particles_csv_path — confirm in Data/ImageData/SplitImagesNew.py.
extract_vignettes(data_cleaned_path, output_dir,vignettes_output, extracted_particles_csv_path)

### Train/Val/Test Split

Since our class counts are so imbalanced, we will oversample the under-represented classes; in this mini example that is `Bosmina_1` (see `classes_to_oversample` below).

In [18]:
# Load the particle table and the cleaned text table (in that order), then pass the
# text table through process_text_data with a fixed seed.
particles, text_data = (
    pd.read_csv(csv_path)
    for csv_path in (extracted_particles_csv_path, data_cleaned_path)
)
text_all_cleaned = process_text_data(text_data, seed=42)

In [None]:
# Rows per class among the extracted particles
particles.loc[:, "Class"].value_counts()

In [None]:
# Show only the particles labelled Bosmina_1
particles.loc[particles["Class"].eq("Bosmina_1")]

In [21]:
# Classes to oversample (a one-element tuple here)
classes_to_oversample = ('Bosmina_1',)

# split_data returns six tables: image train/val/test followed by text train/val/test
_split_tables = split_data(particles, text_all_cleaned, classes_to_oversample, target_count=30)
train_img, val_img, test_img, train_text, val_text, test_text = _split_tables

# Augmentation settings: rotations, shifts, shear, zoom and horizontal flips,
# with newly exposed pixels filled from their nearest neighbours
_augmentation_settings = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**_augmentation_settings)


In [None]:
# Generate augmented images for each oversampled class.
# For every class in classes_to_oversample, the training rows of that class are handed to
# generate_and_save_augmented_images together with the vignette folder; the records it
# returns are collected and appended to the training table as (Vignette, Class) rows.
# NOTE(review): re-running this cell appends the augmented rows to train_img again.
augmented_images = []
for class_label in classes_to_oversample:
    train_class = train_img[train_img['Class'] == class_label]
    aug_images = generate_and_save_augmented_images(train_class, vignettes_output, class_label=class_label)
    augmented_images.extend(aug_images)

augmented_df = pd.DataFrame(augmented_images, columns=["Vignette", "Class"])
# augmented_df carries a fresh 0-based index, so a plain concat can repeat index labels
# that already exist in train_img; ignore_index=True renumbers the combined rows 0..n-1.
train_img = pd.concat([train_img, augmented_df], ignore_index=True)

In [None]:
# Visualize an original vignette next to one of its augmented copies.
# The augmented file used to be hard-coded as "<stem>_aug_0_698.png". The trailing number
# looks like the random tag Keras appends when saving augmented images (TODO confirm in
# generate_and_save_augmented_images), so that exact file will usually not exist after a
# fresh run. The file is therefore looked up on disk: the vignette shown originally is
# preferred, otherwise the first augmented file found is used.
preferred_stem = "20180430_Huron_057_2mm_rep1_000004_vign000001"
aug_file_names = sorted(
    file_name
    for file_name in os.listdir(vignettes_output)
    if "_aug_" in file_name and file_name.endswith(".png")
)
if not aug_file_names:
    raise FileNotFoundError(
        f"No augmented vignettes (*_aug_*.png) found in {vignettes_output}; "
        "run the augmentation cell first."
    )
aug_file_name = next(
    (name for name in aug_file_names if name.startswith(preferred_stem + "_aug_")),
    aug_file_names[0],
)
# The original vignette shares the part of the name before "_aug_"
orig_file_name = aug_file_name.split("_aug_")[0] + ".png"

path_aug = os.path.join(vignettes_output, aug_file_name)
path_orig = os.path.join(vignettes_output, orig_file_name)
orig = Image.open(path_orig)
aug = Image.open(path_aug)
display(orig)
display(aug)

In [None]:

# Write every split to its own CSV (row index omitted) in the mini example folder.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
_mini_example_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"
_splits_by_file_stem = {
    "image_train": train_img,
    "image_val": val_img,
    "image_test": test_img,
    "text_train": train_text,
    "text_val": val_text,
    "text_test": test_text,
}
for _file_stem, _split_frame in _splits_by_file_stem.items():
    _split_frame.to_csv(f"{_mini_example_dir}/{_file_stem}.csv", index=False)
print("Data augmentation and saving completed.")


In [None]:
# Inspect the particle identifiers of the text training split
train_text.loc[:, "ParticleID"]

### Build Models

#### CNN

In [26]:
# Folder holding the vignette images (the trailing slash is part of the value).
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
base_path = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample/vignettes_mini/"

# NOTE(review): the three image tables are overwritten with preprocess_data's outputs,
# so re-running this cell feeds already-processed tables back in.
_cnn_inputs = preprocess_data(train_img, val_img, test_img, base_path)
train_img, val_img, test_img, num_classes = _cnn_inputs


In [27]:
# Turn the three preprocessed image tables into train/validation/test datasets via the
# project helper; num_classes is passed along (presumably for label encoding — confirm
# in Modelling/CNN/CNNnew.py).
train_ds, val_ds, test_ds = create_tf_datasets(train_img, val_img, test_img, num_classes)


In [28]:
# Build the CNN through the project helper, sized for num_classes output classes
# (architecture and compile settings live in Modelling/CNN/CNNnew.py).
model = build_model(num_classes)


In [None]:
# Fit for 10 epochs on train_ds with val_ds as validation data; the returned object is
# kept in `hist` and handed to plot_hist in the next cell.
hist = model.fit(train_ds, epochs=10, validation_data=val_ds)


In [30]:
# Despite its name, fig_output_dir is the full path of the PNG file, not a directory.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
fig_output_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample/cnn_model_accuracy.png"
# Pass the fit history and the figure path to the project plotting helper
plot_hist(hist, fig_output_dir)


In [None]:
# Output files for the trained CNN: the saved model and its metrics report.
# NOTE(review): absolute, machine-specific locations — adjust before running elsewhere.
_mini_example_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"
model_output_dir = f"{_mini_example_dir}/cnn_final_model.keras"
metrics_output_dir = f"{_mini_example_dir}/cnn_performance_metrics.txt"

# Save the fitted model first, then evaluate it on the test dataset
model.save(model_output_dir)
cnn_evaluate_model(model, test_ds, test_img, metrics_output_dir)

#### MLP

In [7]:
# Locations for the MLP stage; every output file lives directly under base_path.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
base_path = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"
model_output_dir = f"{base_path}/mlp_final_model.keras"
fig_output_dir = f"{base_path}/mlp_model_accuracy.png"
metrics_output_dir = f"{base_path}/performance_metrics.txt"

# Load and preprocess the three text splits written earlier (train, val, test — in that order)
text_train, text_val, text_test = (
    load_and_preprocess("text_train.csv", base_path),
    load_and_preprocess("text_val.csv", base_path),
    load_and_preprocess("text_test.csv", base_path),
)



In [8]:
# Feature and label extraction
# Columns whose name starts with 'class_' are the labels; every other column is a feature.
# The column lists are derived from the training split and reused for val/test.
class_columns = [name for name in text_train.columns if name.startswith('class_')]
feature_columns = [name for name in text_train.columns if not name.startswith('class_')]


def _features_and_labels(frame):
    """Return (features, labels) of `frame` as NumPy arrays, using the column lists above."""
    return frame[feature_columns].to_numpy(), frame[class_columns].to_numpy()


X_train, y_train = _features_and_labels(text_train)
X_val, y_val = _features_and_labels(text_val)
X_test, y_test = _features_and_labels(text_test)

# Number of input features and number of label columns
input_shape = X_train.shape[1]
num_classes = len(class_columns)

In [None]:
# Run the project's grid search on the train/validation arrays (10 epochs per run) and
# keep the model it returns; fig_output_dir is passed along, presumably for an accuracy
# plot — confirm in Modelling/MLP/MLPnew.py.
best_model = perform_grid_search(X_train, y_train, X_val, y_val, input_shape, num_classes, fig_output_dir, num_epochs=10)

In [10]:
# Save the model returned by the grid search to model_output_dir
# (no training happens in this cell).
best_model.save(model_output_dir)


In [None]:
# Evaluate the selected MLP through the project helper, passing the metrics and figure paths.
# NOTE(review): the test arrays are passed before the validation arrays — confirm this
# matches the parameter order of mlp_evaluate_model in Modelling/MLP/MLPnew.py.
mlp_evaluate_model(best_model, X_train, y_train, X_test, y_test,X_val, y_val, metrics_output_dir, fig_output_dir, num_epochs=10)


### Combined Model


In [12]:
# Re-read the six split CSVs written earlier (image splits first, then text splits).
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
# NOTE(review): text_train/text_val/text_test replace the preprocessed MLP tables of the same name.
_mini_example_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"
_split_names = ("train", "val", "test")

image_train, image_val, image_test = (
    pd.read_csv(f"{_mini_example_dir}/image_{_split}.csv") for _split in _split_names
)

text_train, text_val, text_test = (
    pd.read_csv(f"{_mini_example_dir}/text_{_split}.csv") for _split in _split_names
)


In [2]:
# Settings for the combined model.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
_mini_example_dir = "/Users/adelelauzon/Desktop/MSc/STA5243/2453Github/MiniExample"
_split_names = ("train", "val", "test")

image_size = (300, 300)
batch_size = 64

# Paths: one CSV per split, listed in train / val / test order
image_paths = [f"{_mini_example_dir}/image_{_split}.csv" for _split in _split_names]
text_paths = [f"{_mini_example_dir}/text_{_split}.csv" for _split in _split_names]

# Vignette folder (with trailing slash), output folder and the two saved sub-models
vignette_path = f"{_mini_example_dir}/vignettes_mini/"
output_dir = _mini_example_dir
mlp_path = f"{_mini_example_dir}/mlp_final_model.keras"
cnn_path = f"{_mini_example_dir}/cnn_final_model.keras"


In [None]:
# Prepare data
# image_size / batch_size repeat the values from the settings cell; they are not passed
# to prepare_data in this call.
image_size, batch_size = (300, 300), 64

# prepare_data returns attribute arrays, image arrays and labels for each split
# (train, val, test), followed by the number of classes.
(
    trainAttrX, trainImagesX, trainY,
    valAttrX, valImagesX, valY,
    testAttrX, testImagesX, testY,
    num_classes,
) = prepare_data(image_paths, text_paths, vignette_path)


In [None]:
# Train and save model
# Passes the saved MLP and CNN model paths, the class count, the output folder and the
# train/validation arrays to the project helper, training for 10 epochs. The test arrays
# from prepare_data are not used in this call.
train_collaborative_model(mlp_path, cnn_path, num_classes, output_dir, trainAttrX, trainImagesX, trainY, valAttrX, valImagesX, valY, num_epochs=10)
