# Studying model performance with synthetic data

Synthetic data have been created for each class, using as input the pictures that all models misclassify. The idea is the following: by adding more pics similar to the misclassified ones, we may be able to improve the performance of the model in those areas.

- There is a test set with 150 real pics of each class.
- Now, for each class, there are 3 other train/validation folders:
  - With all real pics and 50 synthetic pics
  - With all real pics and 250 synthetic pics
  - With all real pics and 480 synthetic pics


- The study of what pics were generated synthetically can be found here => https://github.com/albertovpd/viu_tfm-deep_vision_classification/tree/synthetic_data_study

- They were created using the pytorch implementation of this repo => https://github.com/mit-han-lab/data-efficient-gans


In [None]:
# Mount Google Drive inside the Colab VM; the dataset and helper-module
# paths used later in this notebook live under /content/drive/.
from google.colab import drive

drive.mount(mountpoint='/content/drive/', force_remount=True)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# tf
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

- libs

In [None]:
%tensorflow_version 2.x
# batch ingestion of pics without pickle
from tensorflow.keras.preprocessing import image_dataset_from_directory

# nns
from tensorflow.keras.applications import ResNet50 

from tensorflow.keras import Model
from tensorflow.keras.models import load_model # Sequential
from tensorflow.keras import layers 

# optimization
from tensorflow.keras.optimizers import SGD #Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping

# nn architectures, metrics, viz & reports => written in my_functions202202 file
import sys
sys.path.append("/content/drive/My Drive/2-Estudios/viu-master_ai/tfm-deep_vision/src")
from my_functions202202 import generic_last_2layers, plotting_model, model_evaluation, classification_report_pic, confusion_matrix_report

import numpy as np
%matplotlib inline

# navigating through folder
import os

- paths

In [None]:
# Project root on the mounted Drive. It ends with "/", so every sub-path
# below is appended WITHOUT a leading slash (the original output_folder had
# one, which produced ".../tfm-deep_vision//output/").
base_folder = "/content/drive/My Drive/2-Estudios/viu-master_ai/tfm-deep_vision/"

# Dataset for this study: several train/validation variants and one test set.
trainval_folders = base_folder + "input/dataset_synth_data-1test_3trainval/train_val_ds/"
test_folder = base_folder + "input/dataset_synth_data-1test_3trainval/test_ds/"

# Helper-module location and destination for plots / reports.
src_folder = base_folder + "src/"
output_folder = base_folder + "output/"

- functions

In [None]:
#my_functions202202.py

- common parameters

In [None]:
# Settings shared by every dataset loader and training run in this notebook.
image_size = (128, 128)        # size the loaders resize each picture to
batch_size, epochs = 128, 250  # pictures per batch, training epochs per run
# SGD with momentum; the learning rate is left at the Keras default.
opt = SGD(momentum=0.9)

In [None]:
# ImageNet-pretrained ResNet50 backbone without its classification head.
# - input_shape is derived from `image_size` so the backbone always matches
#   what the dataset loaders produce (previously hard-coded to 128x128).
# - `classes` is not passed: Keras only uses it when include_top=True, so
#   with include_top=False it had no effect.
base_model_resnet50 = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=image_size + (3,),
)

# Models to train, keyed by a descriptive name. The classification head is
# added by generic_last_2layers (imported from my_functions202202).
models_dict = {
    "resnet50_NOdataAug_dropoutFirst007": generic_last_2layers(
        data_augmentation=None,
        nn=base_model_resnet50,
        neurons_final_layer=5,
        dropout_layers=True,
        dropout_position="first",
        dropout_percent=0.07,
    ),
}

In [None]:
# Held-out test set, loaded once for all runs.
# shuffle stays False: predictions and true labels are collected in two
# separate passes over the dataset, so the order must be deterministic.
test_ds = image_dataset_from_directory(
    directory=test_folder,
    class_names=["Bedroom", "Bathroom", "Dinning", "Livingroom", "Kitchen"],
    color_mode='rgb',
    image_size=image_size,
    batch_size=batch_size,
    shuffle=False,
    seed=None,
    validation_split=None,
    subset=None,
)

# Irregular partitions for train set

The motivation is to check performance vs lack of data. Then:

- 150 pics for each class are saved in the val_ds (indeed, the very same as before)
- with the rest of them, 4 subfolders are created. Each time, the data is randomly shuffled:
  - 20% are stored in test_ds folder
  - 80%, 62%, 46%, 30% of the remaining pics are saved for the train dataset. The rest is discarded.



In [None]:
# Display the entries found under the train/validation root folder.
os.listdir(trainval_folders)

In [None]:
# total = 0
# for root, dirs, files in os.walk(trainval_folders):
#     total += len(files)
#     print(dirs, len(files))

In [None]:
# Train/validation variants to iterate over.
# - sorted: os.listdir returns entries in arbitrary order; sorting makes the
#   run order reproducible.
# - directories only: stray files in the Drive folder must not be treated
#   as dataset folders by the training loop.
folders = sorted(
    entry
    for entry in os.listdir(trainval_folders)
    if os.path.isdir(os.path.join(trainval_folders, entry))
)
folders

In [None]:
# Train and evaluate the model once per train/validation folder, writing the
# training curves, metrics, classification report and confusion matrix for
# both the validation split and the shared test set to `output_folder`.

# Class order is fixed explicitly so label indices are identical across the
# train, validation and test datasets.
room_classes = ["Bedroom", "Bathroom", "Dinning", "Livingroom", "Kitchen"]

# The same model object is used for every folder. Snapshot its starting
# weights once so each run can be reset to them; otherwise a run would start
# from the weights learned on the previous folder and the results would not
# be comparable.
model_name, nn = next(iter(models_dict.items()))
initial_weights = nn.get_weights()

for f in folders:
    print("\n ================================",
          "\n FOLDER : ", f)
    print("train dataset")
    train_path = trainval_folders + f + "/" + 'train_ds/'
    print(train_path)
    # shuffle=True for training: with shuffle=False the files are read in
    # sorted order, which makes the mini-batches class-ordered and hurts SGD.
    train_ds = image_dataset_from_directory(
        train_path,
        class_names=room_classes,
        image_size=image_size,
        batch_size=batch_size,
        color_mode='rgb',
        shuffle=True,
    )

    print("\n val dataset")
    val_path = trainval_folders + f + "/" + "val_ds"
    # shuffle=False for validation: predictions and labels are gathered in
    # two separate passes below and must stay aligned.
    val_ds = image_dataset_from_directory(
        val_path,
        class_names=room_classes,
        image_size=image_size,
        batch_size=batch_size,
        color_mode='rgb',
        shuffle=False,
    )

    class_names = train_ds.class_names
    print(class_names)

    # Sanity check: number of files found in each train/val sub-directory.
    paths = ['train_ds/', 'val_ds/']
    for p in paths:
        for dirpath, _subdirs, files in os.walk(trainval_folders + f + "/" + p):
            print(dirpath, ' => ', p, str(len(files)))

    # calling model
    print("\n", model_name)
    nn.summary()

    # Start every run from the same initial weights and with a fresh
    # optimizer (same configuration as `opt`, but no momentum state carried
    # over from the previous folder).
    nn.set_weights(initial_weights)
    # NOTE(review): from_logits=True assumes the head built by
    # generic_last_2layers outputs raw logits (no softmax) - confirm.
    nn.compile(
        optimizer=type(opt).from_config(opt.get_config()),  # "adam"
        loss=SparseCategoricalCrossentropy(from_logits=True),  # 'categorical_crossentropy'
        metrics=['accuracy'],  # "recall"
    )

    # ====== USING VAL DATASET ======

    history = nn.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        # callbacks = callbacks # <=== REMOVE CALLBACK for full results
    )

    number_of_epochs_it_ran = len(history.history['loss'])
    print("run epochs: ", number_of_epochs_it_ran)
    name = model_name + "_irregKfolds_" + f
    # models_dict[m].save(output_folder+name+".h5")

    # saving model accuracy/loss graph
    plotting_model(history, number_of_epochs_it_ran, name, output_folder, "val")

    # saving model metrics to json; evaluated on val_ds (the original used
    # test_ds here, so the "_trainVal" metrics duplicated the test metrics).
    # batch_size is not passed: the dataset is already batched.
    evaluation = nn.evaluate(val_ds, return_dict=True)
    model_evaluation(evaluation, output_folder, name + "_trainVal")

    # get inferences
    y_pred_val_float = nn.predict(val_ds)
    y_pred_val = np.argmax(y_pred_val_float, axis=1)

    # get real labels for val_ds
    y_target = tf.concat([labels for _, labels in val_ds], axis=0)

    # classification and confusion matrix reports
    classification_report_pic(y_pred_val, y_target, class_names, output_folder, name + "_trainVal")
    confusion_matrix_report(y_pred_val, y_target, class_names, output_folder, name + "_trainVal")

    # ====== USING TEST DATASET ======

    # saving model metrics to json
    evaluation_test = nn.evaluate(test_ds, return_dict=True)
    model_evaluation(evaluation_test, output_folder, name + "_trainTest")

    # get inferences
    y_pred_test_float = nn.predict(test_ds)
    y_pred_test = np.argmax(y_pred_test_float, axis=1)

    # get real labels for test_ds
    y_target_test = tf.concat([labels for _, labels in test_ds], axis=0)

    # classification and confusion matrix reports
    classification_report_pic(y_pred_test, y_target_test, class_names, output_folder, name + "_trainTest")
    confusion_matrix_report(y_pred_test, y_target_test, class_names, output_folder, name + "_trainTest")