# Dataset Preprocessing

## Importing libraries

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/AN2DL Homework1

Mounted at /gdrive
[Errno 2] No such file or directory: '/gdrive/My Drive/AN2DL Homework1'
/content


In [2]:
# Fix randomness and hide warnings
# Single seed value shared by every generator seeded in this notebook
seed = 164

import os
# NOTE(review): '3' is presumably TensorFlow's most restrictive native log level — confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here may only affect child processes — confirm this has the intended effect
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's config directory at ./configs/ under the current working directory
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# Ignore FutureWarning, then every Warning subclass (the second filter already covers the first)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seed NumPy's global random generator
np.random.seed(seed)

import logging

import random
# Seed Python's built-in random module
random.seed(seed)

In [3]:
# Import tensorflow
import tensorflow as tf

from tensorflow import keras as tfk
# NOTE(review): layers are imported from the standalone `keras` package while `tfk`
# is `tf.keras`; confirm both resolve to the same Keras installation
from keras import layers as tfkl
# Lower autograph verbosity and restrict the TensorFlow loggers to ERROR level
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow through both the TF2 and the compat.v1 entry points
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.14.0


In [4]:
# Import other libraries
import cv2
from keras.applications.mobilenet import preprocess_input
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from PIL import Image
import pandas as pd

## Data analysis

In [5]:
# Load the dataset
# The path is relative, so the archive is looked up in the current working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code — only use it on a trusted public_data.npz
data = np.load('public_data.npz', allow_pickle=True)

# Materialise the 'data' and 'labels' entries of the archive as arrays
images = np.array(data['data'])
labels = np.array(data['labels'])

FileNotFoundError: ignored

In [None]:
# Preview a window of consecutive images starting at a random position
num_img = 10          # how many images to show
size = labels.size    # total number of samples

# Scale one uniform draw to a valid start index so each run shows a different window
random_index = int(np.random.rand() * (size - num_img))

# 2-row grid; every image in the window gets its own cell
# NOTE(review): imshow expects float RGB data in [0, 1]; if the images are floats in
# [0, 255] the preview will look saturated — confirm the dtype of `images`
fig, axes = plt.subplots(2, num_img // 2, figsize=(20, 9))
for i in range(random_index, random_index + num_img):
    # Row from the parity of the absolute index, column from its position modulo num_img
    ax = axes[i % 2, (i % num_img) // 2]
    ax.set_title('true label: ' + labels[i] + ' index: ' + str(i))
    ax.imshow(np.clip(images[i], 0, 255))  # clip pixel values to [0, 255] before drawing
    ax.axis('off')
plt.tight_layout()
plt.show()


In [None]:
# Display Shrek
# Index 3144 is hard-coded; the same sample is later used as the Shrek reference
# image when outliers are removed
plt.imshow(images[3144])
plt.show()

In [None]:
# Display troll
# Index 723 is hard-coded; the same sample is later used as the troll reference
# image when outliers are removed
plt.imshow(images[723])
plt.show()

## Outlier elimination

In [None]:
# Reference copies of the two outlier images (hard-coded sample indexes)
shrek = images[3144]
troll = images[723]

# Indexes we have to delete: every image that is pixel-for-pixel equal to one of
# the two references.
# Exact equality is used instead of cv2.subtract because OpenCV subtraction
# saturates: on unsigned data every pixel that is >= the reference pixel gives 0,
# so an image that is merely brighter than the reference would be reported as
# identical to it.
# The loop bound comes from `images` itself, so this cell no longer depends on the
# `size` variable defined in the preview cell.
fck_indexes = [
  i for i in range(images.shape[0])
  if np.array_equal(images[i], shrek) or np.array_equal(images[i], troll)
]

# Delete outliers (the same rows are dropped from both arrays to keep them aligned)
images = np.delete(images, fck_indexes, axis=0)

labels = np.delete(labels, fck_indexes, axis=0)

In [None]:
# Report the dimensions of the image and label arrays of the clean dataset
for caption, array in (("Data Shape:", images), ("Label Shape:", labels)):
  print(caption, array.shape)

## Check for duplicates

In [None]:
# We look for duplicated images in the dataset
# We are going to remove the duplicates since they may cause overfit in our model
#
# Images are grouped by a digest of their raw pixel bytes, so each image is read
# once instead of being compared against every other image. This also avoids
# cv2.subtract, whose saturating arithmetic on unsigned data reports an image as
# identical whenever all of its pixels are >= those of the other image.
# Note: the comparison is byte-wise; for float data it differs from numeric
# equality only for -0.0 vs 0.0 and for NaN values.
# NOTE(review): assumes `images` has a numeric dtype (not object) — confirm.
import hashlib

n_images = np.size(images, 0)

# digest -> ascending list of the indexes of the images sharing those bytes
groups = {}
# group_of[i] is the (shared) member list of the group image i belongs to
group_of = []
for i in range(n_images):
  digest = hashlib.sha256(images[i].tobytes()).digest()
  members = groups.setdefault(digest, [])
  members.append(i)
  group_of.append(members)

# Every ordered pair [i, j] (i != j) of identical images, ordered by i and then j,
# i.e. the same content the nested all-pairs scan produced
same_indexes = [[i, j] for i in range(n_images) for j in group_of[i] if j != i]


In [None]:
# Each duplicate was recorded twice ([i, j] and [j, i]); sorting every pair and
# collecting the results in a set keeps one ascending pair per duplicate
unique_pairs = set(tuple(sorted(pair)) for pair in same_indexes)

# Back to a list of two-element lists
to_remove = [list(pair) for pair in unique_pairs]

# Check label consistency: gather the labels of both members of every pair
duplicati_labels = [[labels[first], labels[second]] for first, second in to_remove]

# flag becomes 1 as soon as one pair of identical images carries different labels
flag = 0
for first_label, second_label in duplicati_labels:
  if first_label != second_label:
    flag = 1

if flag == 1:
  print('labels not consistent')

# Save only the indexes: the lower index of each pair is dropped, so the
# highest-indexed copy of every group of identical images is the one kept
to_remove_indexes = [pair[0] for pair in to_remove]


In [None]:
# Eliminate duplicates
# The same row indexes are dropped from both arrays so images and labels stay aligned
images = np.delete(images, to_remove_indexes, axis=0)

labels = np.delete(labels, to_remove_indexes, axis=0)

In [None]:
# Save the clean dataset
# The arrays are passed positionally, so np.savez stores them under its default
# keys: 'arr_0' (images) and 'arr_1' (labels), not 'data'/'labels' as in the input file
np.savez('dataset_refined', images, labels)

## Balancing dataset: SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# We balance the classes using SMOTE
# Initialize the SMOTE object (seeded so the synthetic samples are reproducible)
smote = SMOTE(sampling_strategy='auto', random_state=seed)

# Remember the per-image shape before flattening; it is read from the data instead
# of being hard-coded to (96, 96, 3), so other image sizes are restored correctly
original_shape = images.shape[1:]

# Flatten the images: SMOTE is fed a 2-D matrix of shape (num_samples, num_features)
images_flat = images.reshape(images.shape[0], -1)

# Oversample; 'images_smote' and 'labels_smote' contain the resampled dataset
images_smote, labels_smote = smote.fit_resample(images_flat, labels)

# Restore the image shape and replace the working arrays with the balanced ones
images = images_smote.reshape(-1, *original_shape)
labels = labels_smote

In [None]:
# Count how many samples carry each distinct label
unique, counts = np.unique(labels, return_counts=True)

print('Number of occurences:')
# One "<label>: <count>" line per class
for class_name, class_count in zip(unique, counts):
  print(class_name + ': ' + str(class_count))

In [None]:
# Save the balanced dataset
# Compressed archive; the arrays are passed positionally, so they are stored under
# np.savez_compressed's default keys 'arr_0' (images) and 'arr_1' (labels)
np.savez_compressed('smoted_dataset', images, labels)