# XNature V2 Classifier


The XNature V2 dataset contains x-ray images of 8 different classes:
<ol>
<li>Fruits
<li>Guns
<li>Keys
<li>Knifes
<li>Razors
<li>Salmons
<li>Shurikens
<li>Wood
</ol>

The objectives of this notebook are to explore the dataset, preprocess the image data and create an image classifier using Deep Learning.<br><br>
For this particular task the Keras API will be used, since it is faster and simpler than plain TensorFlow for testing many hypotheses with Deep Learning.

## Data Exploration

#### Getting the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten
from keras.models import Sequential
from os import listdir
from os.path import isfile, join
import numpy as np
# Root folder of the dataset; each entry kept in `pastas` is later listed as a
# folder of images and its name is used as the class label.
mypath = "C:\\Users\\rodri\\Desktop\\xnaturev2\\XNature"

# Entries found next to the class folders that must not be treated as classes
# (notebook files, the Jupyter checkpoint folder and an 'XNature' entry).
non_class_entries = {'.ipynb_checkpoints', 'Untitled.ipynb', 'Untitled1.ipynb', 'XNature'}

# Filtering (instead of calling list.remove once per name) keeps this cell from
# failing with ValueError when one of those entries does not exist.
pastas = [entry for entry in listdir(mypath) if entry not in non_class_entries]

Using TensorFlow backend.


In [None]:
from PIL import Image
x = []  # resized images, one array per file
y = []  # class label (name of the containing folder) of each image
siz = (30,30)  # target size handed to Image.resize

for pasta in pastas:
    path_pastas = join(mypath, pasta)
    files = listdir(path_pastas)
    for file in files:
        # The context manager closes the image file as soon as the resized
        # copy has been turned into an array, so file handles do not pile up
        # while walking the whole dataset.
        with Image.open(join(path_pastas, file)) as img:
            img_rsz = img.resize(siz)
            x.append(np.asarray(img_rsz))
        y.append(pasta)

        

In [None]:
# Draw 20 random images together with their labels and show them in a grid

x = np.array(x)
y = np.array(y)
rand = np.random.randint(0, x.shape[0], 20)
sample_imgs, sample_labels = x[rand], y[rand]

# 2 x 10 grid of axes, one per sampled image
num_rows, num_cols = 2, 10
f, ax = plt.subplots(
    num_rows,
    num_cols,
    figsize=(12,5),
    gridspec_kw={'wspace':0.03, 'hspace':0.01},
    squeeze=True,
)

# ax.flat walks the grid row by row, i.e. in the order r * num_cols + c
for image_index, cell in enumerate(ax.flat):
    cell.axis("off")
    cell.imshow(sample_imgs[image_index], cmap='gray')
    cell.set_title(sample_labels[image_index])
plt.show()
plt.close()

In [None]:
# Report how many images were loaded and how many distinct labels they carry
# (the message previously started with the typo "So fat").
print('So far we\'ve seen how some of these images look like in grayscale and we are able to address each of the '
      '{0} images\ninto {1} unique categories, just as it was described earlier'.format(x.shape[0], len(np.unique(y))))

## Separating the data into train and test

For this part, scikit-learn's `train_test_split` will be used, because it is a very simple way to split the data.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# fraction of the data held out for testing, and a fixed seed so that the
# split can be reproduced
test_size = .20
random_state = 42

# turn the y labels into integer codes
LE = LabelEncoder()
LE.fit(y)
y_lab = LE.transform(y)

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_lab,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Size of each split and number of distinct classes present in each of them
print("Summary:")
print("Number of train imgs:", x_train.shape[0])
print("Number of test imgs:", x_test.shape[0])
print("Number of train labels:", len(np.unique(y_train)))
# This value is the count of distinct labels in the test split, so the caption
# says "labels" rather than repeating "imgs".
print("Number of test labels:", len(np.unique(y_test)))

Let's take a look at the distribution of labels in our dataset

In [None]:
# Share of samples per class in each split, printed as a percentage
unique_elements_train, counts_elements_train = np.unique(y_train, return_counts=True)
unique_elements_test, counts_elements_test = np.unique(y_test, return_counts=True)

print('In the training dataset: ')
for label, count in zip(unique_elements_train, counts_elements_train):
    print('Class {0} {1:.2f}%'.format(label, count/len(y_train)*100))

print('\n\nIn the testing dataset: ')
for label, count in zip(unique_elements_test, counts_elements_test):
    print('Class {0} {1:.2f}%'.format(label, count/len(y_test)*100))
    

Looking at the percentages above, we can see that the train and test arrays have closely matching label distributions.

## Data Processing

In [None]:
# Display the shape of the random sample drawn earlier; the image height and
# width used for the model input are read from its first image in the next cell
sample_imgs.shape

In [None]:
from keras.utils import to_categorical
# Image geometry is read from the first sampled image; one channel is used
first_img_shape = sample_imgs[0].shape
img_height, img_width = first_img_shape[0], first_img_shape[1]
num_channels = 1

# Add an explicit trailing channel axis to both splits
# (assumes x_train / x_test are laid out as (n_images, height, width))
train_data = x_train.reshape(x_train.shape[:3] + (num_channels,))
test_data = x_test.reshape(x_test.shape[:3] + (num_channels,))

In [None]:
# Cast to float32 and divide the pixel values by 255
train_data = np.divide(train_data.astype('float32'), 255.)
test_data = np.divide(test_data.astype('float32'), 255.)

# One-hot encode the integer labels over the 8 classes
num_classes = 8
train_labels_cat = to_categorical(y_train, num_classes=num_classes)
test_labels_cat = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Shuffle the training images and their labels with one shared permutation.
# A single draw is enough (drawing several and keeping only the last adds
# nothing), and seeding it with the notebook's `random_state` makes the
# train/validation split below reproducible.
indexes = np.random.RandomState(random_state).permutation(len(train_data))

train_data = train_data[indexes]
train_labels_cat = train_labels_cat[indexes]

# now set aside 10% of the shuffled train data/labels as the validation set
val_perc = 0.10
val_count = int(val_perc * len(train_data))

# the first val_count samples become the validation set,
# the remaining ones are kept for training
val_data = train_data[:val_count]
val_labels_cat = train_labels_cat[:val_count]
train_data2 = train_data[val_count:]
train_labels_cat2 = train_labels_cat[val_count:]

In [None]:
# Small CNN: three Conv2D + MaxPooling2D stages followed by a dense classifier
model = Sequential()

# Only the first layer of a Sequential model needs input_shape; the following
# layers take their input shape from the layer before them.
model.add(Conv2D(32, (2,2), padding='same', activation='relu',
                 input_shape=(img_height, img_width, num_channels)))
model.add(MaxPooling2D(2,2))
model.add(Conv2D(64, (2,2), padding='same', activation='relu'))
model.add(MaxPooling2D(2,2))
model.add(Conv2D(128, (2,2), padding='same', activation='relu'))
model.add(MaxPooling2D(2,2))
model.add(Flatten())

model.add(Dense(128, activation='relu'))

# One output unit per class; num_classes is the same value used to one-hot
# encode the labels, so the two cannot drift apart.
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

In [None]:
# Train for 15 epochs with batches of 64 samples, passing the held-out
# validation split so validation metrics are recorded alongside training ones
results = model.fit(
    x=train_data2,
    y=train_labels_cat2,
    batch_size=64,
    epochs=15,
    validation_data=(val_data, val_labels_cat),
)

In [None]:
def show_plots(history, plot_title=None, fig_size=None):
    """Plot loss and accuracy per epoch from a Keras training history.

    Args:
        history: dict of per-epoch metric lists, e.g. the ``history``
            attribute of the object returned by ``model.fit``. It must hold
            ``'loss'`` and the training accuracy, stored under ``'acc'`` or
            ``'accuracy'`` depending on the Keras version. ``'val_loss'`` and
            ``'val_acc'`` / ``'val_accuracy'`` are plotted when present.
        plot_title: optional title drawn above both plots.
        fig_size: optional ``(width, height)`` of the figure; ``(16, 4)`` is
            used when omitted.

    Raises:
        AssertionError: if ``history`` is not a dict.
        KeyError: if ``'loss'`` or the training accuracy is missing.
    """
    assert isinstance(history, dict)

    loss_vals = history['loss']
    val_loss_vals = history.get('val_loss')

    # Older Keras releases record accuracy as 'acc', newer ones as 'accuracy';
    # accept either spelling (a KeyError is raised when both are missing).
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    acc_vals = history[acc_key]
    val_acc_vals = history.get('val_' + acc_key)

    epochs = range(1, len(acc_vals) + 1)

    f, ax = plt.subplots(nrows=1, ncols=2, figsize=((16, 4) if fig_size is None else fig_size))

    # losses on the left axes; training values are drawn as markers only
    ax[0].plot(epochs, loss_vals, color='navy', marker='o', linestyle=' ', label='Training Loss')
    if val_loss_vals is not None:
        ax[0].plot(epochs, val_loss_vals, color='firebrick', marker='*', label='Validation Loss')
        ax[0].set_title('Training & Validation Loss')
        ax[0].legend(loc='best')
    else:
        ax[0].set_title('Training Loss')

    ax[0].set_xlabel('Epochs')
    ax[0].set_ylabel('Loss')
    ax[0].grid(True)

    # accuracies on the right axes, same layout as the losses
    ax[1].plot(epochs, acc_vals, color='navy', marker='o', ls=' ', label='Training Accuracy')
    if val_acc_vals is not None:
        ax[1].plot(epochs, val_acc_vals, color='firebrick', marker='*', label='Validation Accuracy')
        ax[1].set_title('Training & Validation Accuracy')
        ax[1].legend(loc='best')
    else:
        ax[1].set_title('Training Accuracy')

    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Accuracy')
    ax[1].grid(True)

    if plot_title is not None:
        plt.suptitle(plot_title)

    plt.show()
    plt.close()
        

In [None]:
# Plot the loss and accuracy curves recorded by model.fit
show_plots(history=results.history)

### Now into the test set

In [None]:
********************************************************