In [None]:
import os
import os.path
import glob
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold                                                                                                                       
from sklearn.metrics import confusion_matrix,accuracy_score

import numpy as np
np.random.seed(1)

from keras import backend as K
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing import image
from keras.applications import VGG16
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.imagenet_utils import _obtain_input_shape
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential,Model
from keras.layers import Input,Flatten,Dense,Dropout,GlobalAveragePooling2D,Conv2D,MaxPooling2D

In [None]:
# Folder holding the image dataset; it contains one sub-folder per family.
imagedir = "Datasets/malimg_paper_dataset_imgs"

# Remember the notebook's starting directory only the first time this cell
# runs. Without the guard, re-running the cell after the chdir below would
# overwrite cur_dir with the dataset folder and the later
# os.chdir(cur_dir) could no longer return to the starting directory.
if "cur_dir" not in globals():
    cur_dir = os.getcwd()

# Resolve the dataset path against the starting directory so the chdir also
# succeeds when the working directory has already been changed.
os.chdir(os.path.join(cur_dir, imagedir))  # the parent folder with sub-folders

In [None]:
# Get number of samples per family
base_dir = os.getcwd()  # the dataset folder (the working directory at this point)

# Only sub-directories are families. Plain files living next to them (for
# example the .npy arrays this notebook saves into the working directory)
# must be skipped, otherwise os.chdir() below fails on a re-run.
list_fams = sorted(
    (name for name in os.listdir(base_dir)
     if os.path.isdir(os.path.join(base_dir, name))),
    key=str.lower)  # vector of strings with family names

no_imgs = []  # No. of samples per family
for fam in list_fams:
    os.chdir(os.path.join(base_dir, fam))
    try:
        # assuming the images are stored as 'png'
        no_imgs.append(len(glob.glob('*.png')))
    finally:
        # Always return to the dataset folder, even if counting failed,
        # so the notebook is never left inside a family folder.
        os.chdir(base_dir)

# int() keeps the total usable as an array dimension even when no family
# folder exists (np.sum of an empty list is the float 0.0).
num_samples = int(np.sum(no_imgs))  # total number of all samples

In [None]:
# Compute the labels: samples are grouped by family, so each family occupies
# one contiguous run of y filled with that family's integer id.
y = np.zeros(num_samples)
offset = 0
for fam_id, fam_count in enumerate(no_imgs):
    print ("Label:%2d\tFamily: %15s\tNumber of images: %d" % (fam_id, list_fams[fam_id], fam_count))
    y[offset:offset + fam_count] = fam_id
    offset += fam_count
num_classes = len(no_imgs)  # one class per family
np.save("malimg-y_numclass.npy", y)

In [None]:
# Compute the features: load every image, resized to 224x224 RGB, into X.
# Images are visited family by family, in the same family order that was used
# to build the label vector, so row k of X corresponds to entry k of y.
width, height, channels = (224, 224, 3)

print("Processing images ...")
img_files = [img_file
             for fam in list_fams
             for img_file in image.list_pictures(fam, ext='jpg|jpeg|bmp|png')]

# The per-family counts were taken from '*.png' files only, while
# list_pictures() accepts several extensions. If the two disagree, labels and
# features would be misaligned (or X would overflow), so stop with a clear
# message instead of saving corrupt data.
if len(img_files) != num_samples:
    raise ValueError("Found %d image files but %d samples were counted; "
                     "labels and features would be misaligned"
                     % (len(img_files), num_samples))

X = np.zeros((num_samples, height, width, channels))
cnt = 0
for img_file in img_files:
    img = image.load_img(img_file, target_size=(height, width))
    # NOTE(review): raw pixel values are stored; neither scaling nor
    # preprocess_input() is applied before the VGG16 layers see the data.
    # Confirm this is intended.
    X[cnt] = image.img_to_array(img)
    cnt += 1
print("Images processed: %d" %(cnt))
np.save("malimg-X_9339x224x224x3.npy", X)

In [None]:
# Return to the directory that was current before changing into the image
# folder (saved in cur_dir), so that files written from here on land there.
os.chdir(cur_dir)

In [None]:
# Turn the class labels (y) into integer codes (y_encoded), then expand those
# codes into a one-hot matrix (Y) and store it on disk.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
Y = np_utils.to_categorical(y_encoded)
np.save("malimg-Y_onehotencode.npy", Y)

In [None]:
# Build base_model: VGG16 with ImageNet weights and without its top
# (classifier) layers, taking 224x224 RGB inputs.
image_shape = (224, 224, 3)
base_model = VGG16(include_top=False, weights='imagenet', input_shape=image_shape)

# Mark every VGG16 layer as non-trainable.
for vgg_layer in base_model.layers:
    vgg_layer.trainable = False

In [None]:
# Extracting features from VGG16 convolutional layers:
# run all images in X through base_model and save the resulting array to disk.
vggfeatures = base_model.predict(X)
np.save("malimg-vgg16features.npy", vggfeatures)

In [None]:
# Loading VGG16 extracted features from 'malimg-vgg16features.npy'.
# NOTE(review): presumably this lets a session reuse previously extracted
# features without re-running the extraction step; confirm.
vggfeatures = np.load('malimg-vgg16features.npy')

In [None]:
# Create stratified k-fold subsets
kfold = 10  # no. of folds
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=1)

# One (train indices, test indices) pair per fold:
# skfind[i][0] -> train indices, skfind[i][1] -> test indices
skfind = list(skf.split(X, y))

In [None]:
# Training top_model and saving min training loss weights.
# For every fold a fresh classifier is trained on the pre-computed VGG16
# features of that fold's training split, with the test split used as
# validation data.
num_epochs = 500

# NOTE(review): this single checkpoint callback is created once and passed to
# every fold's fit() call, and all folds write to the same weights file --
# confirm that keeping one file across folds is intended.
checkpointer = ModelCheckpoint(
    filepath='top_model-weights-min_loss-mlp-vgg16-2layers-160neurons-relu-0.4dropout-Adam-500epochs.h5',
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0)

for train_indices, test_indices in skfind:
    X_train, Y_train = vggfeatures[train_indices], Y[train_indices]
    X_test, Y_test = vggfeatures[test_indices], Y[test_indices]
    y_test = y[test_indices]

    # Two hidden layers of 160 ReLU units, each followed by 40% dropout,
    # then a softmax over the classes.
    top_model = Sequential([
        Flatten(input_shape=base_model.output_shape[1:]),  # input_shape=(7,7,512)
        Dense(160, activation='relu', name='fc1'),
        Dropout(0.4),
        Dense(160, activation='relu', name='fc2'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax', name='predictions'),
    ])
    top_model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # batch_size equals the number of training rows: one update per epoch.
    top_model.fit(
        X_train, Y_train,
        validation_data=(X_test, Y_test),
        epochs=num_epochs,
        batch_size=X_train.shape[0],
        verbose=1,
        callbacks=[checkpointer])

In [None]:
# Loading the top_model trained in the previous step.
# The architecture must match the one used for training exactly, because only
# the weights (not the model definition) were checkpointed.
top_model = Sequential()
top_model.add(Flatten(input_shape=base_model.output_shape[1:]))  # input_shape=(7,7,512)
top_model.add(Dense(160, activation='relu', name='fc1'))
top_model.add(Dropout(0.4))
top_model.add(Dense(160, activation='relu', name='fc2'))
top_model.add(Dropout(0.4))
top_model.add(Dense(num_classes, activation='softmax', name='predictions'))
top_model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Load the checkpoint file that the training step actually writes (the
# 'min_loss' file). The previously referenced 'max_val_acc' file is not
# produced anywhere in this notebook, so a fresh run failed here.
top_model.load_weights("top_model-weights-min_loss-mlp-vgg16-2layers-160neurons-relu-0.4dropout-Adam-500epochs.h5")

In [None]:
# Fine-tuning the model (base_model + top_model)
# NOTE(review): one checkpoint callback (and one weights file) is shared by all
# folds -- confirm that keeping a single file across folds is intended.
checkpointer = ModelCheckpoint(filepath='model-weights-max_val_acc-mlp-vgg16-2layers-160neurons-relu-0.4dropout-Adam-10epochs.h5', monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=True, mode='max')

# Unfreeze the VGG16 layers themselves, and do it BEFORE compile(): the
# layers were individually marked non-trainable when base_model was created,
# and trainable changes made after compile() are not picked up by that
# compiled model.
for layer in base_model.layers:
    layer.trainable = True
base_model.trainable = True
top_model.trainable = True

# Snapshot the starting weights (taken after unfreezing so the weight order is
# the same when they are restored). Every fold must start from this state;
# otherwise weights fine-tuned on one fold's training data leak into the next
# fold, whose test split was part of that training data.
base_start_weights = base_model.get_weights()
top_start_weights = top_model.get_weights()

for i in range(kfold):
    train_indices = skfind[i][0]
    test_indices = skfind[i][1]
    X_train = X[train_indices]
    Y_train = Y[train_indices]
    X_test = X[test_indices]
    Y_test = Y[test_indices]
    y_test = y[test_indices]

    # Reset both sub-models to the common starting point for this fold.
    base_model.set_weights(base_start_weights)
    top_model.set_weights(top_start_weights)

    # Stack the convolutional base and the classifier into one end-to-end model.
    model = Sequential()
    model.add(base_model)
    model.add(top_model)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=300, verbose=1, callbacks=[checkpointer])