In [1]:
import os
import os.path
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold                                                                                                                       
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sklearn_evaluation.plot as skplot
from sklearn.model_selection import learning_curve
from sklearn import svm

import matplotlib.pyplot as plt
import matplotlib.cm as colormap
# Notebook-wide default colormap for image plots
plt.rcParams['image.cmap'] = 'Paired'

import numpy as np
# Seed NumPy's global random number generator
np.random.seed(1)

from keras.utils import np_utils
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3,preprocess_input

# Disable GPU: a device list of '-1' is meant to leave no CUDA device visible to this process.
# NOTE(review): this is assigned after keras has already been imported; presumably the
# backend only reads the variable when it first initialises CUDA - confirm it takes effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

Using TensorFlow backend.


In [2]:
# Dataset root, relative to the notebook's working directory; the loading cell
# expects one sub-folder per malware family underneath it
imagedir = "Datasets/malimg_paper_dataset_imgs"

In [3]:
# Load the dataset: one sub-folder of `imagedir` per malware family, PNG images inside.
# Produces list_fams, no_imgs, num_samples, num_classes, y (labels), X (pixels)
# and list_paths (absolute image paths, row-aligned with X and y).
cur_dir = os.getcwd()  # kept for later cells; the working directory is never changed here

# Every path is resolved against an absolute root instead of os.chdir()-ing in and
# out of the folders, so an exception part-way through cannot leave the kernel
# stranded inside the dataset directory (which would break a re-run of this cell).
image_root = os.path.abspath(imagedir)  # the parent folder with sub-folders

# Family names are the sub-folder names, sorted case-insensitively. Plain files
# sitting next to the family folders are skipped.
list_fams = sorted(
    (entry for entry in os.listdir(image_root)
     if os.path.isdir(os.path.join(image_root, entry))),
    key=str.lower)  # vector of strings with family names

# List each family's images exactly once so that counts, labels and pixel data
# all come from the same listing. The listing is sorted because glob returns
# files in filesystem order, which differs between machines.
# NOTE(review): a feature cache extracted before this ordering was introduced
# still matches y (labels are constant within a family) but may not match
# list_paths row-for-row - delete the cache if per-image paths matter.
fam_paths = [sorted(glob.glob(os.path.join(image_root, fam, '*.png')))  # assuming the images are stored as 'png'
             for fam in list_fams]
no_imgs = [len(paths) for paths in fam_paths]  # No. of samples per family
num_samples = int(np.sum(no_imgs))  # total number of all samples
if num_samples == 0:
    raise ValueError("No PNG images found in the sub-folders of %s" % image_root)

# Compute the labels: family k contributes no_imgs[k] consecutive entries equal to k
num_classes = len(list_fams)
for label, (fam, n_imgs) in enumerate(zip(list_fams, no_imgs)):
    print ("Label:%2d\tFamily: %15s\tNumber of images: %d" % (label, fam, n_imgs))
y = np.repeat(np.arange(num_classes, dtype=np.float64), no_imgs)

# Compute the features
width, height, channels = (224, 224, 3)
# float32 halves the footprint of the float64 default.
# NOTE(review): presumably lossless, since img_to_array is expected to return
# float32 by default - confirm for the installed Keras version.
X = np.zeros((num_samples, height, width, channels), dtype=np.float32)
cnt = 0
list_paths = []  # List of image paths
print("Processing images ...")
for paths in fam_paths:
    for img_file in paths:
        list_paths.append(img_file)
        img = image.load_img(img_file, target_size=(height, width))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)  # preprocess_input is fed a batch of one
        x = preprocess_input(x)
        X[cnt] = x[0]
        cnt += 1
print("Images processed: %d" %(cnt))

Label: 0	Family:       Adialer.C	Number of images: 122
Label: 1	Family:       Agent.FYI	Number of images: 116
Label: 2	Family:       Allaple.A	Number of images: 2949
Label: 3	Family:       Allaple.L	Number of images: 1591
Label: 4	Family:   Alueron.gen!J	Number of images: 198
Label: 5	Family:       Autorun.K	Number of images: 106
Label: 6	Family:     C2LOP.gen!g	Number of images: 200
Label: 7	Family:         C2LOP.P	Number of images: 146
Label: 8	Family:  Dialplatform.B	Number of images: 177
Label: 9	Family:       Dontovo.A	Number of images: 162
Label:10	Family:        Fakerean	Number of images: 381
Label:11	Family:   Instantaccess	Number of images: 431
Label:12	Family:      Lolyda.AA1	Number of images: 213
Label:13	Family:      Lolyda.AA2	Number of images: 184
Label:14	Family:      Lolyda.AA3	Number of images: 123
Label:15	Family:       Lolyda.AT	Number of images: 159
Label:16	Family:     Malex.gen!J	Number of images: 136
Label:17	Family:   Obfuscator.AD	Number of images: 142
Label:18

In [4]:
# Sanity check: one entry per image, each a 224 x 224 x 3 array
X.shape

(9339, 224, 224, 3)

In [5]:
# Encoding classes (y) into integers (y_encoded) and then generating one-hot-encoding (Y)
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)
Y = np_utils.to_categorical(y_encoded)

In [6]:
# Creating base_model (InceptionV3 notop)
image_shape = (224, 224, 3)                                                                                                                                                                                                                                                                                            
base_model = InceptionV3(weights='imagenet', input_shape=image_shape, include_top=False)

In [7]:
# Extract InceptionV3 features for every image, caching them on disk because
# the forward pass over the whole dataset is expensive.
filename = 'malimg-inceptionv3features.npy'
inceptionv3features = None
if os.path.exists(filename):
    print("Loading InceptionV3 extracted features from %s ..." %(filename))
    inceptionv3features = np.load(filename)
    # A cache written for a different dataset (or an interrupted/partial run)
    # would silently misalign features and labels, so check the row count.
    if inceptionv3features.shape[0] != X.shape[0]:
        print("Cached features hold %d samples but %d images were loaded - re-extracting ..."
              % (inceptionv3features.shape[0], X.shape[0]))
        inceptionv3features = None
if inceptionv3features is None:
    print("Extracting features from InceptionV3 layers ...")
    inceptionv3features = base_model.predict(X)
    print("Saving InceptionV3 extracted features into %s ..." %(filename))
    np.save(filename, inceptionv3features)

Loading InceptionV3 extracted features from malimg-inceptionv3features.npy ...


In [8]:
# Shape of the features as loaded/extracted, before any flattening
inceptionv3features.shape

(9339, 5, 5, 2048)

In [9]:
# Collapse everything after the sample axis into one flat feature vector per image
inceptionv3features = inceptionv3features.reshape(inceptionv3features.shape[0], -1)

In [None]:
# Shape check: expected to be 2-D (samples, flattened features) at this point
inceptionv3features.shape

(9339, 51200)

In [None]:
# Training top_model
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

top_model = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=10, n_jobs=-1)
top_model.fit(inceptionv3features,y)  # Training 

Fitting 5 folds for each of 169 candidates, totalling 845 fits
[CV] C=0.01, gamma=1e-09 .............................................
[CV] C=0.01, gamma=1e-09 .............................................
[CV] C=0.01, gamma=1e-09 .............................................
[CV] C=0.01, gamma=1e-09 .............................................
[CV] C=0.01, gamma=1e-09 .............................................
[CV] C=0.01, gamma=1e-08 .............................................
[CV] C=0.01, gamma=1e-08 .............................................
[CV] C=0.01, gamma=1e-08 .............................................
[CV] C=0.01, gamma=1e-08 .............................................
[CV] C=0.01, gamma=1e-08 .............................................
[CV] C=0.01, gamma=1e-07 .............................................
[CV] C=0.01, gamma=1e-07 .............................................
[CV] C=0.01, gamma=1e-07 .............................................
[CV] C=0.01, g

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 336.5min


[CV] C=0.01, gamma=0.0001 ............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.315846, total=220.5min
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ............. C=0.01, gamma=1e-05, score=0.401499, total=217.5min
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ............. C=0.01, gamma=1e-05, score=0.401499, total=218.7min
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ............. C=0.01, gamma=1e-06, score=0.315846, total=218.5min
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ............. C=0.01, gamma=1e-05, score=0.401499, total=217.7min
[CV] C=0.01, gamma=0.001 .............................................
[CV] ............. C=0.01, gamma=1e-08, score=0.315846, total=218.9min
[CV] C=0.01, gamma=0.001 .............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.315846, total=217.9min
[CV] C

[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 469.3min


[CV] C=0.01, gamma=0.01 ..............................................
[CV] ............. C=0.01, gamma=1e-05, score=0.401499, total=220.0min
[CV] C=0.01, gamma=0.01 ..............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.315846, total=218.0min
[CV] C=0.01, gamma=0.01 ..............................................
[CV] ............. C=0.01, gamma=1e-08, score=0.315846, total=218.6min
[CV] ............. C=0.01, gamma=1e-07, score=0.315846, total=220.3min
[CV] ............. C=0.01, gamma=1e-08, score=0.315846, total=218.3min
[CV] ............. C=0.01, gamma=1e-06, score=0.315846, total=220.2min
[CV] C=0.01, gamma=0.01 ..............................................
[CV] C=0.01, gamma=0.1 ...............................................
[CV] C=0.01, gamma=0.1 ...............................................
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ............. C=0.01, gamma=1e-07, score=0.315846, total=218.8min
[CV] C

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 501.1min


[CV] C=0.01, gamma=1.0 ...............................................
[CV] ............ C=0.01, gamma=0.0001, score=0.614026, total=176.4min
[CV] C=0.01, gamma=1.0 ...............................................
[CV] ............. C=0.01, gamma=1e-05, score=0.401499, total=207.8min
[CV] C=0.01, gamma=1.0 ...............................................
[CV] ............ C=0.01, gamma=0.0001, score=0.612420, total=182.6min
[CV] C=0.01, gamma=10.0 ..............................................
[CV] ............ C=0.01, gamma=0.0001, score=0.621520, total=204.2min
[CV] C=0.01, gamma=10.0 ..............................................
[CV] ............ C=0.01, gamma=0.0001, score=0.617238, total=212.5min
[CV] C=0.01, gamma=10.0 ..............................................
[CV] ............ C=0.01, gamma=0.0001, score=0.619379, total=209.0min
[CV] C=0.01, gamma=10.0 ..............................................


In [None]:
# Report the parameter combination the grid search selected, with its best_score_
print("Best parameters: %s - score: %0.4f" % (top_model.best_params_, top_model.best_score_))

In [None]:
# Raw grid-search results (the same mapping plot_grid_search reads its scores from)
top_model.cv_results_

In [None]:
# NOTE(review): grid_scores_ is the legacy result attribute; it is believed to be
# deprecated since scikit-learn 0.18 and removed in 0.20 in favour of cv_results_ -
# confirm against the installed version before relying on this cell.
top_model.grid_scores_

In [None]:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot the mean cross-validation score of a two-parameter grid search.

    ``grid_param_1`` runs along the log-scaled x-axis and one curve is drawn
    for every value of ``grid_param_2``.

    Parameters
    ----------
    cv_results : mapping
        Must provide ``'mean_test_score'`` with one score per parameter
        combination, ordered so that ``grid_param_1`` varies fastest (the
        scores are reshaped to ``(len(grid_param_2), len(grid_param_1))``).
    grid_param_1 : sequence
        Values plotted on the x-axis.
    grid_param_2 : sequence
        Values that each get their own curve.
    name_param_1 : str
        Label of the x-axis.
    name_param_2 : str
        Prefix of every legend entry.

    Returns
    -------
    The axes the curves were drawn on, so callers can tweak the plot further.

    Raises
    ------
    ValueError
        If the number of scores is not ``len(grid_param_1) * len(grid_param_2)``.
    """
    n_curves, n_points = len(grid_param_2), len(grid_param_1)

    # Mean test score of every grid point, one row per value of grid_param_2
    scores_mean = np.asarray(cv_results['mean_test_score'])
    if scores_mean.size != n_curves * n_points:
        raise ValueError(
            "cv_results holds %d scores but the grid is %d x %d"
            % (scores_mean.size, n_curves, n_points))
    scores_mean = scores_mean.reshape(n_curves, n_points)

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for val, curve in zip(grid_param_2, scores_mean):
        ax.plot(grid_param_1, curve, '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)  # explicit boolean instead of relying on the truthiness of 'on'

    ax.set_xscale('log')
    return ax

In [None]:
# Mean CV score against gamma (x-axis), one curve per C value
plot_grid_search(top_model.cv_results_,
                 grid_param_1=gamma_range,
                 grid_param_2=C_range,
                 name_param_1='Gamma',
                 name_param_2='C')
# Enlarge the figure the helper has just created before rendering it
figure = plt.gcf()
figure.set_size_inches(24, 9)
plt.show()

In [None]:
# Bar chart of the grid-search results with gamma as the varied parameter.
# NOTE(review): grid_scores_ is the legacy GridSearchCV attribute (believed removed in
# scikit-learn 0.20); which attribute skplot.grid_search expects depends on the
# installed sklearn-evaluation version - confirm before changing.
skplot.grid_search(top_model.grid_scores_, change='gamma', kind='bar')
# Resize the current figure before rendering it
figure = plt.gcf()
figure.set_size_inches(24, 9)
plt.show()

In [None]:
# Bar chart of the grid-search results with C as the varied parameter.
# NOTE(review): grid_scores_ is the legacy GridSearchCV attribute (believed removed in
# scikit-learn 0.20); which attribute skplot.grid_search expects depends on the
# installed sklearn-evaluation version - confirm before changing.
skplot.grid_search(top_model.grid_scores_, change='C', kind='bar')
# Resize the current figure before rendering it
figure = plt.gcf()
figure.set_size_inches(24, 9)
plt.show()

In [None]:
# Learning curve for an SVC configured with the C/gamma the grid search selected,
# evaluated on the same kind of splits at ten training-set fractions (10% .. 100%)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
svm_model = svm.SVC(
    C=top_model.best_params_['C'],
    gamma=top_model.best_params_['gamma'],
)
train_sizes = np.linspace(.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    svm_model,
    inceptionv3features,
    y,
    cv=cv,
    n_jobs=-1,
    train_sizes=train_sizes,
)

In [None]:
# Grab the current figure and size it before anything is drawn
figure = plt.gcf()
figure.set_size_inches(24, 9)
# Plot the train/test scores computed by learning_curve()
# (presumably onto the current figure - TODO confirm)
skplot.learning_curve(train_scores, test_scores, train_sizes)
plt.show()