In [1]:
# List of imports.
import os
import os.path
import glob

import numpy as np
from PIL import Image

In [2]:
from keras.applications import VGG16
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input

Using TensorFlow backend.


In [25]:
# Build the dataset: one row of raw pixels per image (X) and one integer-valued
# label per image (y), where the label is the index of the image's family folder.
imagedir = "Datasets/malimg_paper_dataset_imgs"

# Kept for any later cell that refers to it; this cell never changes the
# working directory, so an exception cannot leave the kernel stranded inside
# the dataset folder.
cur_dir = os.getcwd()

# Every sub-folder of imagedir is one family. Plain files that may sit next to
# the family folders are ignored.
list_fams = sorted(
    (entry for entry in os.listdir(imagedir)
     if os.path.isdir(os.path.join(imagedir, entry))),
    key=str.lower,
)  # vector of strings with family names

# List the image files ONCE per family. Both the per-family counts (labels)
# and the feature rows below are derived from this single listing, so they
# cannot disagree on which files belong to the dataset.
fam_files = [
    image.list_pictures(os.path.join(imagedir, fam), ext='jpg|jpeg|bmp|png')
    for fam in list_fams
]
no_imgs = [len(files) for files in fam_files]  # No. of samples per family
num_samples = int(np.sum(no_imgs))  # total number of all samples

# Compute the labels: family index repeated once per image of that family.
for label, (fam, n_fam) in enumerate(zip(list_fams, no_imgs)):
    print ("Label:%2d\tFamily: %s\tNumber of images: %d" % (label, fam, n_fam))
y = np.repeat(np.arange(len(list_fams), dtype=float),
              np.asarray(no_imgs, dtype=int))

# Compute the features
width, height, channels = (224, 224, 3)
# np.zeros defaults to float64; each row is one flattened image.
X = np.zeros((num_samples, width * height * channels))
cnt = 0
print("Processing images ...")
for files in fam_files:
    for img_file in files:
        img = image.load_img(img_file, target_size=(height, width))
        x = image.img_to_array(img)
        # The batch axis is kept on `x` because a later cell passes the last
        # loaded image straight to model.predict(x).
        x = np.expand_dims(x, axis=0)
        # Pixels are stored as loaded: no rescaling and no preprocess_input.
        X[cnt] = x.reshape(-1)
        cnt += 1
print("Images processed: %d" % (cnt))

Label: 0	Family: Adialer.C	Number of images: 122
Label: 1	Family: Agent.FYI	Number of images: 116
Label: 2	Family: Allaple.A	Number of images: 2949
Label: 3	Family: Allaple.L	Number of images: 1591
Label: 4	Family: Alueron.gen!J	Number of images: 198
Label: 5	Family: Autorun.K	Number of images: 106
Label: 6	Family: C2LOP.gen!g	Number of images: 200
Label: 7	Family: C2LOP.P	Number of images: 146
Label: 8	Family: Dialplatform.B	Number of images: 177
Label: 9	Family: Dontovo.A	Number of images: 162
Label:10	Family: Fakerean	Number of images: 381
Label:11	Family: Instantaccess	Number of images: 431
Label:12	Family: Lolyda.AA1	Number of images: 213
Label:13	Family: Lolyda.AA2	Number of images: 184
Label:14	Family: Lolyda.AA3	Number of images: 123
Label:15	Family: Lolyda.AT	Number of images: 159
Label:16	Family: Malex.gen!J	Number of images: 136
Label:17	Family: Obfuscator.AD	Number of images: 142
Label:18	Family: Rbot!gen	Number of images: 158
Label:19	Family: Skintrim.N	Number of images: 8

In [26]:
# Inspect the feature matrix built above: one flattened image per row.
X

array([[  90.,   90.,   90., ...,  238.,  238.,  238.],
       [  90.,   90.,   90., ...,   32.,   32.,   32.],
       [  90.,   90.,   90., ...,  103.,  103.,  103.],
       ..., 
       [   0.,    0.,    0., ...,  105.,  105.,  105.],
       [   0.,    0.,    0., ...,  105.,  105.,  105.],
       [   0.,    0.,    0., ...,  105.,  105.,  105.]])

In [27]:
# (number of images, width * height * channels)
X.shape

(9339, 150528)

In [28]:
# Load VGG16 with the ImageNet weights; include_top=True keeps the final
# classification layers rather than returning only convolutional features.
model = VGG16(weights='imagenet', include_top=True)
print('VGG16 model loaded.')

VGG16 model loaded.


In [30]:
# NOTE(review): `x` is not defined in this cell. It is the variable left over
# from the image-loading loop, i.e. the last image that was loaded, with a
# leading batch axis. This cell therefore only works after that loop has run.
# NOTE(review): `x` holds pixel values exactly as loaded; no preprocess_input
# is applied to it before prediction — confirm that this is intended.
preds = model.predict(x)

In [31]:
# Show the model output for the single image passed to predict above.
preds

array([[  6.25647161e-08,   1.00574152e-06,   1.14567577e-07,
          4.64175258e-07,   2.08213010e-07,   1.59856279e-06,
          1.91743759e-08,   4.34762200e-07,   2.26486264e-07,
          1.44188903e-07,   6.64400517e-08,   7.36960573e-08,
          1.64732043e-07,   4.03872309e-06,   1.69874667e-07,
          1.40938607e-06,   1.37456100e-08,   2.26212549e-07,
          4.65191215e-06,   2.87616331e-07,   4.24341124e-07,
          9.72530998e-08,   1.66273685e-06,   8.06267678e-07,
          5.47052650e-07,   1.81180184e-07,   6.36537777e-07,
          8.42178508e-07,   7.99013122e-08,   1.38082896e-07,
          4.67123451e-09,   9.81871722e-08,   2.29360744e-08,
          1.37690694e-08,   2.79008918e-06,   9.87232980e-08,
          7.31304681e-07,   6.22165544e-07,   4.55476112e-07,
          2.92150215e-08,   2.46164063e-07,   6.04383388e-07,
          3.70842095e-08,   7.67338619e-08,   1.03657726e-06,
          2.86766308e-08,   1.29931053e-07,   2.22950902e-07,
        

In [32]:
# One row per input image; the columns are the model's output units.
preds.shape

(1, 1000)