In [1]:
import glob
import csv
import os
from tsne import bh_sne
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import StratifiedKFold                                                                                                                       
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
from numpy.random import RandomState
np.random.seed(1)

In [2]:
flist = glob.glob("train/*.bytes")

In [3]:
num_samples = len(flist)
print("Number of samples: %i" %(num_samples))

Number of samples: 10868


In [4]:
flist

['train/2jf7mkt8YCUM3AriKVas.bytes',
 'train/3oMQAPEXqKpiBlHvOJYz.bytes',
 'train/AxMQBldJRotHf69j32PG.bytes',
 'train/27o93DUCHcGPXdTxlNwg.bytes',
 'train/kaz4GujLsES3IDtnXc10.bytes',
 'train/9gBVZNKoIPkFM8vOp45G.bytes',
 'train/jvGIm6Yu2XKcE8hr4gMy.bytes',
 'train/aZKEkJF7CQtNSDf64d0P.bytes',
 'train/7cA9IVu3YOH5r8hK42wD.bytes',
 'train/3ftwoRhscPpNCxdlIbrU.bytes',
 'train/hGrRVejLFzNtXcBW6s2p.bytes',
 'train/5fsyaH409vx3SUXN2jOq.bytes',
 'train/7RVrih9gAL1FodIKNTc2.bytes',
 'train/l3iRBsCVb4Z81TLNxSgM.bytes',
 'train/HZI8SimVhNLz1JD5fbyW.bytes',
 'train/hrMLJ7DFIfmUGPB2q8y1.bytes',
 'train/DkNYd806tFbgoJ4zf1UA.bytes',
 'train/je3Im0RAozk8M7gnlXiO.bytes',
 'train/i5XIeh8WyrO79NklcwBJ.bytes',
 'train/gPUaRd12vzpH7lkV5DhG.bytes',
 'train/FalYuVzr8LgtiKRZ9oEy.bytes',
 'train/JVdaIxt2DBu8Ez0cH5gY.bytes',
 'train/IBEoqZCidDj8LlUGg6u9.bytes',
 'train/aM4JPZrzBXhgSCF8UTVb.bytes',
 'train/4BrHgYquho9skZMR06ea.bytes',
 'train/3k1meX0gV2WMjAvGDrCq.bytes',
 'train/dZv5FgTBxasz7GMhKRSL.bytes',
 

In [6]:
vectorizer = HashingVectorizer(input='filename', ngram_range=(3, 3), stop_words=None, token_pattern=r'\b\w\w\b')

In [None]:
X = vectorizer.transform(flist)

In [None]:
X = X.todense()

In [None]:
X

In [None]:
X.shape

In [None]:
trainLabelFile = "trainLabels.csv"

In [None]:
with open(trainLabelFile) as f:
    labelDict = dict(filter(None, csv.reader(f)))

In [None]:
labelDict

In [None]:
y = np.zeros(num_samples)
pos = 0
for f in flist:
    fname = os.path.splitext(os.path.basename(f))[0]
    print("File: %s - Class: %s" %(fname,labelDict[fname]))
    y[pos] = int(labelDict[fname])
    pos += 1

In [None]:
y

In [None]:
y.shape

In [None]:
uniques, no_imgs = np.unique(y, return_counts=True)
no_imgs

In [None]:
list_fams = ['Ramnit','Lollipop','Kelihos_ver3','Vundo','Simda','Tracur','Kelihos_ver1','Obfuscator.ACY','Gatak']

In [None]:
print("Running t-SNE ...")
vis_data = bh_sne(np.float64(X), d=2, perplexity=30., theta=0.5, random_state=RandomState(1))

In [None]:
print("Plotting t-SNE ...")
figure = plt.gcf()
figure.set_size_inches(24, 18)
plt.scatter(vis_data[:, 0], vis_data[:, 1], c=y, cmap=plt.cm.get_cmap("gist_ncar", len(list_fams)),edgecolors="black")
plt.clim(-0.5, len(list_fams)-0.5)
cbar = plt.colorbar(ticks=range(len(list_fams)))
cbar.ax.set_yticklabels(list_fams)                     
plt.show()

In [None]:
# Create stratified k-fold subsets                                                                                                                                        
kfold = 10  # no. of folds                                                                 
skf = StratifiedKFold(kfold, shuffle=True,random_state=1)
skfind = [None] * kfold  # skfind[i][0] -> train indices, skfind[i][1] -> test indices
cnt = 0                                              
for index in skf.split(X, y):         
    skfind[cnt] = index                                                 
    cnt += 1 

In [None]:
# Training top_model and saving min training loss weights
n_neighbors = 1
conf_mat = np.zeros((len(list_fams),len(list_fams))) # Initializing the Confusion Matrix
for i in range(kfold):
    train_indices = skfind[i][0]
    test_indices = skfind[i][1]
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    top_model = KNeighborsClassifier(n_neighbors, weights='distance',n_jobs=-1)
    top_model.fit(X_train,y_train)  # Training
    y_pred = top_model.predict(X_test)  # Testing
    print("[%d] Test acurracy: %.4f" %(i,accuracy_score(y_test,y_pred)))
    cm = confusion_matrix(y_test,y_pred)  # Compute confusion matrix for this fold
    conf_mat = conf_mat + cm  # Compute global confusion matrix

In [None]:
# Computing the average accuracy
avg_acc = np.trace(conf_mat)/sum(no_imgs)
print("Average acurracy: %.4f" %(avg_acc))

In [None]:
# Viewing the confusion matrix
conf_mat = conf_mat.T  # since rows and cols are interchangeable
conf_mat_norm = conf_mat/no_imgs  # Normalizing the confusion matrix

In [None]:
print("Plotting the confusion matrix")
conf_mat = np.around(conf_mat_norm,decimals=2)  # rounding to display in figure
figure = plt.gcf()
figure.set_size_inches(24, 18)
plt.imshow(conf_mat,interpolation='nearest')
for row in range(len(list_fams)):
    for col in range(len(list_fams)):
        plt.annotate(str(conf_mat[row][col]),xy=(col,row),ha='center',va='center')
plt.xticks(range(len(list_fams)),list_fams,rotation=90,fontsize=10)
plt.yticks(range(len(list_fams)),list_fams,fontsize=10)
plt.title('Confusion matrix')
plt.colorbar()
plt.show()