# Import libraries

In [1]:
using Pkg

using Flux, MLDataPattern, Mill, JsonGrinder, JSON, Statistics, IterTools, StatsBase, ThreadTools
using JsonGrinder: suggestextractor, ExtractDict
using Mill: reflectinmodel
using CSV, DataFrames
using Random
using Dates
using Plots
using Printf

In [2]:
# Number of Julia threads available to this session. It is used further down to
# split the training indexes into chunks whose schemas are computed with `tmap`.
THREADS = Threads.nthreads() 

1

# Import data

In [3]:
# Root directory of the dataset. It defaults to the location used when the notebook
# was written; set the `MALWARE_DATASET_DIR` environment variable (no trailing slash)
# to run the notebook against a copy of the dataset stored elsewhere.
DATASET_DIR = get(ENV, "MALWARE_DATASET_DIR", "C:/Users/aleca/PycharmProjects/MasterThesis/dataset1")

# Directories holding one `<name>.json` report per sample, and the CSV with the labels.
# The two directory paths keep their trailing slash because the file name is appended
# to them by plain string interpolation when the reports are loaded.
PATH_BEN_REPORTS = "$(DATASET_DIR)/ben_behavior/"
PATH_MAL_REPORTS = "$(DATASET_DIR)/mal_behavior/"
PATH_TO_LABELS = "$(DATASET_DIR)/labels_behavior.csv" ;

In [4]:
# Load the label table. The `label` column is read here; the `name` column is read
# later, when the reports are loaded.
df = CSV.read(PATH_TO_LABELS, DataFrame);

# Keep at most 20 samples per class (label 0 and label 1). `first(df, n)` returns
# fewer rows when a class has fewer than `n` samples, instead of throwing the
# `BoundsError` that indexing with `[1:20, :]` would raise.
samples_per_class = 20
df_labels_ben = first(filter("label" => x -> x == 0, df), samples_per_class)
df_labels_mal = first(filter("label" => x -> x == 1, df), samples_per_class);

# Benign rows first, then malicious rows.
df_labels = vcat(df_labels_ben, df_labels_mal)

# Number of distinct labels actually present in the selection.
n_classes = length(unique(df_labels.label));

In [5]:
@time begin
    # Parse one report per row of `df_labels`. The label selects the directory:
    # 1 -> PATH_MAL_REPORTS, 0 -> PATH_BEN_REPORTS. A failure is logged and the
    # entry becomes `nothing`, so the remaining reports are still loaded.
    jsons = map(df_labels.name, df_labels.label) do n, y
        try
            # Reject unknown labels explicitly; otherwise `path` would be undefined
            # and the failure would surface as an unrelated `UndefVarError`.
            y in (0, 1) || throw(ArgumentError("unexpected label $y, expected 0 or 1"))
            path = y == 1 ? PATH_MAL_REPORTS : PATH_BEN_REPORTS
            open(JSON.parse, "$(path)$(n).json")
        catch e
            @error "Error when processing sha $n: $e"
            nothing
        end
    end
end

# Drop the samples that failed to load from BOTH the reports and the label table,
# so that position `i` of `jsons` still corresponds to row `i` of `df_labels`.
loaded = .!isnothing.(jsons)
if !all(loaded)
    @warn "Dropping $(count(!, loaded)) sample(s) whose report could not be loaded"
    # `map(identity, …)` narrows the element type once the `nothing`s are gone.
    jsons = map(identity, jsons[loaded])
    df_labels = df_labels[loaded, :]
end

n_samples = length(jsons)
println("N samples: $(n_samples)")
println("N labels: $(n_classes)")
# Internal invariant: reports and labels must stay aligned.
@assert n_samples == length(df_labels.label)

230.440142 seconds (39.00 M allocations: 3.362 GiB, 1.41% gc time, 0.10% compilation time)
N samples: 40
N labels: 2


In [6]:
# Inspect the container type of the parsed reports.
typeof(jsons)

Vector{Dict{String, Any}} (alias for Array{Dict{String, Any}, 1})

# Split train and test, and define the schema 

In [7]:
# Random train/test split over the sample positions 1..n_samples.
tr_frac = 0.8
n_train = round(Int, tr_frac * n_samples)

# One random permutation of all positions: the first `n_train` entries form the
# training set, the remaining ones (in permutation order) form the test set.
idx = shuffle(1:n_samples)
train_indexes = idx[1:n_train]
test_indexes = idx[(n_train + 1):end]

train_size = length(train_indexes)
test_size = length(test_indexes)

println("Train size: $(train_size)")
println("Test size: $(test_size)")

Train size: 32
Test size: 8


In [8]:
# Compute the schema from the TRAINING samples only, one chunk of samples per `tmap`
# task, then merge the partial schemas into one.
# `Iterators.partition` needs a chunk length of at least 1, but
# `div(train_size, THREADS)` is 0 whenever there are more threads than training
# samples, so the length is clamped.
chunk_length = max(1, div(train_size, THREADS))
chunks = Iterators.partition(train_indexes, chunk_length)
sch_parts = tmap(chunks) do ch
    JsonGrinder.schema(jsons[ch])
end
complete_schema = merge(sch_parts...)
printtree(complete_schema)

┌ Info: In path [value]: Instability in the schema detected. Using multiple representation.
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\schema.jl:13
┌ Info: In path [iid]: Instability in the schema detected. Using multiple representation.
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\schema.jl:13
┌ Info: In path [iid]: Instability in the schema detected. Using multiple representation.
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\schema.jl:13
┌ Info: In path [type]: Instability in the schema detected. Using multiple representation.
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\schema.jl:13
┌ Info: In path [desired_access]: Instability in the schema detected. Using multiple representation.
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\schema.jl:13


[34m[Dict][39m[90m  # updated = 32[39m
[34m  ├── processtree: [39m[31m[List][39m[90m  # updated = 32[39m
[34m  │                [39m[31m  ╰── [39m[32m[Dict][39m[90m  # updated = 84[39m
[34m  │                [39m[31m      [39m[32m  ├────── children: [39m[33m[List][39m[90m  # updated = 84[39m
[34m  │                [39m[31m      [39m[32m  │                 [39m[33m  ╰── [39m[36m[Dict][39m[90m  # updated = 10[39m
[34m  │                [39m[31m      [39m[32m  │                 [39m[33m      [39m[36m  ├────── children: [39m[35m[List][39m[90m  # updated = 10[39m
[34m  │                [39m[31m      [39m[32m  │                 [39m[33m      [39m[36m  │                 [39m[35m  ╰── [39m[34m[Dict][39m[90m  # updated = 35[39m
[34m  │                [39m[31m      [39m[32m  │                 [39m[33m      [39m[36m  │                 [39m[35m      [39m[34m  ├────── children: [39m[31m[List][39m[90m  # updated =

Excessive output truncated after 524331 bytes.

[39m[Scalar - Int64], 1 unique values[90m  # updated = 1[39m
[34m  │                [39m[31m  │         [39m[32m  ├────────────────────── OleInitialize: [39m[39m[Scalar - Int64], 1 unique values[90m  # updated = 1[39m
[34m  │                [39m[31m  │         [39m[32m  ├──────────────── GetForegroundWindow: [39m[39m[Scalar - Int64], 1 unique values[90m  # updated = 1[39m
[34m  │                [39m[31m  │         [39m[32m  ├────────────────────── GetSystemInfo: [39m[39m[Scalar - Int64], 2 unique values[90m  # updated = 2[39m
[34m  │                [39m[31m  │         [39m[32m  ├──────────────── NtFreeVirtualMemory: [39m[39m[Scalar - Int64], 2 unique values[90m  # updated = 2[39m
[34m  │                [39m[31m  │         [39m[32m  ├───────────────────── RegSetValueExA: [39m[39m[Scalar - Int64], 1 unique values[90m  # updated = 1[39m
[34m  │                [39m[31m  │         [39m[32m  ├──────────────────────── NtOpenKeyEx: [39m[39

In [9]:
# Derive an extractor from the training schema and apply it to every report
# (training and test samples alike), keeping the order of `jsons`.
# NOTE(review): in the recorded run this step raised
# `MethodError: no method matching val2idx(::ExtractCategorical{Number, UInt32}, ::Nothing)`.
# Presumably a JSON `null` reaches a categorical extractor — confirm the cause and
# handle it, since every later cell depends on `data`.
extractor = suggestextractor(complete_schema);
data = [extractor(report) for report in jsons];

└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\dict.jl:61
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\dict.jl:61
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\dict.jl:61


LoadError: MethodError: no method matching val2idx(::ExtractCategorical{Number, UInt32}, ::Nothing)
[0mClosest candidates are:
[0m  val2idx(::ExtractCategorical{var"#s60", I} where var"#s60"<:Number, [91m::Number[39m) where {V, I} at C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\extractors\extractcategorical.jl:100
[0m  val2idx(::ExtractCategorical{V, I}, [91m::V[39m) where {V, I} at C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\extractors\extractcategorical.jl:99

In [10]:
# Show the extracted representation of the first sample.
first(data)

LoadError: UndefVarError: data not defined

# Define the model

In [11]:
# Width of every hidden layer, and the sorted list of distinct labels (used for
# one-hot encoding and for decoding predictions).
neurons = 32
labelnames = sort(unique(df_labels.label))

# Build a model mirroring the structure of the schema/extractor:
#   * every node gets a `Dense(in, neurons, relu)` layer,
#   * bags are aggregated with `SegmentedMeanMax`,
#   * the root ("" key of `fsm`) ends in a linear layer with `n_classes` outputs.
model = reflectinmodel(
    complete_schema,
    extractor,
    in_dim -> Dense(in_dim, neurons, relu),
    agg_dim -> SegmentedMeanMax(agg_dim);
    fsm = Dict("" => in_dim -> Dense(in_dim, n_classes)),
)

# Requested number of samples per training minibatch.
minibatchsize = 10

"""
    minibatch()

Draw a random minibatch from the training set.

Return a tuple `(x, y)` where `x` is the batch obtained by concatenating the sampled
entries of `data` with `catobs`, and `y` is the one-hot encoding of their labels with
respect to `labelnames`.

The samples are drawn without replacement, so the batch size is clamped to the number
of training samples; asking `StatsBase.sample` for more than that would throw.
"""
function minibatch()
    batch_size = min(minibatchsize, length(train_indexes))
    idx = StatsBase.sample(train_indexes, batch_size, replace = false)
    return reduce(catobs, data[idx]), Flux.onehotbatch(df_labels.label[idx], labelnames)
end

# Number of minibatches (gradient steps) used for training.
iterations = 200

"""
    calculate_accuracy(x, y)

Return the fraction of samples in `x` whose predicted label equals the matching
entry of `y`.

Each sample is passed through the global `model` on its own (via `tmap`); the
prediction is the entry of `labelnames` with the highest softmax score.
"""
function calculate_accuracy(x, y)
    predicted = tmap(x) do sample
        probabilities = softmax(model(sample))
        first(Flux.onecold(probabilities, labelnames))
    end
    return mean(predicted .== y)
end
    

# Shuffled copies of the train/test indexes, evaluated by the progress callback.
eval_trainset = shuffle(train_indexes)
eval_testset = shuffle(test_indexes)

# Progress callback: print the current accuracy on both evaluation sets.
cb = function ()
    train_acc = calculate_accuracy(data[eval_trainset], df_labels.label[eval_trainset])
    test_acc = calculate_accuracy(data[eval_testset], df_labels.label[eval_testset])
    println("accuracy: train = ", train_acc, ", test = ", test_acc)
end

# Trainable parameters, loss on the raw model outputs (logits), and the optimiser.
ps = Flux.params(model)
loss = (batch, targets) -> Flux.logitcrossentropy(model(batch), targets)
opt = ADAM()

LoadError: MethodError: no method matching val2idx(::ExtractCategorical{Number, UInt32}, ::Nothing)
[0mClosest candidates are:
[0m  val2idx(::ExtractCategorical{var"#s60", I} where var"#s60"<:Number, [91m::Number[39m) where {V, I} at C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\extractors\extractcategorical.jl:100
[0m  val2idx(::ExtractCategorical{V, I}, [91m::V[39m) where {V, I} at C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\extractors\extractcategorical.jl:99

In [12]:
# Train on `iterations` freshly sampled minibatches; the accuracy callback is
# throttled so that it runs at most once every 2 seconds.
batches = repeatedly(minibatch, iterations)
throttled_cb = Flux.throttle(cb, 2)
Flux.Optimise.train!(loss, ps, batches, opt; cb = throttled_cb)

LoadError: UndefVarError: minibatch not defined

# Accuracy and ROC curve

In [13]:
# Accuracy of the trained model on the complete training and test sets.
full_train_accuracy = calculate_accuracy(data[train_indexes], df_labels.label[train_indexes])
full_test_accuracy = calculate_accuracy(data[test_indexes], df_labels.label[test_indexes])
println("Final evaluation:")
# The messages previously misspelled "Accuracy" as "Accuratcy".
println("Accuracy on train data: $(full_train_accuracy)")
println("Accuracy on test data: $(full_test_accuracy)")

LoadError: UndefVarError: data not defined

In [14]:
using ROC: roc, AUC

# Concatenate the test samples into a single batch before calling the model, the same
# way `minibatch` prepares its training batches, instead of passing a `Vector` of
# individual samples to the model.
test_batch = reduce(catobs, data[test_indexes])

# Row 2 of the softmax output corresponds to the second entry of the sorted
# `labelnames`, i.e. label 1 when the labels are 0 and 1.
scores = softmax(model(test_batch))[2, :]

# The third argument is the label value treated as the positive class
# (`1 == true` in Julia, so it selects the samples labelled 1).
roc_curve = roc(scores, df_labels.label[test_indexes], true);

println("AUC: $(AUC(roc_curve))")
plot(roc_curve, lw = 2)



LoadError: UndefVarError: data not defined