# Import libraries

In [1]:
using Pkg

using Flux, MLDataPattern, Mill, JsonGrinder, JSON, Statistics, IterTools, StatsBase, ThreadTools
using JsonGrinder: suggestextractor, ExtractDict
using Mill: reflectinmodel
using CSV, DataFrames
using Random
using Dates
using Plots
using Printf

#using Zygote, MLDataPattern

#ENV["PYTHON"]="C:\\Users\\aleca\\anaconda3\\python.exe"
#Pkg.build("PyCall")
#using ScikitLearn, PyCall

# Import data

In [2]:
# Number of Julia threads in this session; used further down to size the
# chunks that are processed in parallel when the schema is built.
THREADS = Base.Threads.nthreads()

1

In [3]:
# Directory with the reduced reports; each sample is read from "<sha256>.json"
# inside this directory.
PATH_TO_REDUCED_REPORTS = "../Avast/public_small_reports/"

# CSV with the labels to load. Other label files that were used here before:
#   "../Avast/subset_10.csv"
#   "../Avast/subset_50.csv"
#   "../Avast/public_labels.csv"
PATH_TO_LABELS = "../Avast/subset_100.csv";

In [4]:
# Load the label table. The notebook reads its `sha256`, `classification_family`
# and `date` columns.
df_labels = CSV.read(PATH_TO_LABELS, DataFrame)
targets = df_labels.classification_family  # family label of every row
labels = Set(targets)                      # distinct family names
n_classes = length(labels);

In [13]:
# Load one report per row of `df_labels`, in parallel. Entry `i` of `jsons`
# belongs to `df_labels.sha256[i]`. A report that cannot be opened or parsed is
# logged and stored as `nothing`, so positions stay aligned with `targets`.
jsons = tmap(df_labels.sha256) do s
    try
        x = open(JSON.parse, "$(PATH_TO_REDUCED_REPORTS)$(s).json")
        delete!(x, "static") # Take only the behavioral info
        #delete!(x,"behavior") # Take only the static info
        return x
    catch e
        @error "Error when processing sha $s: $e"
        return nothing
    end
end;

n_samples = length(jsons)
println("N samples: $(n_samples)")
println("N classes: $(n_classes)")

# The length check at the end cannot detect failed loads (a failed report still
# occupies its slot as `nothing`), so surface the number of placeholders here.
n_failed = count(isnothing, jsons)
n_failed == 0 || @warn "$(n_failed) of $(n_samples) reports could not be loaded and are stored as `nothing`"

@assert size(jsons, 1) == length(targets)

N samples: 1000
N classes: 10


In [14]:
# Chronological split: rows dated strictly before `timesplit` form the training
# set, all remaining rows form the test set.
timesplit = Date(2019, 8, 1)
train_indexes = findall(<(timesplit), df_labels.date)

# `setdiff` on the index range keeps the test indices in ascending order and
# avoids building two Sets and splatting the result into an array literal.
test_indexes = setdiff(1:n_samples, train_indexes)

train_size = length(train_indexes)
test_size = length(test_indexes)

println("Train size: $(train_size)")
println("Test size: $(test_size)")

Train size: 817
Test size: 183


# Define schema and model

In [15]:
# Build the schema from the training samples only, one partial schema per chunk
# in parallel, then merge the partial schemas.
#
# The chunk length is rounded up and clamped to at least 1: plain
# `div(train_size, THREADS)` is 0 whenever THREADS > train_size (which makes
# `Iterators.partition` throw) and yields THREADS + 1 chunks when the division
# leaves a remainder.
chunk_size = max(1, cld(train_size, THREADS))
chunks = Iterators.partition(train_indexes, chunk_size)
sch_parts = tmap(chunks) do ch
    JsonGrinder.schema(jsons[ch])
end
time_split_complete_schema = merge(sch_parts...)
printtree(time_split_complete_schema)

[34m[Dict][39m[90m  # updated = 817[39m
[34m  ╰── behavior: [39m[31m[Dict][39m[90m  # updated = 817[39m
[34m                [39m[31m  ╰── summary: [39m[32m[Dict][39m[90m  # updated = 817[39m
[34m                [39m[31m               [39m[32m  ├─────── delete_files: [39m[33m[List][39m[90m  # updated = 817[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39m[Scalar - String], 3342 unique values[90m  # updated = 4046[39m
[34m                [39m[31m               [39m[32m  ├──────── delete_keys: [39m[33m[List][39m[90m  # updated = 817[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39m[Scalar - String], 22 unique values[90m  # updated = 116[39m
[34m                [39m[31m               [39m[32m  ├─────────────── keys: [39m[33m[List][39m[90m  # updated = 817[39m
[34m                [39m[31m               [39m[32m  

In [16]:
# Derive the extractor from the training-only schema and apply it to every
# sample. (Earlier alternative, kept for reference: sch = schema(jsons).)
extractor = suggestextractor(time_split_complete_schema)
data = [extractor(js) for js in jsons];

In [17]:
# Family names in sorted order; the same vector is used later for one-hot
# encoding the targets and for decoding predictions.
labelnames = sort(unique(df_labels.classification_family))

# Output width of the Dense layers created by the first builder below.
neurons = 32

# NOTE(review): per the Mill API the two positional builders presumably create
# the per-node layers and the bag aggregation, and the "" entry of `fsm` the
# final layer with one output per class -- confirm against the Mill docs.
model = reflectinmodel(
    time_split_complete_schema,
    extractor,
    k -> Dense(k, neurons, relu),
    d -> SegmentedMeanMax(d);
    fsm = Dict("" => k -> Dense(k, n_classes)),
)

# Number of training samples drawn per minibatch.
minibatchsize = 50

"""
    minibatch()

Draw `minibatchsize` distinct indices from `train_indexes` and return the tuple
`(x, y)`, where `x` is the corresponding entries of `data` joined with `catobs`
and `y` is the one-hot encoding of their families with respect to `labelnames`.
"""
function minibatch()
    batch_idx = StatsBase.sample(train_indexes, minibatchsize; replace = false)
    batch_x = reduce(catobs, data[batch_idx])
    batch_y = Flux.onehotbatch(df_labels.classification_family[batch_idx], labelnames)
    return batch_x, batch_y
end

# Number of minibatches fed to the optimiser per epoch.
iterations = 200

"""
    calculate_accuracy(x, y)

Return the fraction of elements of `x` for which the label predicted by the
global `model` equals the matching element of `y`. Each element of `x` is
evaluated on its own via `tmap`.
"""
function calculate_accuracy(x, y)
    predicted = tmap(x) do sample
        scores = softmax(model(sample))
        return Flux.onecold(scores, labelnames)[1]
    end
    return mean(predicted .== y)
end
    

# Shuffled copies of both index sets, used by the progress callback.
eval_trainset = shuffle(train_indexes)
eval_testset = shuffle(test_indexes)

# Progress callback: print the current accuracy on the train and test sets.
cb = function ()
    train_acc = calculate_accuracy(
        data[eval_trainset],
        df_labels.classification_family[eval_trainset],
    )
    test_acc = calculate_accuracy(
        data[eval_testset],
        df_labels.classification_family[eval_testset],
    )
    println("accuracy: train = $train_acc, test = $test_acc")
end

# Trainable parameters, loss on raw model outputs (logits), and the optimiser.
ps = Flux.params(model)
loss = (x, y) -> Flux.logitcrossentropy(model(x), y)
opt = ADAM()

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

# Training the model

In [18]:
# Run `epochs` passes; each pass feeds `iterations` freshly sampled minibatches
# to the optimiser and calls the progress callback through a throttle of 2.
epochs = 3
for i in 1:epochs
    println("Epoch $(i)")
    batches = repeatedly(minibatch, iterations)
    throttled_cb = Flux.throttle(cb, 2)
    Flux.Optimise.train!(loss, ps, batches, opt; cb = throttled_cb)
end

Epoch 1
accuracy: train = 0.15299877600979192, test = 0.0273224043715847
accuracy: train = 0.8837209302325582, test = 0.6229508196721312
accuracy: train = 0.9167686658506732, test = 0.6994535519125683
accuracy: train = 0.9290085679314566, test = 0.6830601092896175
accuracy: train = 0.9461444308445532, test = 0.6830601092896175
accuracy: train = 0.966952264381885, test = 0.7049180327868853
accuracy: train = 0.981640146878825, test = 0.73224043715847
accuracy: train = 0.97796817625459, test = 0.7486338797814208
accuracy: train = 0.988984088127295, test = 0.7540983606557377
accuracy: train = 0.9938800489596084, test = 0.7540983606557377
accuracy: train = 0.9938800489596084, test = 0.7540983606557377
accuracy: train = 0.9938800489596084, test = 0.7540983606557377
accuracy: train = 0.9938800489596084, test = 0.7650273224043715
accuracy: train = 0.996328029375765, test = 0.7650273224043715
accuracy: train = 0.9951040391676866, test = 0.7650273224043715
accuracy: train = 0.9951040391676866, t

# Accuracy evaluation

In [19]:
# Accuracy of the trained model on the complete train and test index sets.
full_train_accuracy = calculate_accuracy(
    data[train_indexes],
    df_labels.classification_family[train_indexes],
)
full_test_accuracy = calculate_accuracy(
    data[test_indexes],
    df_labels.classification_family[test_indexes],
)

# NOTE(review): "Accuratcy" is misspelled in the printed labels; left as is so
# the printed output stays identical.
println("Final evaluation:")
println("Accuratcy on train data: $(full_train_accuracy)")
println("Accuratcy on test data: $(full_test_accuracy)")

Final evaluation:
Accuratcy on train data: 1.0
Accuratcy on test data: 0.907103825136612


## Confusion matrix

In [20]:
# Row-normalised confusion matrix on the test set:
# test_predictions[true_label][predicted_label] is the fraction of test samples
# of family `true_label` that the model assigned to `predicted_label`.
# A family without any test sample keeps NaN entries (0.0 / 0).
test_predictions = Dict{eltype(labelnames),Dict{eltype(labelnames),Float64}}()
for true_label in labelnames
    # Test samples whose ground-truth family is `true_label`.
    family_indexes = filter(i -> df_labels.classification_family[i] == true_label, test_indexes)
    predictions = tmap(data[family_indexes]) do s
        Flux.onecold(softmax(model(s)), labelnames)[1]
    end

    # Count the predictions per label, starting from zero for every label.
    current_predictions = Dict{eltype(labelnames),Float64}(pl => 0.0 for pl in labelnames)
    for pl in predictions
        current_predictions[pl] += 1.0
    end

    # Turn the counts into fractions of this family's test samples.
    n_predictions = length(predictions)
    for pl in labelnames
        current_predictions[pl] /= n_predictions
    end

    test_predictions[true_label] = current_predictions
end

# Header row: corner label (TL = true label, PL = predicted label) followed by
# one right-aligned column per predicted label.
@printf("%8s\t", "TL\\PL")
for s in labelnames
    @printf(" %8s", s)
end
println()

# One row per true label; every cell is the stored fraction as a percentage.
for tl in labelnames
    @printf("%8s\t", tl)
    row = test_predictions[tl]
    for pl in labelnames
        @printf("%9s", @sprintf("%.2f", row[pl] * 100))
    end
    println()
end

   TL\PL	   Adload   Emotet   HarHar  Lokibot   Qakbot   Swisyn Trickbot   Ursnif     Zeus    njRAT
  Adload	   100.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00
  Emotet	     7.14    85.71     0.00     0.00     0.00     0.00     3.57     0.00     3.57     0.00
  HarHar	     0.00     0.00   100.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00
 Lokibot	     0.00     0.00     0.00    88.89     0.00     0.00     0.00     0.00    11.11     0.00
  Qakbot	     0.00     0.00     0.00     0.00   100.00     0.00     0.00     0.00     0.00     0.00
  Swisyn	     0.00     0.00     0.00     2.78     0.00    97.22     0.00     0.00     0.00     0.00
Trickbot	     0.00     0.00     0.00     4.00     0.00     0.00    96.00     0.00     0.00     0.00
  Ursnif	     2.50     0.00     0.00     7.50     0.00     0.00     7.50    77.50     5.00     0.00
    Zeus	     0.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00   100.00     0.00
