# Import libraries

In [1]:
using Pkg

using Flux, MLDataPattern, Mill, JsonGrinder, JSON, Statistics, IterTools, StatsBase, ThreadTools
using JsonGrinder: suggestextractor, ExtractDict
using Mill: reflectinmodel
using CSV, DataFrames
using Random
using Dates
using Plots
using Printf

#using Zygote, MLDataPattern

#ENV["PYTHON"]="C:\\Users\\aleca\\anaconda3\\python.exe"
#Pkg.build("PyCall")
#using ScikitLearn, PyCall

# Import data

In [2]:
# Number of Julia threads in this session; used further down to size the
# chunks of the parallel schema computation.
THREADS = Threads.nthreads() 

1

In [3]:
# Directory containing one JSON report per sample; files are opened below as
# "<PATH_TO_REDUCED_REPORTS><sha256>.json".
PATH_TO_REDUCED_REPORTS = "../data/Avast/public_small_reports/"
# CSV file with the sample labels. The commented-out lines are alternative
# label files; keep exactly one assignment active.
#PATH_TO_LABELS = "../data/Avast/subset_10.csv" ;
#PATH_TO_LABELS = "../data/Avast/subset_50.csv" ;
PATH_TO_LABELS = "../data/Avast/subset_100.csv" ;
#PATH_TO_LABELS = "../data/Avast/public_labels.csv" ;#

In [4]:
# Read the label table; `targets` is its `classification_family` column and
# `n_classes` the number of distinct values found in it.
df_labels = CSV.read(PATH_TO_LABELS, DataFrame)
targets = df_labels.classification_family
labels = Set(targets)
n_classes = length(labels);

In [5]:
# Parse, in parallel, the report of every sha256 listed in the label table.
jsons = tmap(df_labels.sha256) do s
    try
        report = open("$(PATH_TO_REDUCED_REPORTS)$(s).json") do io
            JSON.parse(io)
        end
        # Drop the "static" section so only the behavioral info is kept.
        # (To keep only the static info instead, delete "behavior" here.)
        delete!(report, "static")
    catch e
        # A failed load is logged and not rethrown, so processing continues.
        @error "Error when processing sha $s: $e"
    end
end ;

n_samples = length(jsons)
println("N samples: $(n_samples)")
println("N classes: $(n_classes)")

# Internal invariant: exactly one parsed entry per labelled sample.
@assert size(jsons, 1) == length(targets)

N samples: 1000
N classes: 10


In [6]:
# Time-based split: samples dated strictly before `timesplit` form the
# training set, all remaining samples form the test set.
timesplit = Date(2019, 8, 1)
train_indexes = findall(<(timesplit), df_labels.date)
# `setdiff` on the index range returns a Vector in ascending order, so the
# test indexes are deterministic (no Set iteration order, no splatting).
test_indexes = setdiff(1:n_samples, train_indexes);

train_size = length(train_indexes)
test_size = length(test_indexes)

println("Train size: $(train_size)")
println("Test size: $(test_size)")

Train size: 822
Test size: 178


# Define schema and model

In [7]:
# Build the schema from the training samples only: compute a partial schema
# per chunk in parallel, then merge the parts.
# Ceiling division gives at most THREADS chunks of near-equal size, and
# `max(1, ...)` keeps the chunk size valid when there are more threads than
# training samples (a chunk size of 0 makes `Iterators.partition` throw).
chunk_size = max(1, cld(train_size, THREADS))
chunks = Iterators.partition(train_indexes, chunk_size)
sch_parts = tmap(chunks) do ch
    JsonGrinder.schema(jsons[ch])
end
time_split_complete_schema = merge(sch_parts...)
printtree(time_split_complete_schema)

[34m[Dict][39m[90m 	# updated = 822[39m
[34m  ╰── behavior: [39m[31m[Dict][39m[90m 	# updated = 822[39m
[34m                [39m[31m  ╰── summary: [39m[32m[Dict][39m[90m 	# updated = 822[39m
[34m                [39m[31m               [39m[32m  ├─────── delete_files: [39m[33m[List][39m[90m 	# updated = 822[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39m[Scalar - String], 1709 unique values[90m 	# updated = 2442[39m
[34m                [39m[31m               [39m[32m  ├──────── delete_keys: [39m[33m[List][39m[90m 	# updated = 822[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39m[Scalar - String], 10 unique values[90m 	# updated = 89[39m
[34m                [39m[31m               [39m[32m  ├─────────────── keys: [39m[33m[List][39m[90m 	# updated = 822[39m
[34m                [39m[31m               [39m[32m  │

In [9]:
# The extractor is suggested from the training-only schema built above
# (the earlier alternative, `sch=schema(jsons)`, is intentionally disabled).
extractor = suggestextractor(time_split_complete_schema)
# Apply the extractor to every parsed report, train and test alike.
data = [extractor(js) for js in jsons]
printtree(data[1])

└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\u8083\src\schema\dict.jl:61


[34mProductNode[39m[90m 	# 1 obs, 408 bytes[39m
[34m  ╰── behavior: [39m[31mProductNode[39m[90m 	# 1 obs, 408 bytes[39m
[34m                [39m[31m  ╰── summary: [39m[32mProductNode[39m[90m 	# 1 obs, 408 bytes[39m
[34m                [39m[31m               [39m[32m  ├─────── delete_files: [39m[33mBagNode[39m[90m 	# 1 obs, 104 bytes[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39mArrayNode(2053×0 NGramMatrix with Int64 elements)[90m 	# 0 obs, 104 bytes[39m
[34m                [39m[31m               [39m[32m  ├──────── delete_keys: [39m[33mBagNode[39m[90m 	# 1 obs, 80 bytes[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39mArrayNode(11×0 OneHotArray with Bool elements)[90m 	# 0 obs, 56 bytes[39m
[34m                [39m[31m               [39m[32m  ├────────────── files: [39m[33mBagNode[39m[90m 	# 1 obs, 104 bytes[

In [10]:
# Sorted list of distinct labels; it fixes the order of the output classes.
labelnames = sort!(unique(df_labels.classification_family))
# Width of every hidden Dense layer.
neurons = 32
# Model mirroring the schema/extractor structure: a ReLU Dense layer per
# node, mean+max aggregation over bags, and a final Dense layer with one
# output per class attached at the root (key "").
model = reflectinmodel(
    time_split_complete_schema,
    extractor,
    dim -> Dense(dim, neurons, relu),
    dim -> SegmentedMeanMax(dim);
    fsm = Dict("" => dim -> Dense(dim, n_classes)),
)

# Requested number of training samples per minibatch.
minibatchsize = 50

"""
    minibatch()

Draw a random minibatch from the training set.

Samples up to `minibatchsize` distinct entries of `train_indexes` (fewer when
the training set is smaller, since sampling without replacement cannot draw
more elements than exist) and returns a tuple `(x, y)`: `x` is the
concatenation (`catobs`) of the selected entries of `data`, `y` the one-hot
encoding of their labels with respect to `labelnames`.
"""
function minibatch()
    batch = min(minibatchsize, length(train_indexes))
    idx = StatsBase.sample(train_indexes, batch; replace = false)
    x = reduce(catobs, data[idx])
    y = Flux.onehotbatch(df_labels.classification_family[idx], labelnames)
    return x, y
end

# Number of minibatches drawn per call to the training routine.
iterations = 200

"""
    calculate_accuracy(x, y)

Return the fraction of samples in `x` whose predicted label equals the
corresponding entry of `y`.

Each element of `x` is passed on its own through the global `model`
(in parallel via `tmap`); the predicted label is the entry of `labelnames`
with the highest model output. No softmax is applied first: it is monotonic
and therefore does not change which class has the highest score.
"""
function calculate_accuracy(x, y)
    predicted = tmap(x) do s
        Flux.onecold(model(s), labelnames)[1]
    end
    return mean(predicted .== y)
end
    

# Shuffled copies of both partitions, reused by every progress report.
eval_trainset = shuffle(train_indexes)
eval_testset = shuffle(test_indexes)

# Progress callback: print the current accuracy on both evaluation sets.
cb = function ()
    families = df_labels.classification_family
    train_acc = calculate_accuracy(data[eval_trainset], families[eval_trainset])
    test_acc = calculate_accuracy(data[eval_testset], families[eval_testset])
    println("accuracy: train = $train_acc, test = $test_acc")
end

# Trainable parameters, loss computed on the raw model outputs, optimiser.
ps = Flux.params(model)
loss = (x, y) -> Flux.logitcrossentropy(model(x), y)
opt = ADAM()

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [15]:
# Display the structure of the model built above, layer by layer.
printtree(model)

[34mProductModel ↦ Dense(32 => 10)[39m[90m 	# 2 arrays, 330 params, 1.367 KiB[39m
[34m  ╰── behavior: [39m[31mProductModel ↦ identity[39m
[34m                [39m[31m  ╰── summary: [39m[32mProductModel ↦ Dense(384 => 32, relu)[39m[90m 	# 2 arrays, 12_320 params, 48.203 KiB[39m
[34m                [39m[31m               [39m[32m  ├─────── delete_files: [39m[33mBagModel ↦ [SegmentedMean(32); SegmentedMax(32)] ↦ Dense(64 => 32, relu)[39m[90m 	# 4 arrays, 2_144 params, 8.531 KiB[39m
[34m                [39m[31m               [39m[32m  │                      [39m[33m  ╰── [39m[39mArrayModel(Dense(2053 => 32, relu))[90m 	# 2 arrays, 65_728 params, 256.828 KiB[39m
[34m                [39m[31m               [39m[32m  ├──────── delete_keys: [39m[33mBagModel ↦ [SegmentedMean(32); SegmentedMax(32)] ↦ Dense(64 => 32, relu)[39m[90m 	# 4 arrays, 2_144 params, 8.531 KiB[39m
[34m                [39m[31m               [39m[32m  │                      

# Training the model

In [11]:
# Run `epochs` rounds of training; each round draws `iterations` random
# minibatches and reports accuracy through `cb` at most once every 2 seconds.
epochs = 3
for i in 1:epochs
    println("Epoch $(i)")
    batches = repeatedly(minibatch, iterations)
    throttled_cb = Flux.throttle(cb, 2)
    Flux.Optimise.train!(loss, ps, batches, opt; cb = throttled_cb)
end

Epoch 1
accuracy: train = 0.29075425790754256, test = 0.15168539325842698
accuracy: train = 0.8041362530413625, test = 0.48314606741573035
accuracy: train = 0.8917274939172749, test = 0.5561797752808989
accuracy: train = 0.9221411192214112, test = 0.6629213483146067
accuracy: train = 0.9416058394160584, test = 0.6910112359550562
accuracy: train = 0.9671532846715328, test = 0.7865168539325843
accuracy: train = 0.9781021897810219, test = 0.7921348314606742
accuracy: train = 0.9805352798053528, test = 0.7808988764044944
accuracy: train = 0.9829683698296837, test = 0.7752808988764045
accuracy: train = 0.9878345498783455, test = 0.7921348314606742
accuracy: train = 0.9914841849148418, test = 0.7921348314606742
accuracy: train = 0.9939172749391727, test = 0.7865168539325843
accuracy: train = 0.9975669099756691, test = 0.797752808988764
accuracy: train = 0.9975669099756691, test = 0.7865168539325843
accuracy: train = 0.9987834549878345, test = 0.797752808988764
accuracy: train = 0.99878345498

# Accuracy evaluation

In [12]:
# Accuracy of the trained model on the complete train and test partitions.
full_train_accuracy = calculate_accuracy(data[train_indexes], df_labels.classification_family[train_indexes])
full_test_accuracy = calculate_accuracy(data[test_indexes], df_labels.classification_family[test_indexes])
println("Final evaluation:")
println("Accuracy on train data: $(full_train_accuracy)")
println("Accuracy on test data: $(full_test_accuracy)")

Final evaluation:
Accuratcy on train data: 1.0
Accuratcy on test data: 0.7921348314606742


## Confusion matrix

In [13]:
# Row-normalised confusion matrix on the test set:
# test_predictions[true_label][predicted_label] is the fraction of test
# samples labelled `true_label` that the model assigns to `predicted_label`.
# A label with no test samples gets a row of NaN (0/0), i.e. "undefined".
label_type = eltype(labelnames)
test_predictions = Dict{label_type,Dict{label_type,Float64}}()
for true_label in labelnames
    family_indexes = filter(i -> df_labels.classification_family[i] == true_label, test_indexes)
    # Predicted label = class with the highest model output; softmax is
    # monotonic, so it is not needed to find the arg-max.
    predictions = tmap(data[family_indexes]) do s
        Flux.onecold(model(s), labelnames)[1]
    end
    # Count predictions per label, starting from zero for every label.
    current_predictions = Dict{label_type,Float64}(pl => 0.0 for pl in labelnames)
    for pl in predictions
        current_predictions[pl] += 1.0
    end
    # Convert counts to fractions of this label's test samples.
    n_predictions = length(predictions)
    for pl in labelnames
        current_predictions[pl] /= n_predictions
    end
    test_predictions[true_label] = current_predictions
end

# Header row: corner cell, then one column per predicted label (PL).
@printf("%8s\t", "TL\\PL")
for name in labelnames
    @printf(" %8s", name)
end
print("\n")
# One row per true label (TL); each cell is a percentage with two decimals.
for tl in labelnames
    @printf("%8s\t", tl)
    for pl in labelnames
        cell = @sprintf("%.2f", test_predictions[tl][pl] * 100)
        @printf("%9s", cell)
    end
    print("\n")
end

   TL\PL	   Adload   Emotet   HarHar  Lokibot   Qakbot   Swisyn Trickbot   Ursnif     Zeus    njRAT
  Adload	      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN
  Emotet	     0.00    95.00     0.00     0.00     0.00     0.00     5.00     0.00     0.00     0.00
  HarHar	     0.00     0.00   100.00     0.00     0.00     0.00     0.00     0.00     0.00     0.00
 Lokibot	     0.00     0.00     0.00   100.00     0.00     0.00     0.00     0.00     0.00     0.00
  Qakbot	     0.00     0.00     0.00     0.00   100.00     0.00     0.00     0.00     0.00     0.00
  Swisyn	     0.00     0.00     0.00     0.00     0.00   100.00     0.00     0.00     0.00     0.00
Trickbot	     0.00     0.00     0.00     0.00     0.00     0.00    93.33     0.00     3.33     3.33
  Ursnif	     0.00     0.00     0.00     0.00     0.00     0.00    69.70     0.00    30.30     0.00
    Zeus	     0.00    20.00     0.00     0.00     0.00     0.00     0.00     0.00    80.00     0.00
