# Import libraries

In [1]:
using Pkg

using Flux, MLDataPattern, Mill, JsonGrinder, JSON, Statistics, IterTools, StatsBase, ThreadTools
using JsonGrinder: suggestextractor, ExtractDict
using Mill: reflectinmodel
using CSV, DataFrames
using Random
using Dates
using Plots
using Printf

#using Zygote, MLDataPattern

#ENV["PYTHON"]="C:\\Users\\aleca\\anaconda3\\python.exe"
#Pkg.build("PyCall")
#using ScikitLearn, PyCall

# Read data

In [4]:
# Number of Julia threads available to this session; used further down to decide how
# many chunks the training reports are split into when the schema is built.
THREADS = Threads.nthreads() 

1

In [5]:
# Directory holding one `<sha256>.json` report per sample. The trailing slash matters:
# the file name is appended by plain string interpolation when the reports are read.
PATH_TO_REDUCED_REPORTS = "C:/Users/aleca/PycharmProjects/MasterThesis/Avast/public_small_reports/"
# CSV with the sample labels. Exactly one PATH_TO_LABELS line should be active; the
# commented alternatives point at other label files.
PATH_TO_LABELS = "C:/Users/aleca/PycharmProjects/MasterThesis/Avast/subset_10.csv" ;
#PATH_TO_LABELS = "C:/Users/aleca/PycharmProjects/MasterThesis/Avast/subset_50.csv" ;
#PATH_TO_LABELS = "C:/Users/aleca/PycharmProjects/MasterThesis/Avast/subset_100.csv" ;
#PATH_TO_LABELS = "C:/Users/aleca/PycharmProjects/MasterThesis/Avast/public_labels.csv" ;#

In [6]:
# Load the label table. The `classification_family` column is the prediction target;
# `labels` is the set of distinct families and `n_classes` its size.
df_labels = CSV.read(PATH_TO_LABELS, DataFrame);
targets = df_labels.classification_family;
labels = Set(targets);
n_classes = length(labels);

In [7]:
# Parse the JSON report of every sample listed in the label table (one per sha256),
# using ThreadTools' `tmap`. A report that fails to load is logged and leaves `nothing`
# in its slot, so `jsons` always has one entry per label row.
jsons = tmap(df_labels.sha256) do sha
    try
        report = open(JSON.parse, string(PATH_TO_REDUCED_REPORTS, sha, ".json"))
        #delete!(report,"static") # Take only the behavioral info
        delete!(report, "behavior") # Take only the static info
    catch err
        @error "Error when processing sha $sha: $err"
    end
end ;

# Report the dataset dimensions.
n_samples = length(jsons)
println("N samples: ", n_samples)
println("N classes: ", n_classes)

# Sanity check: there must be exactly one parsed report per label row.
@assert length(jsons) == length(targets)

N samples: 100
N classes: 10


In [8]:
# Time-based split: samples dated strictly before `timesplit` form the training set,
# every other sample goes to the test set.
timesplit = Date(2019, 8, 1)
train_indexes = findall(<(timesplit), df_labels.date)
# `setdiff` on the index range keeps ascending order and always returns a `Vector{Int}`.
# Splatting a `Set` into an array literal gave an arbitrary order and an untyped `Any[]`
# when the test split was empty.
test_indexes = setdiff(1:n_samples, train_indexes);

train_size = length(train_indexes)
test_size = length(test_indexes)

println("Train size: $(train_size)")
println("Test size: $(test_size)")

Train size: 76
Test size: 24


# Define schema and model

In [11]:
# Build the schema from the training reports only, so that no test data is used.
# The training indices are split into at most THREADS chunks; a partial schema is
# computed per chunk with `tmap` and the partial schemas are merged into one.
# `cld` (ceiling division) keeps the number of chunks <= THREADS, and `max(1, ...)`
# guards against a chunk size of 0, which `Iterators.partition` rejects, when there
# are fewer training samples than threads.
chunks = Iterators.partition(train_indexes, max(1, cld(train_size, THREADS)))
sch_parts = tmap(chunks) do ch
    JsonGrinder.schema(jsons[ch])
end
time_split_complete_schema = merge(sch_parts...)
printtree(time_split_complete_schema)

[34m[Dict][39m[90m  # updated = 76[39m
[34m  ╰── static: [39m[31m[Dict][39m[90m  # updated = 75[39m
[34m              [39m[31m  ╰── pe: [39m[32m[Dict][39m[90m  # updated = 75[39m
[34m              [39m[31m          [39m[32m  ├─── reported_checksum: [39m[39m[Scalar - String], 37 unique values[90m  # updated = 75[39m
[34m              [39m[31m          [39m[32m  ├───────── versioninfo: [39m[33m[Empty List][39m[90m  # updated = 75[39m
[34m              [39m[31m          [39m[32m  │                       [39m[33m  ╰── [39m[39mNothing
[34m              [39m[31m          [39m[32m  ├───────────── exports: [39m[33m[List][39m[90m  # updated = 75[39m
[34m              [39m[31m          [39m[32m  │                       [39m[33m  ╰── [39m[36m[Dict][39m[90m  # updated = 2[39m
[34m              [39m[31m          [39m[32m  │                       [39m[33m      [39m[36m  ├── address: [39m[39m[Scalar - String], 2 unique val

In [8]:
#sch=schema(jsons);
# Derive an extractor from the training-split schema and run it over every report
# (training and test alike), giving one extracted sample per entry of `jsons`.
extractor = suggestextractor(time_split_complete_schema);
data = [extractor(report) for report in jsons];

└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\dict.jl:61
└ @ JsonGrinder C:\Users\aleca\.julia\packages\JsonGrinder\HGYbL\src\schema\dict.jl:61


In [9]:
#model= reflectinmodel(data[1],
#    d -> Dense(d,32,relu),
#    SegmentedMeanMax,
#    fsm=Dict("" => d -> Chain(Dense(d,32,relu),Dense(32,length(labels)))))
# Class names in a fixed, sorted order; the same vector is used for one-hot encoding
# the targets and for decoding the model predictions.
labelnames = sort(unique(df_labels.classification_family))
# Width of every hidden Dense layer created below.
neurons = 32
# Build a model from the schema and extractor. The first builder creates the Dense
# layers, the second the aggregation layers, and `fsm` overrides the builder for the
# node keyed by "" so that it outputs one score per class.
# NOTE(review): "" presumably addresses the top-level node - confirm against Mill docs.
model = reflectinmodel(
    time_split_complete_schema,
    extractor,
    dim -> Dense(dim, neurons, relu),
    dim -> SegmentedMeanMax(dim);
    fsm = Dict("" => dim -> Dense(dim, n_classes)),
)

# Upper bound on the number of training samples drawn per optimisation step.
minibatchsize = 10

"""
    minibatch()

Draw a random minibatch from the training split.

Returns a tuple `(x, y)`: `x` is the `catobs`-concatenation of the sampled entries of
`data`, and `y` is the one-hot encoding of their `classification_family` with respect
to `labelnames`. Indices are sampled without replacement from `train_indexes`.
"""
function minibatch()
    # Sampling without replacement throws when more items are requested than exist,
    # so never ask for more than the training split contains.
    batch = min(minibatchsize, length(train_indexes))
    idx = StatsBase.sample(train_indexes, batch; replace = false)
    x = reduce(catobs, data[idx])
    y = Flux.onehotbatch(df_labels.classification_family[idx], labelnames)
    return x, y
end

# Number of minibatches (optimisation steps) to train for.
iterations = 200

"""
    calculate_accuracy(x, y)

Return the fraction of samples in `x` whose predicted label equals the corresponding
entry of `y`.

Each element of `x` is passed on its own through the global `model` (via ThreadTools'
`tmap`), and the highest-scoring class is decoded to a name from `labelnames`.
`x` and `y` must have the same length.
"""
function calculate_accuracy(x, y)
    predicted = tmap(x) do sample
        # `onecold` picks the arg-max, which softmax does not change, so the raw
        # model output is decoded directly.
        first(Flux.onecold(model(sample), labelnames))
    end
    return mean(predicted .== y)
end
    

# Shuffled copies of the split indices, used for the periodic accuracy report.
eval_trainset = shuffle(train_indexes)
eval_testset = shuffle(test_indexes)

# Training callback: print the current accuracy on the train and test evaluation sets.
cb = function ()
    family = df_labels.classification_family
    acc_train = calculate_accuracy(data[eval_trainset], family[eval_trainset])
    acc_test = calculate_accuracy(data[eval_testset], family[eval_testset])
    println("accuracy: train = ", acc_train, ", test = ", acc_test)
end
# Trainable parameters of the model, collected for implicit-parameter training.
ps = Flux.params(model)
# The model emits raw class scores (logits), hence `logitcrossentropy`.
loss = (x, y) -> Flux.logitcrossentropy(model(x), y)
# `Adam` is the current name of the optimiser; `ADAM` is only a deprecated alias.
opt = Adam()

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [None]:
# Train for `iterations` steps, drawing a fresh minibatch lazily for each step; the
# accuracy callback is throttled to run at most once every 2 seconds.
Flux.Optimise.train!(
    loss,
    ps,
    (minibatch() for _ in 1:iterations),
    opt;
    cb = Flux.throttle(cb, 2),
)

accuracy: train = 0.10893512851897184, test = 0.0273224043715847
accuracy: train = 0.7270501835985312, test = 0.453551912568306


In [None]:
# Final evaluation over the complete train and test splits.
full_train_accuracy = calculate_accuracy(data[train_indexes], df_labels.classification_family[train_indexes])
full_test_accuracy = calculate_accuracy(data[test_indexes], df_labels.classification_family[test_indexes])
println("Final evaluation:")
# Fixed the "Accuratcy" typo in the printed messages.
println("Accuracy on train data: $(full_train_accuracy)")
println("Accuracy on test data: $(full_test_accuracy)")

# Classification