In [1]:
using CSV
using DataFrames
using Tables
using Statistics
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split
@sk_import preprocessing: OneHotEncoder
@sk_import preprocessing: LabelEncoder
@sk_import naive_bayes: CategoricalNB
@sk_import naive_bayes: GaussianNB
@sk_import naive_bayes: MultinomialNB
@sk_import tree: DecisionTreeClassifier
@sk_import svm: SVC
@sk_import svm: LinearSVC
@sk_import neural_network: MLPClassifier
@sk_import metrics: accuracy_score

PyObject <function accuracy_score at 0x7f9a72f770d0>

# Cars

## Preprocessing

In [2]:
cars = CSV.File("data/cars/car.data", 
        header=["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]) |> DataFrame
cars_X = convert(Array, cars[:, Not(7)])
cars_y = convert(Array, cars[:, 7])

1728-element Array{String,1}:
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 "unacc"
 ⋮
 "unacc"
 "good"
 "vgood"
 "unacc"
 "acc"
 "good"
 "unacc"
 "good"
 "vgood"
 "unacc"
 "good"
 "vgood"

In [3]:
X_train, X_test, y_train, y_test = train_test_split(cars_X, cars_y, test_size=0.3, random_state=5)

4-element Array{Array{String,N} where N,1}:
 ["vhigh" "low" … "small" "low"; "high" "vhigh" … "small" "low"; … ; "vhigh" "high" … "big" "high"; "med" "vhigh" … "med" "low"]
 ["high" "low" … "med" "med"; "vhigh" "vhigh" … "med" "high"; … ; "vhigh" "vhigh" … "small" "high"; "high" "high" … "big" "med"]
 ["unacc", "unacc", "good", "unacc", "unacc", "unacc", "acc", "acc", "acc", "unacc"  …  "unacc", "acc", "unacc", "unacc", "unacc", "unacc", "acc", "unacc", "unacc", "unacc"]
 ["acc", "unacc", "acc", "acc", "acc", "unacc", "unacc", "unacc", "vgood", "unacc"  …  "vgood", "acc", "acc", "unacc", "unacc", "unacc", "unacc", "acc", "unacc", "acc"]

In [4]:
# encode the categorical features 
ohc = OneHotEncoder()
X_train_dummies = ohc.fit_transform(X_train).toarray()
X_test_dummies = ohc.transform(X_test).toarray()

# encode the y's 
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

519-element Array{Int64,1}:
 0
 2
 0
 0
 0
 2
 2
 2
 3
 2
 2
 0
 2
 ⋮
 2
 2
 3
 0
 0
 2
 2
 2
 2
 0
 2
 0

## Model Creation

### Naive Bayesian Classifier

In [5]:
@time begin
    nb = fit!(CategoricalNB(1), X_train_dummies, y_train_encoded)
    nb_pred = predict(nb, X_test_dummies)
    accuracy_score(nb_pred, y_test_encoded)
end

  0.151088 seconds (444.63 k allocations: 22.625 MiB)


0.8709055876685935

### Decision Tree

In [6]:
@time begin 
    # use cross validation to find optimal max depth 
    scores = []
    for i in 1:30
        tree = DecisionTreeClassifier(max_depth=i, random_state=5)
        result = ScikitLearn.CrossValidation.cross_val_score(tree, X_train_dummies, y_train_encoded, cv=5)
        append!(scores, mean(result))
    end

    # fit the best model 
    best_depth = argmax(scores)
    tree = fit!(DecisionTreeClassifier(max_depth=best_depth, random_state=5),
                X_train_dummies,
                y_train_encoded)

    # results
    tree_pred = predict(tree, X_test_dummies)
    accuracy_score(tree_pred, y_test_encoded)
end

  1.549962 seconds (6.16 M allocations: 369.776 MiB, 5.36% gc time)


0.9479768786127167

### SVM

In [7]:
@time begin
    # grid of parameters to be tried
    param_grid = Dict("kernel" => ["linear", "poly", "rbf", "sigmoid"],
                      "C" => [0.01, 0.1, 1, 10, 100])

    # cross-validated grid search 
    svc = SVC()
    grid_cv = ScikitLearn.GridSearch.GridSearchCV(svc, param_grid)
    fit!(grid_cv, X_train_dummies, y_train_encoded)

    # results 
    svc_pred = predict(grid_cv, X_test_dummies)
    accuracy_score(svc_pred, y_test_encoded) 
end

  2.834458 seconds (4.58 M allocations: 252.049 MiB, 1.40% gc time)


0.9980732177263969

### MLP

In [8]:
@time begin
    activations = ["identity", "logistic", "tanh", "relu"]
    sizes = [30, 100, 500, 1000]
    scores = []
    for a in activations
        for s in sizes
            mlp = MLPClassifier(hidden_layer_sizes=(s), activation=a)
            fit!(mlp, X_train_dummies, y_train_encoded)
            mlp_pred = predict(mlp, X_test_dummies)
            append!(scores, accuracy_score(mlp_pred, y_test_encoded))
            end
    end
    maximum(scores)
end

 21.967077 seconds (303.43 k allocations: 16.206 MiB)




0.9865125240847784

# Abalone

## Preprocessing

In [9]:
# read in data, include column names
abalone = CSV.File("data/abalone/abalone.data", 
                   header=["Sex", "Length", "Diameter", "Height", "Whole weight",
                           "Shucked weight", "Viscera weight", "Shell weight", "Rings"]) |> DataFrame

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,M,0.455,0.365,0.095,0.514,0.2245,0.101
2,M,0.35,0.265,0.09,0.2255,0.0995,0.0485
3,F,0.53,0.42,0.135,0.677,0.2565,0.1415
4,M,0.44,0.365,0.125,0.516,0.2155,0.114
5,I,0.33,0.255,0.08,0.205,0.0895,0.0395
6,I,0.425,0.3,0.095,0.3515,0.141,0.0775
7,F,0.53,0.415,0.15,0.7775,0.237,0.1415
8,F,0.545,0.425,0.125,0.768,0.294,0.1495
9,M,0.475,0.37,0.125,0.5095,0.2165,0.1125
10,F,0.55,0.44,0.15,0.8945,0.3145,0.151


In [10]:
# Bins used in "   David Clark, Zoltan Schreter, Anthony Adams "A Quantitative Comparison of
#    Dystal and Backpropagation", submitted to the Australian Conference on
#    Neural Networks (ACNN'96). Data set treated as a 3-category classification
#    problem (grouping ring classes 1-8, 9 and 10, and 11 on)
abalone[:, :ring_bin] = abalone[:, :Rings]
abalone[abalone[:, :Rings] .< 9, :ring_bin] .= 1
abalone[abalone[:, :Rings] .== 9, :ring_bin] .= 2
abalone[abalone[:, :Rings] .== 10, :ring_bin] .= 2
abalone[abalone[:, :Rings] .> 10, :ring_bin] .= 3

1447-element view(::Array{Int64,1}, [1, 7, 8, 10, 11, 13, 16, 21, 23, 26  …  4143, 4144, 4145, 4146, 4148, 4149, 4161, 4162, 4173, 4177]) with eltype Int64:
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 ⋮
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3
 3

In [11]:
# separate X and Y
abalone_X = convert(Array, abalone[:, 1:8])
abalone_y = convert(Array, abalone[:, 10])

4177-element Array{Int64,1}:
 3
 1
 2
 2
 1
 1
 3
 3
 2
 3
 3
 2
 3
 ⋮
 1
 2
 2
 1
 2
 2
 1
 3
 2
 2
 2
 3

In [12]:
X_train, X_test, y_train, y_test = train_test_split(abalone_X, abalone_y, test_size=0.3, random_state=5)

# Encode the only categorical column 
ohc = OneHotEncoder()
sex_ohc_train = ohc.fit_transform(reshape(X_train[:, 1], :, 1)).toarray()
sex_ohc_test = ohc.transform(reshape(X_test[:, 1], :, 1)).toarray()

# combine encoded data with the original data
X_train_transformed = convert(Array{Float64, 2}, hcat(sex_ohc_train, X_train[:, Not(1)]))
X_test_transformed = convert(Array{Float64, 2}, hcat(sex_ohc_test, X_test[:, Not(1)]))

1254×10 Array{Float64,2}:
 0.0  0.0  1.0  0.43   0.33   0.095  0.34    0.1315  0.085   0.112
 1.0  0.0  0.0  0.485  0.375  0.135  0.556   0.1925  0.1315  0.1685
 1.0  0.0  0.0  0.71   0.55   0.17   1.614   0.743   0.345   0.45
 1.0  0.0  0.0  0.52   0.4    0.12   0.6515  0.261   0.2015  0.165
 1.0  0.0  0.0  0.685  0.54   0.215  1.7025  0.664   0.3655  0.4735
 1.0  0.0  0.0  0.65   0.545  0.185  1.5055  0.6565  0.341   0.43
 0.0  1.0  0.0  0.455  0.345  0.11   0.434   0.207   0.0855  0.1215
 0.0  0.0  1.0  0.565  0.44   0.175  0.9025  0.31    0.193   0.325
 0.0  1.0  0.0  0.36   0.28   0.105  0.199   0.0695  0.045   0.08
 0.0  0.0  1.0  0.535  0.42   0.15   0.6995  0.2575  0.153   0.24
 1.0  0.0  0.0  0.5    0.405  0.14   0.6735  0.265   0.124   0.25
 0.0  0.0  1.0  0.58   0.455  0.135  0.7955  0.405   0.167   0.204
 0.0  0.0  1.0  0.64   0.515  0.08   1.042   0.515   0.1755  0.175
 ⋮                            ⋮                              
 0.0  0.0  1.0  0.425  0.33   0.08   0.361 

### Naive Bayesian Classifier 

In [13]:
@time begin
    gauss_nb = fit!(GaussianNB(), X_train_transformed, y_train)
    gauss_nb_pred = predict(gauss_nb, X_test_transformed)
    accuracy_score(y_test, gauss_nb_pred)
end

  0.001392 seconds (92 allocations: 14.734 KiB)


0.5717703349282297

In [14]:
@time begin
    mult_nb = fit!(MultinomialNB(), X_train_transformed, y_train)
    mult_nb_pred = predict(mult_nb, X_test_transformed)
    accuracy_score(y_test, mult_nb_pred)
end

  0.001365 seconds (92 allocations: 14.734 KiB)


0.5382775119617225

### Decision Tree

In [15]:
# CV to find best depth, report results 
@time begin
    scores = []
    for i in 1:30
        tree = DecisionTreeClassifier(max_depth=i, random_state=5)
        result = ScikitLearn.CrossValidation.cross_val_score(tree, X_train_transformed, y_train, cv=5)
        append!(scores, mean(result))
    end
    
    best_depth = argmax(scores)
    tree = fit!(DecisionTreeClassifier(max_depth=best_depth, random_state=5),
                X_train_transformed,
                y_train)
    tree_pred = predict(tree, X_test_transformed)
    accuracy_score(y_test, tree_pred)
end

  1.248002 seconds (190.43 k allocations: 120.662 MiB, 1.49% gc time)


0.6140350877192983

### SVM

In [16]:
# CV to perform grid search on params, report results 
# FutureWarning is due to the Sklean Julia implementation being outdated

@time begin
    param_grid = Dict("kernel" => ["linear", "poly", "rbf", "sigmoid"],
                      "C" => [0.01, 0.1, 1, 10, 100])
    svc = SVC(random_state=5)
    grid_cv = ScikitLearn.GridSearch.GridSearchCV(svc, param_grid, cv=5)
    fit!(grid_cv, X_train_transformed, y_train)
    svc_pred = predict(grid_cv, X_test_transformed)
    accuracy_score(y_test, svc_pred)
end



 24.665251 seconds (81.74 k allocations: 31.604 MiB, 0.03% gc time)


0.6562998405103668

### MLP

In [17]:
@time begin
    activations = ["identity", "logistic", "tanh", "relu"]
    sizes = [30, 100, 500, 1000]
    scores = []
    for a in activations
        for s in sizes
            mlp = MLPClassifier(hidden_layer_sizes=(s), activation=a)
            fit!(mlp, X_train_transformed, y_train)
            mlp_pred = predict(mlp, X_test_transformed)
            append!(scores, accuracy_score(mlp_pred, y_test))
            end
    end
    maximum(scores)
end

 35.575634 seconds (2.08 k allocations: 265.531 KiB)


0.6602870813397129

## MADELON

### Preprocessing

In [18]:
# No encoding or splitting needed, just read in required files
madelon_X_train = CSV.File("data/madelon/madelon_train.data", header=0, delim=' ')|> DataFrame
madelon_X_test = CSV.File("data/madelon/madelon_valid.data", header=0, delim=' ')|> DataFrame
madelon_y_train = CSV.File("data/madelon/madelon_train.labels", header=0, delim=' ')|> DataFrame
madelon_y_test = CSV.File("data/madelon/madelon_valid.labels", header=0, delim=' ')|> DataFrame

madelon_X_train = convert(Array, madelon_X_train[:, 1:500])
madelon_X_test = convert(Array, madelon_X_test[:, 1:500])
madelon_y_train = convert(Array, madelon_y_train)[:, 1]
madelon_y_test = convert(Array, madelon_y_test)[:, 1]

600-element Array{Int64,1}:
 -1
 -1
 -1
  1
 -1
  1
 -1
 -1
 -1
  1
  1
  1
 -1
  ⋮
  1
 -1
 -1
  1
 -1
 -1
  1
  1
  1
  1
  1
 -1

## Model Creation 

### Naive Bayesian Classifier

In [19]:
@time begin
    gauss_nb = fit!(GaussianNB(), madelon_X_train, madelon_y_train)
    gauss_nb_pred = predict(gauss_nb, madelon_X_test)
    println(accuracy_score(madelon_y_test, gauss_nb_pred))
end

0.5916666666666667
  0.159406 seconds (1.13 M allocations: 54.775 MiB, 6.90% gc time)


In [20]:
using Random: shuffle

In [21]:
@time begin
    for i in 1:100 
        feature_idx = shuffle(1:500)[1:100]
        gauss_nb = fit!(GaussianNB(), madelon_X_train[:, feature_idx], madelon_y_train)
        gauss_nb_pred = predict(gauss_nb, madelon_X_test[:, feature_idx])
        println(accuracy_score(madelon_y_test, gauss_nb_pred))
    end
end

0.615
0.575
0.5733333333333334
0.5433333333333333
0.565
0.56
0.5233333333333333
0.5316666666666666
0.585
0.5733333333333334
0.59
0.55
0.575
0.525
0.5416666666666666
0.545
0.5783333333333334
0.5316666666666666
0.5883333333333334
0.5283333333333333
0.555
0.61
0.48333333333333334
0.57
0.6033333333333334
0.585
0.5633333333333334
0.5083333333333333
0.5266666666666666
0.5716666666666667
0.58
0.585
0.5916666666666667
0.5466666666666666
0.5633333333333334
0.56
0.5
0.5666666666666667
0.5333333333333333
0.5933333333333334
0.55
0.5966666666666667
0.5866666666666667
0.545
0.585
0.5616666666666666
0.585
0.58
0.5616666666666666
0.5583333333333333
0.5766666666666667
0.5983333333333334
0.5966666666666667
0.57
0.575
0.5533333333333333
0.575
0.5866666666666667
0.5816666666666667
0.5533333333333333
0.5466666666666666
0.565
0.5516666666666666
0.5983333333333334
0.5166666666666667
0.56
0.5466666666666666
0.585
0.5266666666666666
0.6066666666666667
0.5816666666666667
0.5433333333333333
0.56
0.58833333333333

### Decision Tree

In [22]:
# CV to find best depth, report results
@time begin
    scores = []
    for i in 1:30
        tree = DecisionTreeClassifier(max_depth=i, random_state=5)
        result = ScikitLearn.CrossValidation.cross_val_score(tree, madelon_X_train, madelon_y_train, cv=5)
        append!(scores, mean(result))
    end
    best_depth = argmax(scores)
    tree = fit!(DecisionTreeClassifier(max_depth=best_depth, random_state=5),
                madelon_X_train,
                madelon_y_train)
    tree_pred = predict(tree, madelon_X_test)
    accuracy_score(madelon_y_test, tree_pred)
end

 36.916296 seconds (696.85 k allocations: 1.200 GiB, 0.35% gc time)


0.795

### SVM

In [23]:
# training time too long for CV and grid search
# choose radial kernel to allow flexibility 
@time begin
    C = [0.01, 0.1, 1, 10, 100]
    scores = []
    for c in C 
        svc = SVC(random_state=5, kernel="rbf", C=c)
        fit!(svc, madelon_X_train, madelon_y_train)
        svc_pred = predict(svc, madelon_X_test)
        append!(scores, accuracy_score(madelon_y_test, svc_pred))
    end

end
maximum(scores)

  3.898623 seconds (125.76 k allocations: 6.730 MiB)


0.6966666666666667

### MLP

In [24]:
@time begin
    alphas = [0.0001, 0.001, 0.01, 0.1]
    sizes = [30, 100, 500, 1000]
    scores = []
    for a in alphas
        for s in sizes
            mlp = MLPClassifier(hidden_layer_sizes=(s, s, s), alpha=a)
            fit!(mlp, madelon_X_train, madelon_y_train)
            mlp_pred = predict(mlp, madelon_X_test)
            append!(scores, accuracy_score(madelon_y_test, mlp_pred))
            end
    end
    maximum(scores)
end

216.851011 seconds (288.72 k allocations: 15.502 MiB, 0.00% gc time)


0.6033333333333334

# KDD

## Preprocessing

In [25]:
# read in data
kdd = CSV.File("data/KDD/kddcup.data_10_percent_corrected", header=0)|> DataFrame
kdd_X = convert(Array, kdd[:, Not(42)])
kdd_y = convert(Array, kdd[:, 42])

# separate training and testing 
X_train, X_test, y_train, y_test = train_test_split(kdd_X, kdd_y, test_size=0.3, random_state=5)


4-element Array{Array,1}:
 Any[0 "tcp" … 0.0 0.0; 0 "icmp" … 0.0 0.0; … ; 0 "tcp" … 0.0 0.0; 0 "icmp" … 0.0 0.0]
 Any[0 "icmp" … 0.0 0.0; 0 "udp" … 0.0 0.0; … ; 0 "icmp" … 0.0 0.0; 0 "icmp" … 0.0 0.0]
 ["neptune.", "smurf.", "neptune.", "smurf.", "smurf.", "normal.", "smurf.", "neptune.", "smurf.", "neptune."  …  "smurf.", "normal.", "smurf.", "back.", "neptune.", "smurf.", "neptune.", "normal.", "normal.", "smurf."]
 ["smurf.", "normal.", "neptune.", "smurf.", "smurf.", "smurf.", "neptune.", "neptune.", "normal.", "neptune."  …  "normal.", "smurf.", "smurf.", "smurf.", "neptune.", "smurf.", "smurf.", "neptune.", "smurf.", "smurf."]

In [26]:
# Encode categorical features
ohc = OneHotEncoder()
encoded_cols_train = ohc.fit_transform(X_train[:, 2:4]).toarray()
encoded_cols_test = ohc.transform(X_test[:, 2:4]).toarray()

# combine the data 
X_train_transformed = hcat(encoded_cols_train, X_train[:, Not(2:4)])
X_test_transformed = hcat(encoded_cols_test, X_test[:, Not(2:4)])

148207×118 Array{Any,2}:
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0   1.0   0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0     0.02  0.01  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.08  0.0   0.0  0.0  0.0  1.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0   1.0   0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0   1.0   0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0   1.0   0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.06  0.0   0.0  1.0  1.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.05  0.0   0.0  1.0  1.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.0   0.0   0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.06  0.0   0.0  0.0  0.0  1.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0   1.0   0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0   1.0   0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0     0.01  0.0   0.0  0.0  0.0  0.0  0.0


In [27]:
# encode y's 
le = LabelEncoder()
y_train_labeled = le.fit_transform(y_train)
y_test_labeled = le.transform(y_test)

# binary y's 
y_train_binary = y_train .== "normal."
y_test_binary = y_test .== "normal."

148207-element BitArray{1}:
 0
 1
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 1
 ⋮
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [28]:
# class imbalance in training data 
sum(y_train .== "normal.") / length(y_train)

0.19701631512894216

## Model Creation

### Naive Bayesian Classifier

In [29]:
@time begin
    gauss_nb = fit!(GaussianNB(), X_train_transformed, y_train_labeled)
    gauss_nb_pred = predict(gauss_nb, X_test_transformed)
    println(accuracy_score(y_test_labeled, gauss_nb_pred))
end

0.9506096203283245
 42.770700 seconds (117.62 M allocations: 1.755 GiB, 17.38% gc time)


In [30]:
@time begin
    gauss_nb = fit!(GaussianNB(), X_train_transformed, y_train_binary)
    gauss_nb_pred = predict(gauss_nb, X_test_transformed)
    println(accuracy_score(y_test_binary, gauss_nb_pred))
end

0.9798390089536931
 54.742738 seconds (118.26 M allocations: 1.787 GiB, 20.00% gc time)


### Decision Tree

In [31]:
# Cross validation takes too long
# attempt to find best depth 
@time begin
    scores = []
    for i in 25:30
        tree = fit!(DecisionTreeClassifier(max_depth=i, random_state=5),
                X_train_transformed,
                y_train_binary)
        tree_pred = predict(tree, X_test_transformed)
        result = accuracy_score(y_test_binary, tree_pred)
        append!(scores, result)
    end
    maximum(scores)
end

336.552685 seconds (705.46 M allocations: 10.516 GiB, 12.55% gc time)


0.9997436018541634

### SVM

In [32]:
# Cross validation takes too long
# find best C 
@time begin
    C = [0.01, 0.1, 1, 10, 100]
    scores = []
    for c in C
        svc = LinearSVC(random_state=5, C=c)
        fit!(svc, X_train_transformed, y_train_binary)
        svc_pred = predict(svc, X_test_transformed)
        append!(scores, accuracy_score(y_test_binary, svc_pred))
    end
    maximum(scores)
end

472.174987 seconds (588.03 M allocations: 8.771 GiB, 6.79% gc time)


0.9937519820251405

### MLP

In [33]:
@time begin
    mlp = MLPClassifier()
    fit!(mlp, X_train_transformed, y_train_binary)
    mlp_pred = predict(mlp, X_test_transformed)
    accuracy_score(y_test_binary, mlp_pred)
end

166.629484 seconds (117.58 M allocations: 1.753 GiB, 3.15% gc time)


0.9978745943174074