# File Paths

<span style="color:red">! Important: These are the file paths of the various datasets. The data files are already in the zip file. If you decide to adjust the file structure, you must ensure you change the file paths in the first cell (under heading “File paths”) to correspond with the correct files. If you do not, the Jupyter notebook will not be able to read the data. !</span>

In [31]:
car_data = "./original_data/car-evaluation/car.data";
abalone_data = "./original_data/abalone/abalone.data";
madelon_train_data = "./original_data/madelon/madelon_train.data";
madelon_train_label_data = "./original_data/madelon/madelon_train.labels";
madelon_valid_data = "./original_data/madelon/madelon_valid.data";
madelon_valid_label_data = "./original_data/madelon/madelon_valid.labels";
madelon_test_data = "./original_data/madelon/madelon_test.data";
kdd_data = "./original_data/kddcup99/kddcup.data_10_percent";
kdd_test_data = "./original_data/kddcup99/kddcup.testdata.unlabeled_10_percent";

# Package Installs/Imports

In [1]:
using Pkg
Pkg.add("CSVFiles")
Pkg.add("ScikitLearn")
Pkg.add("StatsBase")
Pkg.add("PyCall")

[32m[1m   Updating[22m[39m registry at `~/.julia/registries/General`
######################################################################### 100.0%
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`


In [1]:
using CSVFiles
using DataFrames
using ScikitLearn
using StatsBase
using PyCall
joblib = pyimport("joblib");

In [2]:
# Scikit-learn imports
@sk_import preprocessing: (OrdinalEncoder, LabelEncoder, MinMaxScaler)
@sk_import model_selection: (cross_val_score, GridSearchCV, RandomizedSearchCV)
@sk_import ensemble: (RandomForestClassifier, BaggingClassifier)
@sk_import svm: LinearSVC
@sk_import decomposition: PCA
@sk_import naive_bayes: GaussianNB
@sk_import neural_network: MLPClassifier
@sk_import multiclass: OneVsRestClassifier

└ @ ScikitLearn.Skcore /Users/ambergfisher/.julia/packages/ScikitLearn/NJwUf/src/Skcore.jl:179


PyObject <class 'sklearn.multiclass.OneVsRestClassifier'>

# Custom Functions

Splitting into training & testing (70-30 split). 
The method takes 70% of the values between 1 and the size of the features set to get the training indices and
uses that to get the training features and labels. The testing indices are whatever is left over (and the training features and labels follow from the indices).

In [4]:
function train_test_split(features, labels)
    n = size(features, 1)
    train_ind = sort(StatsBase.sample(1:n, floor(Int, 0.7 * n), replace=false))
    test_ind = setdiff(1:n, train_ind)

    train_features = features[train_ind, :]
    test_features = features[test_ind, :]
    
    train_labels = labels[train_ind]
    test_labels = labels[test_ind]
    
    return train_features, train_labels, test_features, test_labels
end

train_test_split (generic function with 1 method)

Manipulate the data as desired. The mapper is required in Julia to map column names to something that Python can handle. The modifier array contains the a list of column keys and the modifier class to use on them. The data type is only used to determine if the data should be converted into integers (used for the OrdinalEncoding and LabelEncoding classes). At the end, a DataFrame of the modified data is returned.

In [5]:
function transform_features(raw_data, modifier_arr, dataType)
    mapper = DataFrameMapper(modifier_arr)
    if dataType == "Int"
        data = floor.(Int, fit_transform!(mapper, raw_data))
    else
        data = fit_transform!(mapper, raw_data)
    end
    return DataFrame(data, propertynames(raw_data))
end

transform_features (generic function with 1 method)

Score the model based on the average of the 5-fold cross validation scores.

In [6]:
function average_cv_score(model, features, labels, crossval=5)
    scores = cross_val_score(model, features, labels, cv=crossval)
    return mean(scores)
end

average_cv_score (generic function with 2 methods)

# Car Data Preprocessing

In [32]:
# Import the data and split into features and labels
car_df = DataFrame(load(File(format"CSV", car_data); header_exists=false, colnames=["buying", "maint", "doors", "persons", "lug_boot", "safety", "value"]));
car_features, car_labels = car_df[:, 1:end-1], car_df[:, end];

In [33]:
# Ordinal encode, then normalize the data. Finally, split the data into testing and training sets.
car_features = transform_features(car_features, [(propertynames(car_features), OrdinalEncoder())], "Int")
car_features = transform_features(car_features, [(propertynames(car_features), MinMaxScaler())], nothing)
car_train_features, car_train_labels, car_test_features, car_test_labels = train_test_split(car_features, car_labels);

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [34]:
# Convert DataFrames to Arrays because Julia is annoying
car_train_features = convert(Array, car_train_features)
car_train_labels = convert(Array, car_train_labels)
car_test_features = convert(Array, car_test_features)
car_test_labels = convert(Array, car_test_labels);

In [35]:
# Convert the labels into binary "acceptable" vs "unacceptable" 
# (acc & unacc are already labeled properly so we just need to change vgood and good to acc)
temp_train = copy(car_train_labels)
temp_test = copy(car_test_labels)
car_train_labels[temp_train.=="vgood"] .= "acc"
car_train_labels[temp_train.=="good"] .= "acc"
car_test_labels[temp_test.=="vgood"] .= "acc"
car_test_labels[temp_test.=="good"] .= "acc";

# Abalone Data Preprocessing

In [36]:
# Import the data and split into features and labels
abalone_df = DataFrame(load(File(format"CSV", abalone_data); header_exists=false, colnames=["sex", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings"]));
abalone_features, abalone_labels = abalone_df[:, 1:end - 1], abalone_df[:, end];

In [37]:
# Label encode, then normalize the data. Finally, split the data into testing and training sets.
abalone_features.sex = LabelEncoder().fit_transform(abalone_features[:, :sex])
abalone_features = transform_features(abalone_features, [(propertynames(abalone_features), MinMaxScaler())], nothing)
abalone_train_features, abalone_train_labels, abalone_test_features, abalone_test_labels = train_test_split(abalone_features, abalone_labels);

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [38]:
# Convert DataFrames to Arrays because Julia is annoying
abalone_train_features = convert(Array{Any}, abalone_train_features)
abalone_train_labels = convert(Array{Any}, abalone_train_labels)
abalone_test_features = convert(Array{Any}, abalone_test_features)
abalone_test_labels = convert(Array{Any}, abalone_test_labels);

In [39]:
# Convert the labels into binary "young" vs "old"
temp_train = copy(abalone_train_labels)
temp_test = copy(abalone_test_labels)
abalone_train_labels[temp_train.<=9] .= "young"
abalone_train_labels[temp_train.>9] .= "old"
abalone_test_labels[temp_test.<=9] .= "young"
abalone_test_labels[temp_test.>9] .= "old";

# Madelon Data Preprocessing

In [40]:
# Import the data and combine the training and validation sets into a single training set
madelon_train_features = DataFrame(load(File(format"CSV", madelon_train_data); spacedelim=true, header_exists=false));
madelon_train_labels = DataFrame(load(File(format"CSV", madelon_train_label_data); header_exists=false, colnames=["Value"]));
madelon_valid_features = DataFrame(load(File(format"CSV", madelon_valid_data); spacedelim=true, header_exists=false));
madelon_valid_labels = DataFrame(load(File(format"CSV", madelon_valid_label_data); header_exists=false, colnames=["Value"]));

madelon_train_features = vcat(madelon_train_features, madelon_valid_features)
madelon_train_labels = vcat(madelon_train_labels, madelon_valid_labels);

In [41]:
# Import the testing features
madelon_test_features = DataFrame(load(File(format"CSV", madelon_test_data); spacedelim=true, header_exists=false));

In [42]:
# Convert DataFrames to Arrays because Julia is annoying
madelon_train_features = convert(Array, madelon_train_features)
madelon_train_labels = convert(Array, madelon_train_labels)
madelon_test_features = convert(Array, madelon_test_features);

# KDD Cup 1999 Data Preprocessing

In [43]:
# Import the training data and add column names. Then split into training features and labels
kdd_df = DataFrame(load(File(format"CSV", kdd_data); header_exists=false));
rename!(kdd_df, [:duration, :protocol_type, :service, :flag, :src_bytes, :dst_bytes, :land, :wrong_fragment, :urgent, :hot, :num_failed_logins, :logged_in, :num_compromised, :root_shell, :su_attempted, :num_root, :num_file_creations, :num_shells, :num_access_files, :num_outbound_cmds, :is_host_login, :is_guest_login, :count, :srv_count, :serror_rate, :srv_serror_rate, :rerror_rate, :srv_rerror_rate, :same_srv_rate, :diff_srv_rate, :srv_diff_host_rate, :dst_host_count, :dst_host_srv_count, :dst_host_same_srv_rate, :dst_host_diff_srv_rate, :dst_host_same_src_port_rate, :dst_host_srv_diff_host_rate, :dst_host_serror_rate, :dst_host_srv_serror_rate, :dst_host_rerror_rate, :dst_host_srv_rerror_rate, :value]);
kdd_train_features, kdd_train_labels = kdd_df[:, 1:end - 1], kdd_df[:, end];

In [44]:
# Import the testing features and add column names
kdd_test_features = DataFrame(load(File(format"CSV", kdd_test_data); header_exists=false))
rename!(kdd_test_features, [:duration, :protocol_type, :service, :flag, :src_bytes, :dst_bytes, :land, :wrong_fragment, :urgent, :hot, :num_failed_logins, :logged_in, :num_compromised, :root_shell, :su_attempted, :num_root, :num_file_creations, :num_shells, :num_access_files, :num_outbound_cmds, :is_host_login, :is_guest_login, :count, :srv_count, :serror_rate, :srv_serror_rate, :rerror_rate, :srv_rerror_rate, :same_srv_rate, :diff_srv_rate, :srv_diff_host_rate, :dst_host_count, :dst_host_srv_count, :dst_host_same_srv_rate, :dst_host_diff_srv_rate, :dst_host_same_src_port_rate, :dst_host_srv_diff_host_rate, :dst_host_serror_rate, :dst_host_srv_serror_rate, :dst_host_rerror_rate, :dst_host_srv_rerror_rate]);

In [45]:
# Ordinal encode all columns with type String and reassign to kdd_train_features with updated column types
string_cols = findall(x -> x == String, eltype.(eachcol(kdd_train_features)))
kdd_train_strings = kdd_train_features[:, string_cols]
kdd_train_features[!, string_cols] = convert.(Int64, transform_features(kdd_train_strings, [(propertynames(kdd_train_strings), OrdinalEncoder())], "Int"))
kdd_test_strings = kdd_test_features[:, string_cols]
kdd_test_features[!, string_cols] = convert.(Int64, transform_features(kdd_test_strings, [(propertynames(kdd_test_strings), OrdinalEncoder())], "Int"));

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [46]:
# Normalize all columns
kdd_train_features = transform_features(kdd_train_features, [(propertynames(kdd_train_features), MinMaxScaler())], nothing)
kdd_test_features = transform_features(kdd_test_features, [(propertynames(kdd_test_features), MinMaxScaler())], nothing);

In [47]:
# Convert DataFrames to Arrays because Julia is annoying
kdd_train_features = convert(Array, kdd_train_features)
kdd_train_labels = convert(Array, kdd_train_labels)
kdd_test_features = convert(Array, kdd_test_features);

In [48]:
# Convert the labels into binary "normal." vs "abnormal."
kdd_train_labels[kdd_train_labels.!="normal."] .= "abnormal.";

# Classification Trees (Scikit-Learn Random Forest)

In [49]:
# Number of features to consider at every split
max_features = ["sqrt", 0.5, 0.8, 0.9, nothing]
# Maximum number of levels in tree
max_depth = [1,5,10,15,nothing]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [true, false]

# Put into a dictionary
random_grid = Dict(["max_features" => max_features, "max_depth" => max_depth, "min_samples_split" => min_samples_split, "min_samples_leaf" => min_samples_leaf, "bootstrap" => bootstrap])

# Create a model and create the randomized search class
car_forest_model = RandomForestClassifier(n_jobs=-1)
car_forest_rand = RandomizedSearchCV(estimator = car_forest_model, param_distributions = random_grid, n_iter = 25, cv = 3, random_state=42, n_jobs = -1)
# Timing how long it takes to do the actual search
@time car_forest_rand.fit(car_train_features, car_train_labels)
println(car_forest_rand.best_params_)

  3.804594 seconds (1.22 k allocations: 19.562 KiB)
Dict{Any,Any}("max_features" => nothing,"max_depth" => nothing,"min_samples_leaf" => 2,"bootstrap" => false,"min_samples_split" => 2)


  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [50]:
# Convert the search's best parameters into a dictionary with Symbol keys
# n_jobs just speeds up the actual fitting process later
best_params = Dict(car_forest_rand.best_params_)
best_params["n_jobs"] = -1
best_params = Dict(Symbol(k) => v for (k, v) in best_params)

# Score the model
car_forest_scores = average_cv_score(RandomForestClassifier(;best_params...), car_train_features, car_train_labels)
println(car_forest_scores)

# Time how long it takes to fit the model
println("Training time:")
@time car_forest_model.fit(car_train_features, car_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time car_forest_model.predict(car_test_features);

0.8487534721031513
Training time:
  0.120501 seconds (1.22 k allocations: 19.562 KiB)
Testing time:
  0.015986 seconds (14 allocations: 800 bytes)


In [51]:
# Number of features to consider at every split
max_features = ["sqrt", 0.5, 0.8, 0.9, nothing]
# Maximum number of levels in tree
max_depth = [1,5,10,15,nothing]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [true, false]

# Put into a dictionary
random_grid = Dict(["max_features" => max_features, "max_depth" => max_depth, "min_samples_split" => min_samples_split, "min_samples_leaf" => min_samples_leaf, "bootstrap" => bootstrap])

# Create a model and create the randomized search class
abalone_forest_model = RandomForestClassifier(n_jobs=-1)
abalone_forest_rand = RandomizedSearchCV(estimator = abalone_forest_model, param_distributions = random_grid, n_iter = 25, cv = 3, random_state=42, n_jobs = -1)
# Timing how long it takes to do the actual search
@time abalone_forest_rand.fit(abalone_train_features, abalone_train_labels)
print(abalone_forest_rand.best_params_)

  5.640548 seconds (58.47 k allocations: 913.859 KiB)
Dict{Any,Any}("max_features" => "sqrt","max_depth" => 15,"min_samples_leaf" => 2,"bootstrap" => true,"min_samples_split" => 10)

In [52]:
# Convert the search's best parameters into a dictionary with Symbol keys
# n_jobs just speeds up the actual fitting process later
best_params = Dict(abalone_forest_rand.best_params_)
best_params["n_jobs"] = -1
best_params = Dict(Symbol(k) => v for (k, v) in best_params)

# Score the model
abalone_forest_scores = average_cv_score(RandomForestClassifier(;best_params...), abalone_train_features, abalone_train_labels)
println(abalone_forest_scores)

# Time how long it takes to fit the model
println("Training time:")
@time abalone_forest_model.fit(abalone_train_features, abalone_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time abalone_forest_model.predict(abalone_test_features);

0.7960988174686804
Training time:
  0.143955 seconds (58.47 k allocations: 913.859 KiB)
Testing time:
  0.023975 seconds (22.59 k allocations: 353.250 KiB)


In [53]:
# Number of features to consider at every split
max_features = ["sqrt", 0.5, 0.8, 0.9, nothing]
# Maximum number of levels in tree
max_depth = [1,5,10,15,nothing]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [true, false]

# Put into a dictionary
random_grid = Dict(["max_features" => max_features, "max_depth" => max_depth, "min_samples_split" => min_samples_split, "min_samples_leaf" => min_samples_leaf, "bootstrap" => bootstrap])

# Create a model and create the randomized search class
madelon_forest_model = RandomForestClassifier(n_jobs=-1)
madelon_forest_rand = RandomizedSearchCV(estimator = madelon_forest_model, param_distributions = random_grid, n_iter = 10, cv = 3, random_state=42, n_jobs = -1)
# Timing how long it takes to do the actual search
@time madelon_forest_rand.fit(madelon_train_features, madelon_train_labels)
print(madelon_forest_rand.best_params_)

 98.219515 seconds (17 allocations: 976 bytes)
Dict{Any,Any}("max_features" => 0.5,"max_depth" => nothing,"min_samples_leaf" => 1,"bootstrap" => false,"min_samples_split" => 5)

In [54]:
# Convert the search's best parameters into a dictionary with Symbol keys
# n_jobs just speeds up the actual fitting process later
best_params = Dict(madelon_forest_rand.best_params_)
best_params["n_jobs"] = -1
best_params = Dict(Symbol(k) => v for (k, v) in best_params)

# Score the model
madelon_forest_scores = average_cv_score(RandomForestClassifier(;best_params...), madelon_train_features, madelon_train_labels)
println(madelon_forest_scores)

# Time how long it takes to fit the model
println("Training time:")
@time madelon_forest_model.fit(madelon_train_features, madelon_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time madelon_forest_model.predict(madelon_test_features);

0.8496153846153846
Training time:
  0.468943 seconds (17 allocations: 976 bytes)
Testing time:
  0.021089 seconds (33 allocations: 15.828 KiB)


In [None]:
# Create the model
kdd_forest_model = RandomForestClassifier(n_jobs=-1)

# Score the model
kdd_forest_scores = average_cv_score(kdd_forest_model, kdd_train_features, kdd_train_labels)
println(kdd_forest_scores)

# Time how long it takes to fit the model
println("Training time:")
@time kdd_forest_model.fit(kdd_train_features, kdd_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time kdd_forest_model.predict(kdd_test_features);

# Support Vector Machines (LinearSVC with Bagging)

In [32]:
# Create the model
car_svc_model = BaggingClassifier(LinearSVC(), n_jobs=-1)

# Score the model
car_svc_scores = average_cv_score(car_svc_model, car_train_features, car_train_labels)
println(car_svc_scores)

# Time how long it takes to fit the model
println("Training time:")
@time car_svc_model.fit(car_train_features, car_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time car_svc_model.predict(car_test_features);

0.6890127224717946
Training time:
  0.034530 seconds (1.22 k allocations: 19.562 KiB)
Testing time:
  0.006674 seconds (14 allocations: 800 bytes)


In [33]:
# Create the model
abalone_svc_model = BaggingClassifier(LinearSVC(), n_jobs=-1)

# Score the model
abalone_svc_scores = average_cv_score(abalone_svc_model, abalone_train_features, abalone_train_labels)
println(abalone_svc_scores)

# Time how long it takes to fit the model
println("Training time:")
@time abalone_svc_model.fit(abalone_train_features, abalone_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time abalone_svc_model.predict(abalone_test_features);

0.7786541388596184
Training time:
  0.217020 seconds (58.47 k allocations: 913.859 KiB)
Testing time:
  0.038418 seconds (22.59 k allocations: 353.250 KiB)


In [34]:
# Create the model
madelon_svc_model = BaggingClassifier(LinearSVC(), n_jobs=-1)

# Score the model
madelon_svc_scores = average_cv_score(madelon_svc_model, madelon_train_features, madelon_train_labels)
println(madelon_svc_scores)

# Time how long it takes to fit the model
println("Training time:")
@time madelon_svc_model.fit(madelon_train_features, madelon_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time madelon_svc_model.predict(madelon_test_features);

0.5496153846153845
Training time:
  5.118226 seconds (17 allocations: 976 bytes)
Testing time:
  0.182036 seconds (33 allocations: 15.828 KiB)


In [35]:
# Create the model
kdd_svc_model = BaggingClassifier(LinearSVC(), n_jobs=-1)

# Score the model
kdd_svc_scores = average_cv_score(kdd_svc_model, kdd_train_features, kdd_train_labels)
println(kdd_svc_scores)

# Time how long it takes to fit the model
println("Training time:")
@time kdd_svc_model.fit(kdd_train_features, kdd_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time kdd_svc_model.predict(kdd_test_features);

0.9881402451616971
Training time:
 21.900456 seconds (494.04 k allocations: 7.539 MiB)
Testing time:
  1.599752 seconds (14 allocations: 800 bytes)


# Naive Bayesian (Gaussian) Classifier

In [36]:
# Create the model
car_nb_model = GaussianNB()

# Score the model
car_nb_scores = average_cv_score(car_nb_model, car_train_features, car_train_labels[:, 1])
println(car_nb_scores)

# Time how long it takes to fit the model
println("Training time:")
@time car_nb_model.fit(car_train_features, car_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time car_nb_model.predict(car_test_features);

0.7485682932684063
Training time:
  0.001121 seconds (1.22 k allocations: 19.562 KiB)
Testing time:
  0.000412 seconds (14 allocations: 800 bytes)


In [37]:
# Create the model
abalone_nb_model = GaussianNB()

# Score the model
abalone_nb_scores = average_cv_score(abalone_nb_model, abalone_train_features, abalone_train_labels[:, 1])
println(abalone_nb_scores)

# Time how long it takes to fit the model
println("Training time:")
@time abalone_nb_model.fit(abalone_train_features, abalone_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time abalone_nb_model.predict(abalone_test_features);

0.7379381805409203
Training time:
  0.012609 seconds (58.47 k allocations: 913.859 KiB)
Testing time:
  0.005100 seconds (22.59 k allocations: 353.250 KiB)


In [38]:
# Create the model
madelon_nb_model = GaussianNB()

# Score the model
madelon_nb_scores = average_cv_score(madelon_nb_model, madelon_train_features, madelon_train_labels[:, 1])
println(madelon_nb_scores)

# Time how long it takes to fit the model
println("Training time:")
@time madelon_nb_model.fit(madelon_train_features, madelon_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time madelon_nb_model.predict(madelon_test_features);

0.5934615384615385
Training time:
  0.031000 seconds (17 allocations: 976 bytes)
Testing time:
  0.012870 seconds (33 allocations: 15.828 KiB)


In [39]:
# Create the model
kdd_nb_model = GaussianNB()

# Score the model
kdd_nb_scores = average_cv_score(kdd_nb_model, kdd_train_features, kdd_train_labels[:, 1])
println(kdd_nb_scores)

# Time how long it takes to fit the model
println("Training time:")
@time kdd_nb_model.fit(kdd_train_features, kdd_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time kdd_nb_model.predict(kdd_test_features);

0.9394580412234858
Training time:
  0.446924 seconds (494.04 k allocations: 7.539 MiB)
Testing time:
  0.112058 seconds (14 allocations: 800 bytes)


# Neural Net (MLP Classifier)

In [40]:
# Create the parameter grid in a dictionary
grid = Dict(["alpha" => exp10.(range(-5, -1, step=1)), "learning_rate" => ["constant", "invscaling", "adaptive"]])

# Create the grid search class
car_neural_search = GridSearchCV(estimator=MLPClassifier(), param_grid=grid, cv=3, n_jobs=-1)

# Time how long it takes to get the best parameters
@time car_neural_search.fit(car_train_features, car_train_labels)
println(car_neural_search.best_params_)

  4.645834 seconds (1.22 k allocations: 19.562 KiB)
Dict{Any,Any}("alpha" => 0.001,"learning_rate" => "invscaling")


In [41]:
# Convert the search's best parameters to a dictionary with Symbol keys
best_params = Dict(Symbol(k) => v for (k, v) in car_neural_search.best_params_)

# Create the model using the best params
car_neural_model = MLPClassifier(;best_params...)

# Score the model
car_neural_scores = average_cv_score(car_neural_model, car_train_features, car_train_labels)
println(car_neural_scores)

# Time how long it takes to fit the model
println("Training time:")
@time car_neural_model.fit(car_train_features, car_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time car_neural_model.predict(car_test_features);

0.8677205857137957
Training time:
  0.508937 seconds (1.22 k allocations: 19.562 KiB)
Testing time:
  0.000750 seconds (14 allocations: 800 bytes)


In [42]:
# Create the parameter grid in a dictionary
grid = Dict(["alpha" => exp10.(range(-5, -1, step=1)), "learning_rate" => ["constant", "invscaling", "adaptive"]])

# Create the grid search class
abalone_neural_search = GridSearchCV(estimator=MLPClassifier(), param_grid=grid, cv=3, n_jobs=-1)

# Time how long it takes to get the best parameters
@time abalone_neural_search.fit(abalone_train_features, abalone_train_labels)
println(abalone_neural_search.best_params_)

  9.685579 seconds (58.47 k allocations: 913.859 KiB)
Dict{Any,Any}("alpha" => 0.001,"learning_rate" => "adaptive")


In [43]:
# Convert the search's best parameters to a dictionary with Symbol keys
best_params = Dict(Symbol(k) => v for (k, v) in abalone_neural_search.best_params_)

# Create the model using the best params
abalone_neural_model = MLPClassifier(;best_params...)

# Score the model
abalone_neural_scores = average_cv_score(abalone_neural_model, abalone_train_features, abalone_train_labels)
println(abalone_neural_scores)

# Time how long it takes to fit the model
println("Training time:")
@time abalone_neural_model.fit(abalone_train_features, abalone_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time abalone_neural_model.predict(abalone_test_features);

0.7793332162510245
Training time:
  0.864102 seconds (58.47 k allocations: 913.859 KiB)
Testing time:
  0.005766 seconds (22.59 k allocations: 353.250 KiB)


In [44]:
# Create the parameter grid in a dictionary
grid = Dict(["alpha" => exp10.(range(-5, -1, step=1)), "learning_rate" => ["constant", "invscaling", "adaptive"]])

# Create the grid search class
madelon_neural_search = GridSearchCV(estimator=MLPClassifier(), param_grid=grid, cv=3, n_jobs=-1)

# Time how long it takes to get the best parameters
@time madelon_neural_search.fit(madelon_train_features, madelon_train_labels)
println(madelon_neural_search.best_params_)

  9.493848 seconds (17 allocations: 976 bytes)
Dict{Any,Any}("alpha" => 9.999999999999999e-5,"learning_rate" => "constant")


In [45]:
# Convert the search's best parameters to a dictionary with Symbol keys
best_params = Dict(Symbol(k) => v for (k, v) in madelon_neural_search.best_params_)

# Create the model using the best params
madelon_neural_model = MLPClassifier(;best_params...)

# Score the model
madelon_neural_scores = average_cv_score(madelon_neural_model, madelon_train_features, madelon_train_labels)
println(madelon_neural_scores)

# Time how long it takes to fit the model
println("Training time:")
@time madelon_neural_model.fit(madelon_train_features, madelon_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time madelon_neural_model.predict(madelon_test_features);

0.5034615384615384
Training time:
  1.171509 seconds (17 allocations: 976 bytes)
Testing time:
  0.011455 seconds (33 allocations: 15.828 KiB)


In [46]:
# Create the parameter grid in a dictionary
grid = Dict(["alpha" => exp10.(range(-5, -1, step=1)), "learning_rate" => ["constant", "invscaling", "adaptive"]])

# Create the grid search class
kdd_neural_search = GridSearchCV(estimator=MLPClassifier(), param_grid=grid, cv=3, n_jobs=-1)

# Time how long it takes to get the best parameters
@time kdd_neural_search.fit(kdd_train_features, kdd_train_labels)
println(kdd_neural_search.best_params_)

278.849733 seconds (494.04 k allocations: 7.539 MiB)
Dict{Any,Any}("alpha" => 1.0e-5,"learning_rate" => "invscaling")


In [47]:
# Convert the search's best parameters to a dictionary with Symbol keys
best_params = Dict(Symbol(k) => v for (k, v) in kdd_neural_search.best_params_)

# Create the model using the best params
kdd_neural_model = MLPClassifier(;best_params...)

# Score the model
kdd_neural_scores = average_cv_score(kdd_neural_model, kdd_train_features, kdd_train_labels)
println(kdd_neural_scores)

# Time how long it takes to fit the model
println("Training time:")
@time kdd_neural_model.fit(kdd_train_features, kdd_train_labels)

# Time how long it takes to predict values
println("Testing time:")
@time kdd_neural_model.predict(kdd_test_features);

0.9925914588649777
Training time:
 42.399160 seconds (494.04 k allocations: 7.539 MiB)
Testing time:
  0.379280 seconds (14 allocations: 800 bytes)


# Finding the model sizes

In [48]:
forest_models = Dict(["car_forest.model" => car_forest_model, "abalone_forest.model" => abalone_forest_model, "madelon_forest.model" => madelon_forest_model, "kdd_forest.model" => kdd_forest_model])

for (filename, model) in forest_models
    joblib.dump(model, filename)
    println(filename * "\t" * string(stat(filename).size) * " bytes")
end

car_forest.model	1703339 bytes
abalone_forest.model	6610940 bytes
kdd_forest.model	5097244 bytes
madelon_forest.model	4101990 bytes


In [49]:
svc_models = Dict(["car_svc.model" => car_svc_model, "abalone_svc.model" => abalone_svc_model, "madelon_svc.model" => madelon_svc_model, "kdd_svc.model" => kdd_svc_model])

for (filename, model) in svc_models
    joblib.dump(model, filename)
    println(filename * "\t" * string(stat(filename).size) * " bytes")
end

madelon_svc.model	85640 bytes
abalone_svc.model	6905 bytes
kdd_svc.model	12221 bytes
car_svc.model	6585 bytes


In [50]:
nb_models = Dict(["car_nb.model" => car_nb_model, "abalone_nb.model" => abalone_nb_model, "madelon_nb.model" => madelon_nb_model, "kdd_nb.model" => kdd_nb_model])

for (filename, model) in nb_models
    joblib.dump(model, filename)
    println(filename * "\t" * string(stat(filename).size) * " bytes")
end

madelon_nb.model	16696 bytes
abalone_nb.model	967 bytes
car_nb.model	903 bytes
kdd_nb.model	2055 bytes


In [51]:
neural_models = Dict(["car_neural.model" => car_neural_model, "abalone_neural.model" => abalone_neural_model, "madelon_neural.model" => madelon_neural_model, "kdd_neural.model" => kdd_neural_model])

for (filename, model) in neural_models
    joblib.dump(model, filename)
    println(filename * "\t" * string(stat(filename).size) * " bytes")
end

abalone_neural.model	39864 bytes
car_neural.model	34321 bytes
kdd_neural.model	143035 bytes
madelon_neural.model	1611979 bytes
