In [1]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from itertools import product

2023-02-21 11:53:19.461608: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class JobIterator():
    """
    JobIterator object used to define the order of an array of experiments. By Andrew H. Fagg,
    modified by Jay Rothenberger
    """

    def __init__(self, params):
        """
        Constructor

        @param params Dictionary of key/list pairs
        """
        self.params = params
        # List of all combinations of parameter values
        self.product = list(dict(zip(params, x)) for x in product(*params.values()))
        # Iterator over the combinations
        self.iter = (dict(zip(params, x)) for x in product(*params.values()))

    def next(self):
        """
        @return The next combination in the list
        """
        return self.iter.next()

    def get_index(self, i):
        """
        Return the ith combination of parameters

        @param i Index into the Cartesian product list
        @return The ith combination of parameters
        """

        return self.product[i]

    def get_njobs(self):
        """
        @return The total number of combinations
        """

        return len(self.product)

    def set_attributes_by_index(self, i, obj):
        """
        For an arbitrary object, set the attributes to match the ith job parameters

        @param i Index into the Cartesian product list
        @param obj Arbitrary object (to be modified)
        @return A string representing the combinations of parameters, and the args object
        """

        # Fetch the ith combination of parameter values
        d = self.get_index(i)
        # Iterate over the parameters
        for k, v in d.items():
            obj[k] = v

        return obj, self.get_param_str(i)

    def get_param_str(self, i):
        """
        Return the string that describes the ith job parameters.
        Useful for generating file names

        @param i Index into the Cartesian product list
        """

        out = 'JI_'
        # Fetch the ith combination of parameter values
        d = self.get_index(i)
        # Iterate over the parameters
        for k, v in d.items():
            out = out + "%s_%s_" % (k, v)

        # Return all but the last character
        return out[:-1]

In [3]:
# load the data from the pickle file provided by Alberto
with open('./giga_df_pickle', 'rb') as fp:
    giga_df = pickle.load(fp)
# how many examples do we have?
print(len(giga_df))

4386


In [4]:
# dropping columns that contain only nan values
#for i in range(len(giga_df)):
#    giga_df[i] = giga_df[i][:29] + giga_df[i][39:]
# how verifying we still have the same number of examples (sanity check)
print("size of raw data (rows)", len(giga_df))
print("features per row:", len(giga_df[0]))

size of raw data (rows) 4386
features per row: 41


In [5]:
# convert the list of tuples to a pandas datframe
df = pd.DataFrame(giga_df)
# drop rows with nan values -  empirically if there are some nans all are nans, so this is sensible
nan_rows = df.loc[df.isna().any(axis=1)]
# how many rows were in the df?
print("number of examples:", len(df))
# how many after we dropped the nan rows?
print("after dropping nan rows", len(df.dropna()))
# drop them
df = df.dropna()
# filepaths dataframe for image training
filepaths = pd.DataFrame(df[29])
# labels dataframe
labels = pd.DataFrame(df[30])
# drop filepath and label columns from features dataframe
df = df.drop([29, 30], axis=1)
df = df.drop([0, 1, 2, 3], axis=1)
# let's take a look at the tabular data
print(df.head())
print(filepaths.head())
print(labels.head())

number of examples: 4386
after dropping nan rows 0
Empty DataFrame
Columns: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
Index: []

[0 rows x 35 columns]
Empty DataFrame
Columns: [29]
Index: []
Empty DataFrame
Columns: [30]
Index: []


In [6]:
from sklearn.preprocessing import OneHotEncoder
# one-hot-encode the string data
def encode_string_tokens(df):
    # columns that were encoded
    redundant_cols = []
    for column in df.columns:
        # if the column is string-valued
        if isinstance(np.unique(df[column])[0], str):
            redundant_cols.append(column)
            enc = OneHotEncoder(sparse=False)
            # one-hot encode it
            encoding = enc.fit_transform(np.array(df[column]).reshape(-1, 1))
            # create a new binary feature for each position in the encoding
            for i in range(len(encoding[0])):
                df[f'encoded - {column}.{i}'] = encoding[:, i]
            # let's see the unique values
            print(np.unique(encoding))
    # drop the redundant columns
    return df.drop(redundant_cols, axis=1)

df = encode_string_tokens(df)
df.head()

enc = OneHotEncoder(sparse=False)
print(type(labels))
labels = pd.DataFrame(enc.fit_transform(np.array(labels[labels.columns[0]]).reshape(-1, 1)))

labels.head()

<class 'pandas.core.frame.DataFrame'>




Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [7]:
# Turn on scikit-learn optimizations with these 2 simple lines:

from sklearnex import patch_sklearn

patch_sklearn()

# Import scikit-learn algorithms after the patch is enabled 

from sklearn.model_selection import train_test_split

# convert data to numpy
X, Y = df.to_numpy(), labels.to_numpy()
# shuffle the data
np.random.seed(42)
np.random.shuffle(X)
np.random.shuffle(Y)
# split the data
x_train, x_val_test, y_train, y_val_test = train_test_split(X, Y, train_size=.7)
x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, train_size=.33)
print(f'train size: {len(y_train)}, val_size: {len(y_val)}, test_size: {len(y_test)}')

train size: 2762, val_size: 390, test_size: 794


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [8]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [10, 20, 50, 100, 150],
    'max_features': ['sqrt', 'log2']
}

ji = JobIterator((decision_tree_params))

dt_combination_scores = [] # tuples of hparams, scores

for i in range(ji.get_njobs()):
    # create a basic decision tree
    tree_classifier = DecisionTreeClassifier(**ji.get_index(i))
    # fit it
    tree_classifier.fit(x_train, y_train)
    # evaluate on validation data
    dt_combination_scores.append((ji.get_index(i), tree_classifier.score(x_val, y_val)))

print(max(dt_combination_scores, key=lambda k: k[-1]))

({'criterion': 'entropy', 'min_samples_leaf': 150, 'max_features': 'log2'}, 0.5820512820512821)


In [9]:
from sklearn.ensemble import RandomForestClassifier

rf_combination_scores = [] # tuples of hparams, scores

for i in range(ji.get_njobs()):
    # create a basic random forest
    forest_classifier = RandomForestClassifier(**ji.get_index(i))
    # fit it
    forest_classifier.fit(x_train, y_train)
    # evaluate it on validation data
    
    rf_combination_scores.append((ji.get_index(i), forest_classifier.score(x_val, y_val)))

print(max(rf_combination_scores, key=lambda k: k[-1]))

({'criterion': 'gini', 'min_samples_leaf': 50, 'max_features': 'sqrt'}, 0.6153846153846154)


In [10]:
import tensorflow as tf
from keras.layers import *

def MLP(layers, n_classes=3, learning_rate=1e-3, activation='relu'):
    inputs = Input((25, ))
    x = inputs
    x = BatchNormalization()(x)
    for units in layers:
        x = Dense(units, activation=activation)(x)
    outputs = Dense(n_classes, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=[inputs], outputs=outputs)
    # you can ignore these next two statements
    opt = tf.keras.optimizers.Nadam(learning_rate=learning_rate,
                                    beta_1=0.9, beta_2=0.999,
                                    epsilon=None, decay=0.99)
    opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)
    # select the correct kind of accuracy
    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=[tf.keras.metrics.CategoricalAccuracy()])
    return model


In [11]:
model_params = {
    'layers': [[], [32], [3, 3, 3], [10, 12], [32, 16, 8]],
    'n_classes': [y_train.shape[-1]],
    'learning_rate': [1e-3, 1e-2],
    'activation': ['elu']
}

ji_nn = JobIterator(model_params)

nn_combination_scores = [] # tuples of hparams, scores

for i in range(ji_nn.get_njobs()):
    print(i, end='\r')
    # create a basic random forest
    model = MLP(**ji_nn.get_index(i))
    # fit it
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_categorical_accuracy', patience=5, restore_best_weights=True)], batch_size=32, verbose=0)
    # evaluate it on validation data
    
    nn_combination_scores.append((model, ji_nn.get_index(i), forest_classifier.score(x_val, y_val)))
print()
print(max(nn_combination_scores, key=lambda k: k[-1]))

max(nn_combination_scores, key=lambda k: k[-1])[0].evaluate(x_test, y_test)


9
(<keras.engine.functional.Functional object at 0x2b4c5bab1270>, {'layers': [], 'n_classes': 3, 'learning_rate': 0.001, 'activation': 'elu'}, 0.6153846153846154)


[27.01494598388672, 0.29848867654800415]