This file was run on a jupyterhub. It needs optim_maxes.csv and IGTD.csv files in the same directory so it runs. 

One thing I think we really need to consider is how I am treating the Nans. They cant all be replaced by 1s or 0s. See more in writeup, but this is a serious struggle for this dataset. 

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np
import pandas as pd
import csv
from random import sample
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ParameterGrid
import os

## Basic Functions

In [3]:
def train_meta_model(sk_algorithm, X, y, valid_datasets, maxes, kfold_num=10):
	accuracies = []
	kf = KFold(n_splits=kfold_num)
	kf.get_n_splits(X)
	for train_index, test_index in tqdm(kf.split(X)):
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = y[train_index], y[test_index] # dont use y_test
		ds_train, ds_test = valid_datasets[train_index], valid_datasets[test_index] # valid_datasets is the list of datasets
		sk_algorithm.fit(X_train, y_train)
		y_pred = sk_algorithm.predict(X_test)
		correct = 0
		for ds, prediction in zip(ds_test, y_pred):
			if prediction in maxes[ds]:
				correct += 1
		accuracies.append(correct/len(ds_test))
	return max(accuracies)

In [6]:
# get maxes
maxes = {}
with open('optim_maxes.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        key = int(row[0])
        values_str = ','.join(row[1:])
        values_str = values_str.strip()
        values = list(map(int, values_str.split(',')))
        maxes[key] = values

In [7]:
meta_dataset_path = 'IGTD.csv'
df = pd.read_csv(meta_dataset_path) 
df.replace(np.nan, 0, inplace=True)
df.replace(np.inf, 0, inplace=True)
X = []
y = []
valid_datasets = []

In [8]:
for i in range(df.shape[0]):
    # check the first item in the ith row of df
    if df.loc[i, 'dataset'] in maxes:
        y.append(sample(maxes[df.loc[i, 'dataset']],k=1)[0])
        X.append(df.iloc[i, 1:].tolist())
        valid_datasets.append(df.loc[i, 'dataset'])
X = np.array(X)
y = np.array(y)
valid_datasets = np.array(valid_datasets)

## Building Basic NN

This is exactly the same NN used as one of the 16 algorithms trained on the 466 datasets. 

In [None]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=100000000)
nn_accuracies = train_meta_model(nn, X, y, valid_datasets, maxes)

In [10]:
# get the accuraccies
print('nn_accuracies: ', nn_accuracies)

nn_accuracies:  0.6363636363636364


## Building a Grid Search 

I didn't make it far with this. Definitely an area for future work. 

In [12]:
# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (100, 50, 25)],
    'activation': ['relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [100, 200, 300]
}

# Generate the parameter combinations
param_combinations = ParameterGrid(param_grid)

# Print the parameter combinations
for params in param_combinations:
    # Instantiate and train MLP model with the current parameters
    mlp = MLPClassifier(**params)
    # Your training code here
    nn_accuracies = train_meta_model(nn, X, y, valid_datasets, maxes)
    print('nn_accuracies: ', nn_accuracies, 'params: ', params, '\n')


10it [00:43,  4.35s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 100, 'solver': 'sgd'} 



10it [00:44,  4.43s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 100, 'solver': 'adam'} 



10it [00:44,  4.40s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 200, 'solver': 'sgd'} 



10it [00:43,  4.33s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 200, 'solver': 'adam'} 



10it [00:44,  4.41s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 300, 'solver': 'sgd'} 



10it [00:44,  4.41s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 300, 'solver': 'adam'} 



10it [00:43,  4.34s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 100, 'solver': 'sgd'} 



10it [00:42,  4.25s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 100, 'solver': 'adam'} 



10it [00:43,  4.31s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 200, 'solver': 'sgd'} 



10it [00:42,  4.26s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 200, 'solver': 'adam'} 



10it [00:44,  4.49s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 300, 'solver': 'sgd'} 



10it [00:43,  4.32s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 300, 'solver': 'adam'} 



10it [00:43,  4.38s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'max_iter': 100, 'solver': 'sgd'} 



10it [00:45,  4.51s/it]


nn_accuracies:  0.6363636363636364 params:  {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'max_iter': 100, 'solver': 'adam'} 



4it [00:17,  4.18s/it]

In [None]:
# this isnt exactly what I want because I want to try a bunch of parameters and then measure the accuracy with train_meta_model
# TODO

## h20.ai

In [12]:
import h2o 
from h2o.automl import H2OAutoML

In [13]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.19" 2023-04-18; OpenJDK Runtime Environment (build 11.0.19+7-post-Ubuntu-0ubuntu120.04.1); OpenJDK 64-Bit Server VM (build 11.0.19+7-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
  Starting server from /home/jupyter-as2273/.local/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpuph_4n1v
  JVM stdout: /tmp/tmpuph_4n1v/h2o_jupyter_as2273_started_from_python.out
  JVM stderr: /tmp/tmpuph_4n1v/h2o_jupyter_as2273_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Denver
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 10 days
H2O_cluster_name:,H2O_from_python_jupyter_as2273_4q0z0x
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,29.97 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


In [14]:
# using the same X, y, valid_datasets from above
# convert to h2o frames
X_h2o = h2o.H2OFrame(X)
y_h2o = h2o.H2OFrame(y)

# convert to factors

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
aml = H2OAutoML(max_models=20, max_runtime_secs=300, seed=1)
aml.train(x=X_h2o.columns, y=y_h2o.columns[0], training_frame=X_h2o)

In [16]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

_train param, Dropping bad and constant columns: (and it dropped all of them)

and then it said it cant complete in the 300 max runtime it was given. So it looks like this approach will *not* work 