In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

import sys
sys.path.append('../src/')

from ml import datasets as ml_datasets
from ml import ml_models

from utils import metrics

In [2]:
def generate_output(model_obj,
                    datasets,
                    save_path):
    """Predict on each data split with a fitted model and save the results.

    For every split that is not ``None``, a stacked array is written to
    ``<save_path>/<split name>.npy`` where index 0 along the first axis
    holds the split's targets (``dataset['y']``) and index 1 holds the
    model's predictions for ``dataset['X']``.

    Parameters
    ----------
    model_obj : object
        Fitted estimator exposing ``predict(X)``. It is also printed so the
        selected hyperparameters show up in the cell output.
    datasets : sequence
        Splits in the order train, valid, test_1, test_2. Each entry is
        either ``None`` (skipped) or a mapping with the keys ``'X'`` and
        ``'y'``. Extra entries beyond the fourth are ignored.
    save_path : str or path-like
        Output directory; created (including parents) when missing.
    """
    # exist_ok=True replaces the isdir()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(save_path, exist_ok=True)

    print(model_obj)

    split_names = ['train', 'valid', 'test_1', 'test_2']
    for name, dataset in zip(split_names, datasets):
        if dataset is None:
            continue

        y_pred = model_obj.predict(dataset['X'])

        # np.save appends the '.npy' extension to the given file name.
        np.save(os.path.join(save_path, name),
                np.stack([dataset['y'], y_pred]))
    
        

In [3]:
# Short codes of the regressors to tune; each code is handed to the model
# generator in the search loop. The recorded output shows 'rf', 'gb' and 'ab'
# resolving to RandomForestRegressor, GradientBoostingRegressor and
# AdaBoostRegressor; 'svm' is presumably a support-vector regressor (no
# output for it is recorded here, so confirm against the model module).
model_names = ['rf', 'gb', 'ab', 'svm']

# Dataset identifiers; each one is read from ../input/datasets/<name>.csv.
dataset_names = ['RIKEN', 'Fiehn_HILIC', 'SMRT']

# Second argument to the model generator; the recorded progress bars show
# this many candidate models per (model, dataset) search.
NUM_SEARCHES = 20

In [None]:

# Random hyperparameter search: for every (model, dataset) pair, fit each
# candidate on the training split, keep the one with the lowest validation
# MAE, and save its predictions for all splits.
for model_name in model_names:

    for dataset_name in dataset_names:

        train, valid, test_1, test_2 = ml_datasets.get_descriptor_datasets(
            dataset_path=f'../input/datasets/{dataset_name}.csv')

        # The notebook imports `ml_models`; a bare `models` name is never
        # defined, so the original call failed on a fresh kernel.
        model_iter = ml_models.ModelGenerator(model_name, NUM_SEARCHES)

        # Reset both trackers for every search so a winner from a previous
        # (model, dataset) pair can never be saved under this pair's path.
        best_error = float('inf')
        best_model = None
        for model in tqdm(model_iter):

            model.fit(train['X'], train['y'])
            preds = model.predict(valid['X'])
            error = metrics.get('mae')(valid['y'], preds)

            if error < best_error:
                best_error = error
                best_model = model

        if best_model is None:
            # Happens when the generator yields nothing or every validation
            # error is NaN; fail loudly instead of writing stale predictions.
            raise RuntimeError(
                'no candidate model selected for model={!r}, dataset={!r}'
                .format(model_name, dataset_name))

        print('model      : {}'.format(model_name))
        print('dataset    : {}'.format(dataset_name))
        print('best score : {}'.format(best_error))
        print('---'*20)

        generate_output(
            model_obj=best_model,
            datasets=[train, valid, test_1, test_2],
            save_path='../output/predictions/{}/{}'.format(
                dataset_name, model_name)
        )
        
        

100%|██████████| 20/20 [00:04<00:00,  4.99it/s]


model      : rf
dataset    : RIKEN
best score : 0.7200726495726499
------------------------------------------------------------
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=46, max_features=36, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=120, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


100%|██████████| 20/20 [00:05<00:00,  3.78it/s]

model      : rf
dataset    : Fiehn_HILIC
best score : 0.9692966199958842
------------------------------------------------------------
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=93, max_features=52, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=202, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)



 20%|██        | 4/20 [01:16<04:51, 18.25s/it]

100%|██████████| 20/20 [01:01<00:00,  3.07s/it]

model      : rf
dataset    : RIKEN
best score : 0.7467297008547013
------------------------------------------------------------
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=46, max_features=36, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=120, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)



100%|██████████| 20/20 [03:35<00:00, 10.79s/it]

model      : rf
dataset    : Fiehn_HILIC
best score : 0.98239152892562
------------------------------------------------------------
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=11, max_features=71, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=165, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)



100%|██████████| 20/20 [00:07<00:00,  2.54it/s]

model      : gb
dataset    : RIKEN
best score : 0.5589955934881803
------------------------------------------------------------
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.11123579911258707,
                          loss='huber', max_depth=3, max_features=16,
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=170, n_iter_no_change=None,
                          presort='deprecated', random_state=None,
                          subsample=0.6684691198108689, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)



100%|██████████| 20/20 [00:10<00:00,  1.85it/s]

model      : gb
dataset    : Fiehn_HILIC
best score : 0.8967465543743781
------------------------------------------------------------
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1635424442952658,
                          loss='huber', max_depth=2, max_features=95,
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=239, n_iter_no_change=None,
                          presort='deprecated', random_state=None,
                          subsample=0.4566969187504992, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)



100%|██████████| 20/20 [02:53<00:00,  8.68s/it]

model      : ab
dataset    : RIKEN
best score : 0.7493589743589743
------------------------------------------------------------
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                       criterion='mae',
                                                       max_depth=5,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                                     


100%|██████████| 20/20 [12:17<00:00, 36.89s/it]

model      : ab
dataset    : Fiehn_HILIC
best score : 0.9689772727272729
------------------------------------------------------------
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                       criterion='mae',
                                                       max_depth=5,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                               


