In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

import sys
sys.path.append('../src/')

from ml import datasets
from ml import models

import configuration

from utils import metrics

from notebook_utils import fit_and_predict_ml as fit_and_predict

In [3]:
# Short identifiers of the model families to tune; each one is handed to
# models.ModelGenerator further down.
# NOTE(review): presumably random forest, gradient boosting, AdaBoost and SVM —
# confirm against models.ModelGenerator.
model_names = ['rf', 'gb', 'ab', 'svm']

# Every dataset key registered in the project configuration.
dataset_names = [name for name in configuration.datasets.keys()]

# Experiment sizes, read from the shared configuration module so that all
# notebooks use the same values.
NUM_REPL, NUM_SEARCHES = (
    configuration.NUM_REPLICATES,
    configuration.NUM_SEARCHES,
)

In [1]:

# Hyper-parameter search followed by a final fit/predict run, repeated for
# every (model, dataset) combination.
#
# For each pair:
#   1. load the train/valid/test splits of the dataset,
#   2. iterate over the candidate models produced by models.ModelGenerator,
#      fit each on the training split and score it by RMSE on the validation
#      split (lower is better),
#   3. hand the winning parameters and model to fit_and_predict together with
#      the output location.

# The metric callable does not depend on the model or dataset, so it is looked
# up once instead of once per candidate.
rmse_fn = metrics.get('rmse')

for model_name in model_names:

    for dataset_name in dataset_names:

        # Each split is indexed with 'X' (features) and 'y' (targets) below.
        train, valid, test = datasets.get_descriptor_datasets(
            dataset_path=f'../input/datasets/{dataset_name}.csv')

        model_iter = models.ModelGenerator(model_name, NUM_SEARCHES)

        # Reset the search state for *this* (model, dataset) pair. Without the
        # reset, a search that selects nothing would silently reuse the
        # parameters and model chosen for the previous pair.
        best_score = float('inf')
        best_params = None
        best_model = None
        for model in tqdm(model_iter):

            model.fit(train['X'], train['y'])
            preds = model.predict(valid['X'])
            rmse = rmse_fn(valid['y'], preds)

            # A NaN score never satisfies this comparison, so candidates with
            # an undefined RMSE are ignored.
            if rmse < best_score:
                best_score = rmse
                # Copy, so later candidates cannot alter the stored parameters
                # through the generator's own `param` object.
                best_params = model_iter.param.copy()
                best_model = model

        if best_model is None:
            # Either the generator yielded no candidates or every candidate
            # produced a non-finite / NaN validation RMSE.
            raise RuntimeError(
                'No valid model found for model {!r} on dataset {!r}: the '
                'search produced no candidate with a finite validation '
                'RMSE.'.format(model_name, dataset_name))

        print('model      : {}'.format(model_name))
        print('dataset    : {}'.format(dataset_name))
        print('best score : {}'.format(best_score))
        print('---'*20)

        # NOTE(review): presumably refits the model NUM_REPL times with the
        # best parameters and writes predictions under save_path — confirm
        # against notebook_utils.fit_and_predict_ml.
        fit_and_predict(
            model_obj=model_iter.model,
            model_params=best_params,
            model_best=best_model,
            datasets=[train, valid, test],
            num_repl=NUM_REPL,
            save_path='../output/predictions/{}/{}'.format(
                dataset_name, model_name)
        )
        
        