In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the training table, order it by (time_id, investment_id), drop the
# row identifier, renumber the index, and keep only the first 100000 rows.
data = (
    pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
    .sort_values(by=['time_id', 'investment_id'])
    .drop(columns='row_id')
    .reset_index(drop=True)
    .iloc[:100000]
)

In [3]:
# Show (rows, columns) of the loaded frame.
data.shape

In [4]:
# Separate the prediction target from the remaining columns.
data_target = data['target']
data_input = data.drop(columns='target')

# Preview the first ten rows of the feature frame.
data_input.head(n=10)

In [5]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# models
from sklearn.tree import DecisionTreeRegressor

# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import warnings
# Suppresses every warning category from here on, which also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [6]:
# Candidate values shared by all three tuned hyper-parameters.
_tree_candidates = [2, 3, 5, 7, 10]

# Parameter grid searched for the decision tree (5 x 5 x 5 combinations).
tree_params = [
    {
        'max_depth': list(_tree_candidates),
        'min_samples_split': list(_tree_candidates),
        'min_samples_leaf': list(_tree_candidates),
    }
]

# Model registry: name -> (estimator, parameter grid).
# The estimator gets a fixed seed because scikit-learn permutes the features
# at every split, so ties between equally good splits could otherwise be
# resolved differently from one run to the next.
models_only_tree = {
    'DT': (DecisionTreeRegressor(random_state=0), tree_params)
}

In [7]:
# Model names in alphabetical order.
models = sorted(models_only_tree)

# MSE is a loss, so it has to be registered with greater_is_better=False.
# scikit-learn then negates the value (calling this scorer yields -MSE), which
# makes GridSearchCV's rank 1 the parameter set with the LOWEST cross-validated
# error. With the default (greater_is_better=True) rank 1 would be the
# parameter set with the highest MSE, i.e. the worst one.
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False)
}

# Layout of the results table: identifying columns first, then one TRAIN_ and
# one TEST_ column per metric.
results_columns = (
    ['DATASET', 'MODEL', 'TRIAL']
    + ['TRAIN_' + name for name in scoring]
    + ['TEST_' + name for name in scoring]
)


In [8]:
def perform_trials(dataset_name, models, data_X, data_y, num_trials=1):
    """Tune and evaluate every model in ``models`` on repeated train/test splits.

    For each trial the data is split 70/30 with ``train_test_split`` (seeded
    with the trial index). A 5-fold ``GridSearchCV`` over the model's parameter
    grid is run on the training part using the module-level ``scoring`` dict.
    For every metric, the parameter set ranked first is refitted on the whole
    training part and scored on both the training and the held-out part.

    Parameters
    ----------
    dataset_name : str
        Label stored in the ``DATASET`` column of the results.
    models : dict
        Maps a model name to a ``(estimator, param_grid)`` pair.
    data_X, data_y
        Features and target handed to ``train_test_split``.
    num_trials : int, default 1
        Number of independent train/test splits evaluated per model.

    Returns
    -------
    pandas.DataFrame
        One row per (model, trial), with the columns listed in the
        module-level ``results_columns``.
    """
    # Rows are collected in a plain list and turned into a frame at the end;
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []

    for model_name in models.keys():
        model = models[model_name][0]
        model_params_grid = models[model_name][1]

        for trial_count in range(num_trials):
            # 70% training / 30% test split, reproducible per trial
            X_train, X_test, y_train, y_test = train_test_split(
                data_X, data_y, test_size=0.3, random_state=trial_count)

            # 5-fold grid search; refit=False because the best estimator is
            # rebuilt separately for each metric below
            search = GridSearchCV(model, model_params_grid, cv=5, verbose=3,
                                  n_jobs=-1, refit=False, scoring=scoring)
            search.fit(X_train, y_train)

            # one result row per trial
            model_result = {
                'DATASET': dataset_name,
                'MODEL': model_name,
                'TRIAL': trial_count + 1
            }

            for score_name, scorer in scoring.items():
                # rank 1 (the smallest rank) marks the parameter set that
                # GridSearchCV considers best for this metric
                best_index = np.argmin(
                    search.cv_results_['rank_test_' + score_name])
                best_params = search.cv_results_['params'][best_index]

                # rebuild the model with those parameters and train it on the
                # full training part
                best_model = clone(model).set_params(**best_params)
                best_model.fit(X_train, y_train)

                # Scorers created with greater_is_better=False return negated
                # values; multiply by the scorer's sign so the table shows the
                # metric itself. `_sign` is a private scikit-learn attribute,
                # hence the fallback of 1 for callables that lack it.
                sign = getattr(scorer, '_sign', 1)
                model_result['TRAIN_' + score_name] = sign * scorer(
                    best_model, X_train, y_train)
                model_result['TEST_' + score_name] = sign * scorer(
                    best_model, X_test, y_test)

            records.append(model_result)

        # progress output: everything collected so far, after each model
        print(pd.DataFrame(records, columns=results_columns))

    return pd.DataFrame(records, columns=results_columns)


In [9]:
# Run the grid-search trials for the decision-tree registry on the
# feature/target split; 'market_price' is only the label written to the
# DATASET column of the results.
results_tree = perform_trials('market_price', models_only_tree, data_input, data_target)


In [10]:
# Display the results table returned by perform_trials.
results_tree

In [None]:
# NOTE(review): bare literal with no effect; it equals the row cap applied
# when `data` is loaded. Looks like leftover scratch - confirm and delete.
100000