In [None]:
import pandas as pd
import os, sys
import numpy as np

# Put the directory two levels above the current working directory at the
# front of the import search path so the `utils` package imported below
# resolves when the notebook is run from its own folder.
_repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.insert(0, _repo_root)

from utils.prediction_utils import *

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import xgboost as xgb

# Seed passed as `random_state` to both the RandomForestRegressor and the
# RandomizedSearchCV below, so the sampled candidates and the fitted forests
# are reproducible across runs.
random_seed = 42

In [None]:
# Load the "mm" dataset and keep only rows usable for regression.
#
# The data directory can be overridden with the GPU_PROFILING_DATA_DIR
# environment variable so the notebook runs on machines other than the
# original author's; when the variable is unset the original path is used.
base_dir = os.environ.get(
    "GPU_PROFILING_DATA_DIR",
    "/Users/andrew/Desktop/Harvard/idreos-research/gpu_profiling/final/data",
)
# NOTE(review): sample_rate=0.3 presumably sub-samples the raw data inside
# get_data -- confirm against utils.prediction_utils.
X, y = get_data("mm", base_dir, sample_rate=0.3)

# Join features and target side by side so the row filters below are applied
# to both consistently, then split them apart again.
df = pd.concat([X, y], axis=1)
df = df.dropna()          # drop rows with any missing feature or target
df = df[df["time"] > 0]   # keep only strictly positive "time" targets
X, y = df.drop(columns=["time"]), df["time"]

In [None]:
# --- Hyper-parameter search setup for the random-forest regressor ----------

# Number of parameter settings the randomized search will sample.
n_iter = 20

# Cross-validation creates its own validation folds, so the train and
# validation parts are merged into one pool; the test part stays held out.
_split = get_train_test_split(X, y, return_concat=False)
X_train, X_val, X_test, y_train, y_val, y_test = _split
X_cv = pd.concat([X_train, X_val])
y_cv = pd.concat([y_train, y_val])

# Candidate values sampled by the search (3 * 2 * 2 * 6 * 2 = 144 combinations).
param_dist = dict(
    max_depth=[100, 150, 200],
    min_samples_split=[2, 5],
    min_samples_leaf=[2, 5],
    max_features=[10, 20, 50, 100, 1.0, "sqrt"],
    n_estimators=[50, 100],
)

tree_model = RandomForestRegressor(random_state=random_seed)

# greater_is_better=False makes the scorer return the *negated* MSE, so the
# search (which always maximises its score) ends up minimising the MSE.
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

randomized_search = RandomizedSearchCV(
    tree_model,
    param_dist,
    n_iter=n_iter,
    scoring=neg_mse_scorer,
    cv=5,
    random_state=random_seed,
    n_jobs=3,
)

In [None]:
# Run the randomized search on the pooled train+validation data.
randomized_search.fit(X_cv, y_cv)
print("Best parameters found: ", randomized_search.best_params_)
# The search scorer was built with greater_is_better=False, so best_score_
# holds the negated MSE; flip the sign to report the actual (positive) MSE.
print("Best MSE found: ", -randomized_search.best_score_)

# With the default refit=True, best_estimator_ has already been refit on all
# of (X_cv, y_cv) using the best parameters, so no additional fit is needed
# before handing it to the evaluation pipeline.
best_tree_model = randomized_search.best_estimator_
run_val_pipeline(best_tree_model, X_cv, X_test, y_cv, y_test, bins=30)