In [1]:
import numpy as np
import cuml
import cupy
from cuml.ensemble import RandomForestRegressor

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

In [2]:
X_train, y_train, X_test = np.load("./data/dataset.npz", allow_pickle=True).values()

pca = PCA(n_components=2)
Y_train = pca.fit_transform(X_train)
Y_test = pca.transform(X_test)


In [3]:
Yg_train = cupy.asarray(Y_train)
Yg_test = cupy.asarray(Y_test)
yg_train = cupy.asarray(y_train)

reg = RandomForestRegressor(random_state=42)
reg.fit(Yg_train, yg_train)
y_pred = reg.predict(Yg_test)

In [4]:
reg.score(Yg_train, yg_train)

0.01894531523004428

In [9]:
import time

# some hyperparameter tuning

max_depths = [5, 10, 15, 20]
max_features = [1, 2]
n_estimators = [50, 100, 200]

results = {(md, mf, ne): 1e8 for md in max_depths for mf in max_features for ne in n_estimators}
times = {(md, mf, ne): 1e8 for md in max_depths for mf in max_features for ne in n_estimators}
for (md, mf, ne) in results.keys():
    start = time.time()
    reg = RandomForestRegressor(max_depth=md, max_features=mf, n_estimators=ne, random_state=42)
    reg.fit(Yg_train, yg_train)
    results[(md, mf, ne)] = reg.score(Yg_train, yg_train)
    end = time.time()
    times[(md, mf, ne)] = end - start
    print(f"Trained model with (md={md}, mf={mf}, ne={ne}) in {end - start:.2f} seconds")

Trained model with (md=5, mf=1, ne=50) in 1.50 seconds
Trained model with (md=5, mf=1, ne=100) in 2.56 seconds
Trained model with (md=5, mf=1, ne=200) in 4.96 seconds
Trained model with (md=5, mf=2, ne=50) in 1.30 seconds
Trained model with (md=5, mf=2, ne=100) in 2.26 seconds
Trained model with (md=5, mf=2, ne=200) in 4.34 seconds
Trained model with (md=10, mf=1, ne=50) in 1.29 seconds
Trained model with (md=10, mf=1, ne=100) in 2.22 seconds
Trained model with (md=10, mf=1, ne=200) in 4.69 seconds
Trained model with (md=10, mf=2, ne=50) in 1.32 seconds
Trained model with (md=10, mf=2, ne=100) in 2.71 seconds
Trained model with (md=10, mf=2, ne=200) in 5.29 seconds
Trained model with (md=15, mf=1, ne=50) in 1.30 seconds
Trained model with (md=15, mf=1, ne=100) in 2.64 seconds
Trained model with (md=15, mf=1, ne=200) in 4.77 seconds
Trained model with (md=15, mf=2, ne=50) in 1.90 seconds
Trained model with (md=15, mf=2, ne=100) in 3.51 seconds
Trained model with (md=15, mf=2, ne=200) in

In [10]:
best_params = min(results.items(), key=lambda x: x[1])
print(f"Best params: {best_params}")

best_time = min(times.items(), key=lambda x: x[1])
print(f"Best time: {best_time}")

Best params: ((5, 1, 50), 0.00010608167465919749)
Best time: ((10, 1, 50), 1.293670892715454)


In [14]:
reg = RandomForestRegressor(max_depth=best_params[0][0], max_features=best_params[0][1], n_estimators=best_params[0][2], random_state=42)
reg.fit(Yg_train, yg_train)
y_pred = reg.predict(Yg_test)

In [16]:
import pandas
dtest = pandas.read_csv('./data/test.csv')

dl = pandas.DataFrame({'id': dtest['id'], 'beats': y_pred.get()})
dl.to_csv('./data/submission_rf_pca.csv', index=False)