In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.3
df = pd.read_excel('files/data_big.xlsx')

cat_x = df['v'].to_numpy()
holder = {}

for uniq in np.unique(cat_x):
    temp_arr = (cat_x==uniq) + np.zeros(len(cat_x))
    holder[str(uniq)] = temp_arr

cat_df = pd.DataFrame(holder, dtype = int)

df_clear = df[['x', 'x1', 'x2', 'x3']].astype(float)
X = pd.concat([df_clear, cat_df], axis = 1)
y = df['y'].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE)

In [4]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

regr = XGBRegressor(n_estimators = 100, max_depth = 200, learning_rate = 0.2)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print(X_train)

print(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print('', r2)
print(mean_absolute_percentage_error(y_test, y_pred))

                  x            x1            x2            x3  1  10  11  100  \
543    3.020225e+10  1.915218e+10  0.000000e+00  1.915218e+10  0   0   0    0   
6062   3.725320e+10  2.629221e+10  2.629221e+10  0.000000e+00  0   0   0    0   
7088   2.081484e+10  4.133891e+09  4.133891e+09  4.133891e+09  0   0   0    0   
6823   1.741384e+10  4.133891e+09  4.133891e+09  4.133891e+09  0   0   0    0   
12359  4.133891e+09  4.133891e+09  4.133891e+09  4.133891e+09  0   0   0    0   
...             ...           ...           ...           ... ..  ..  ..  ...   
9831   1.741384e+10  4.133891e+09  4.133891e+09  4.133891e+09  0   0   0    0   
5614   1.645959e+10  1.244162e+10  1.244162e+10  0.000000e+00  0   0   0    0   
13682  3.062777e+10  1.866064e+10  1.866064e+10  1.866064e+10  0   0   0    0   
2538   2.983298e+10  2.477520e+10  0.000000e+00  2.477520e+10  0   0   0    0   
6765   2.381807e+10  4.133891e+09  4.133891e+09  4.133891e+09  0   0   0    0   

       101  110  111  
543 

In [5]:
import optuna

def objective(trial):
    params={'n_estimators': trial.suggest_int('n_estimators', 10, 200, step=10),
            'max_depth':trial.suggest_int('max_depth', 10, 350, step=10),
            'learning_rate':trial.suggest_float('learning_rate',0.01, 1, step=0.01)}
    regr = XGBRegressor(**params)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    return mape

study = optuna.create_study()
study.optimize(objective, n_trials=200)


  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-02-18 08:42:43,473][0m A new study created in memory with name: no-name-377ee5aa-155e-4b69-8c99-bd99f2857d85[0m
[32m[I 2023-02-18 08:42:44,296][0m Trial 0 finished with value: 2.427426840730055 and parameters: {'n_estimators': 50, 'max_depth': 200, 'learning_rate': 0.76}. Best is trial 0 with value: 2.427426840730055.[0m
[32m[I 2023-02-18 08:42:47,539][0m Trial 1 finished with value: 2.4195402552627585 and parameters: {'n_estimators': 70, 'max_depth': 50, 'learning_rate': 0.34}. Best is trial 1 with value: 2.4195402552627585.[0m
[32m[I 2023-02-18 08:42:51,427][0m Trial 2 finished with value: 2.4235290088837704 and parameters: {'n_estimators': 180, 'max_depth': 250, 'learning_rate': 0.16}. Best is trial 1 with value: 2.4195402552627585.[0m
[32m[I 2023-02-18 08:42:51,612][0m Trial 3 finished with value: 2.4178906395693023 and parameters: {'n_estimators': 10, 'max_depth': 270, 'learning_rate': 0.6}. Best is trial 

In [6]:
study.best_params
regr = XGBRegressor(**study.best_params)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print('', r2)
print(mean_absolute_percentage_error(y_test, y_pred))

 0.6155113555355167
1.0267640515542764
