In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn import metrics
import joblib
from sklearn import metrics

In [2]:
# Location of the dataset CSV under the Kaggle input directory.
data_path = "/kaggle/input/small-nice-data-dota2/small_nice_data.csv"

# The first CSV column is used as the row index instead of a data column.
df = pd.read_csv(data_path, index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0,time,radiant_roshankill,radiant_Δroshankill,dire_roshankill,dire_Δroshankill,good_tower1,good_tower2,good_tower3,good_melee,good_range,...,radiant_buyback,dire_buyback,radiant_Δrunes,dire_Δrunes,radiant_runes,dire_runes,radiant_aegis,dire_aegis,radiantΔaegis,direΔaegis
0,0,0,0,0,0,3,3,3,3,3,...,0,0,1,1,1,1,0,0,0,0
1,60,0,0,0,0,3,3,3,3,3,...,0,0,1,1,2,2,0,0,0,0
2,120,0,0,0,0,3,3,3,3,3,...,0,0,0,0,2,2,0,0,0,0
3,180,0,0,0,0,3,3,3,3,3,...,0,0,0,0,2,2,0,0,0,0
4,240,0,0,0,0,3,3,3,3,3,...,0,0,0,0,2,2,0,0,0,0


In [3]:
# The `time` column is the regression target; every other column is a feature.
X, y = df.drop(columns='time'), df['time']

# Seeded random row split: 70% of the rows go to training, the rest to test.
# NOTE(review): the preview shows `time` advancing in fixed steps (0, 60, 120, ...),
# which suggests consecutive rows may be snapshots of the same match. If so, a
# random row split places rows of one match on both sides -- confirm, and
# consider a group-wise split if a match identifier is available.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [4]:
# (rows, columns) of the training and test feature frames, in that order.
tuple(part.shape for part in (X_train, X_test))

((115009, 82), (49291, 82))

In [5]:
# Partition the feature columns by dtype so each group gets its own preprocessing.
num_columns = X.select_dtypes(include='number').columns
cat_columns = X.select_dtypes(include='object').columns

# Numeric features: fill missing values with the column median.
num_transformer = Pipeline(
    steps=[('impute', SimpleImputer(strategy='median'))]
)

# Object-typed features: fill missing values with the most frequent value, then
# map each category to an integer code; categories not seen during fit become -1.
cat_transformer = Pipeline(
    steps=[('impute', SimpleImputer(strategy='most_frequent')),
           ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                      unknown_value=-1))]
)

# Route each column group to its transformer. Columns matched by neither
# selector are not passed through (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ]
)

# Emit pandas DataFrames (with transformer-prefixed column names) instead of arrays.
preprocessor.set_output(transform='pandas')

# Learn the imputation/encoding statistics from the training split only and
# transform it in the same pass; the test split is transformed with those
# already-fitted statistics.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

X_train_prep

Unnamed: 0,num__radiant_roshankill,num__radiant_Δroshankill,num__dire_roshankill,num__dire_Δroshankill,num__good_tower1,num__good_tower2,num__good_tower3,num__good_melee,num__good_range,num__good_fort,...,num__radiant_buyback,num__dire_buyback,num__radiant_Δrunes,num__dire_Δrunes,num__radiant_runes,num__dire_runes,num__radiant_aegis,num__dire_aegis,num__radiantΔaegis,num__direΔaegis
147842,0.0,0.0,0.0,0.0,2.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,7.0,9.0,0.0,0.0,0.0,0.0
118988,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,1.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0
98603,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0
49785,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0
17595,1.0,0.0,0.0,0.0,1.0,3.0,3.0,3.0,3.0,1.0,...,1.0,2.0,1.0,0.0,17.0,13.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120204,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
103933,0.0,0.0,0.0,0.0,2.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,2.0,3.0,11.0,11.0,0.0,0.0,0.0,0.0
132257,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,1.0,1.0,8.0,5.0,0.0,0.0,0.0,0.0
147249,1.0,0.0,0.0,0.0,2.0,3.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,15.0,8.0,1.0,0.0,0.0,0.0


In [6]:
model = lgb.LGBMRegressor(
    objective='mae',   # absolute-error objective; huber only reached R2 = 0.02
    n_estimators=100,  # build 100 trees
)

In [7]:
# Datasets evaluated while fitting, keyed by the label they are reported under.
eval_sets = {
    'test': (X_test_prep, y_test),
    'train': (X_train_prep, y_train),
}

model.fit(
    X_train_prep,
    y_train,
    eval_set=list(eval_sets.values()),
    eval_names=list(eval_sets.keys()),
    eval_metric='mse',
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8332
[LightGBM] [Info] Number of data points in the train set: 115009, number of used features: 82
[LightGBM] [Info] Start training from score 1020.000000


In [8]:
def get_metrics(y_true, y_pred, name="model"):
    """Summarise regression quality as a one-column DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    name : str, default "model"
        Label of the single column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Rows ``MAE``, ``RMSE`` and ``R2`` (in that order) and one column called
        ``name``; all values are rounded to two decimals.
    """
    scores = {
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": metrics.mean_squared_error(y_true, y_pred) ** 0.5,
        "R2": metrics.r2_score(y_true, y_pred),
    }

    # Build the frame in a single constructor call (explicit index and column)
    # rather than enlarging an empty frame one cell at a time.
    summary = pd.DataFrame(
        list(scores.values()), index=list(scores), columns=[name]
    )
    return summary.round(2)

In [9]:
# Score the model on both splits. `name=` labels each one-column result, so the
# two frames can be placed side by side (aligned on the MAE / RMSE / R2 index)
# without assigning DataFrames into columns of an empty frame.
df_metrics = pd.concat(
    [
        get_metrics(y_train, model.predict(X_train_prep), name="train"),
        get_metrics(y_test, model.predict(X_test_prep), name="test"),
    ],
    axis=1,
)

df_metrics

Unnamed: 0,train,test
MAE,31.65,32.39
RMSE,55.0,56.36
R2,0.99,0.99


In [10]:
# Bundle preprocessing and the fitted regressor into a single estimator.
#
# `model` is rebound here from the bare regressor to the pipeline. To keep the
# cell safe to re-run, unwrap the regressor first if `model` is already a
# pipeline; otherwise a second run would nest a Pipeline inside a Pipeline.
regressor = model.named_steps["model"] if isinstance(model, Pipeline) else model

# NOTE: from this cell on, `model` expects raw features (e.g. X_test), because
# the "prep" step applies the preprocessing itself; do not pass X_*_prep to it.
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("model", regressor),
    ]
)

model

In [11]:
# File the estimator is serialised to (relative to the working directory).
model_path = 'model.pkl'

joblib.dump(model, model_path)

# Load the file back to confirm it can be deserialised.
# joblib files are pickle-based: only load files from a trusted source.
load_my_model = joblib.load(model_path)