<a href="https://colab.research.google.com/github/YarkoMarko/Boosting/blob/main/notebook06d71ca5a5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-quoted download URL>" entries.
# <directory> is the folder created under KAGGLE_INPUT_PATH for that source.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it presumably stops working after that window (the download
# loop reports HTTP errors as "likely expired") - regenerate it if needed.
DATA_SOURCE_MAPPING = 'small-nice-data-dota2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1202763%2F2011027%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240325%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240325T121921Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2ae0b7ebd71da11e9a73b9e5ac3f589059ae2173000c8d3fd9770d17f06245abebcfae2fd39b9b54c49fb90cb5202cd0ac1c6513d0392db2ac15bdded037be7a84d8f09b37a7bd868941153dbadded91275f71ae67f58351113d0776d479489182dc59e01e121d9c75237dc60ff81ee4a09ef5d7d065d097a161ae98c92fbcef7ea0cd7299cd55d02dcfb17dd3a14fcf979c16e605543de1d4a9c9e75b7970228eee73d45430d570ac67fca9a9e621f72a4815940ecd69e08382857fb917a841f044ca65a899c4e4d2ab42c389cd92171c907611632736b55bae863f89fe81669c99ba98556db23909e7c6f94d2b0c10da81b51c6dcc79d6c0cd5d4ea2d325df'

# Directory that downloaded data sources are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
# Each DATA_SOURCE_MAPPING entry has the form "<directory>:<URL-quoted URL>".
# Failures are reported and the loop moves on to the next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a stray colon in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter is
            # shown instead of a progress bar.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # Clamp so the bar never exceeds its 50-character width.
                    done = min(50, int(50 * dl / total_length))
                    progress = f"[{'=' * done}{' ' * (50-done)}] "
                else:
                    progress = ''
                sys.stdout.write(f"\r{progress}{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk and rewind before the archive readers
            # touch the file (tarfile reopens it by name).
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the imported module with the opened archive.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory; the first CSV column
# is used as the row index.
df = pd.read_csv(
    "/kaggle/input/small-nice-data-dota2/small_nice_data.csv",
    index_col=0,
)

# Preview the first rows (shown as the cell output).
df.head()

In [None]:
# Separate the regression target ("time") from the feature columns.
X = df.drop(columns="time")
y = df["time"]

In [None]:
# Convert every object-dtype feature column to pandas' categorical dtype.
cat_columns = X.select_dtypes(include="object").columns
X = X.astype({column: "category" for column in cat_columns})

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest for testing;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups routed to the numeric and categorical preprocessing pipelines.
# cat_columns is recomputed with include="category" because the object columns
# were converted to the category dtype earlier in the notebook.
num_columns = X.select_dtypes(include="number").columns
cat_columns = X.select_dtypes(include="category").columns

In [None]:
# Numeric features: fill missing values with the column median.
num_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical features: fill missing values with the most frequent value.
# NOTE(review): the imputed categorical columns may come back as object dtype
# rather than category, which LightGBM's pandas input check may reject -
# confirm whether the dataset actually contains categorical columns.
cat_transformer = Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))])

# Route each column group through its own imputation pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_columns),
        ("cat", cat_transformer, cat_columns),
    ]
)

# Make transform()/fit_transform() return pandas DataFrames.
preprocessor.set_output(transform="pandas")

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split. Both names are rebound to the transformed
# data, so the raw splits are no longer available after this cell.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM regressor with the 'mse' objective; every other hyperparameter is
# left at the library default.
model = lgb.LGBMRegressor(objective='mse')

In [None]:
# Fit on the training split while evaluating the "mse" metric on both splits,
# labelled "test" and "train" in the recorded results.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test), (X_train, y_train)],
    eval_names=["test", "train"],
    eval_metric="mse",
)

In [None]:
from sklearn import metrics

# Predictions for the held-out test split, reused when the test metrics are
# computed further down.
y_pred = model.predict(X_test)

def get_metrics(y_true, y_pred, name="model"):
    """Summarise regression quality as a single-column table.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values aligned with ``y_true``.
        name: Label used for the resulting column.

    Returns:
        A DataFrame indexed by "MAE", "RMSE" and "R2" with one column
        called ``name``; values are rounded to two decimals.
    """
    scores = {
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": metrics.mean_squared_error(y_true, y_pred) ** 0.5,
        "R2": metrics.r2_score(y_true, y_pred),
    }
    return pd.DataFrame({name: scores}).round(2)

In [None]:
# Empty table that collects one metrics column per evaluated split.
all_metrics = pd.DataFrame()

In [None]:
# get_metrics returns a one-column DataFrame (rows MAE / RMSE / R2); each
# assignment stores that column under the split's label. The test column
# reuses the y_pred computed earlier instead of predicting again.
all_metrics["train"] = get_metrics(y_train, model.predict(X_train))
all_metrics["test"] = get_metrics(y_test, y_pred)

In [None]:
# Display the train/test metrics table as the cell output.
all_metrics

In [None]:
# Plot the metric values recorded for the eval sets during fit.
# NOTE(review): fit requested eval_metric="mse" while this asks for "l2";
# presumably LightGBM records mse under the name "l2" - confirm.
lgb.plot_metric(model,
               metric="l2")

In [None]:
# Bundle the already-fitted preprocessor and regressor into one pipeline so a
# single object carries both steps. Note that `model` is rebound from the bare
# regressor to this pipeline.
model = Pipeline([("prop", preprocessor), ("model", model)])

In [None]:
import joblib

In [None]:
# Persist the preprocessing + regressor pipeline to model_lgb.pkl in the
# current working directory.
joblib.dump(model, "model_lgb.pkl")