# Setup

In [1]:
!pip install sklearn --upgrade -q
!pip install catboost -q

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

# Preprocessing

In [3]:
# Load the competition training table.
train_csv_path = r"../input/ventilator-pressure-prediction/train.csv"
train_df = pd.read_csv(train_csv_path)

In [4]:
# Keep the row ids aside, then remove the identifier columns, which the
# preprocessing pipeline does not consume.
identifier_cols = ["breath_id", "id"]
train_ids = train_df["id"]
train_df = train_df.drop(identifier_cols, axis=1)

In [5]:
# Separate the regression target from the feature columns.
target_col = "pressure"
X = train_df.drop(target_col, axis=1)
y = train_df[target_col]

In [6]:
# Columns handled as numeric (median-imputed, then standardised) and columns
# handled as categorical (one-hot encoded).
num_attribs = ["time_step", "u_in"]
cat_attribs = ["R", "C", "u_out"]

numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("standard_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Only the columns listed above are passed through; each group gets its own
# transformer.
column_transformers = [
    ("num_pipeline", num_pipeline, num_attribs),
    ("1hot_encoder", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

# Fit the preprocessing on the training features and keep the transformed matrix.
X_tfm = full_pipeline.fit_transform(X)

# Training

In [7]:
# XGBoost and CatBoost regressors, both configured for GPU training.
xgb_clf = XGBRegressor(tree_method="gpu_hist")
cb_clf = CatBoostRegressor(task_type="GPU")

# Parameters handed to LightGBM's native `lgb.train` call later in the notebook.
lgbm_params = dict(objective="regression")

In [8]:
# Load the training rows with their `kfold` assignment and show the frame as
# this cell's output.
folds_csv_path = r"../input/always-splitting-data-first/train_folds.csv"
train_fold_df = pd.read_csv(folds_csv_path)
train_fold_df

In [9]:
# Train one LightGBM booster per fold and keep them keyed by fold index.
# Reuses `train_fold_df` loaded in the previous cell instead of re-reading the CSV.
# NOTE(review): each booster is fit on the rows where `kfold == fold` (capped at
# ROWS_PER_FOLD), not on the complement of that fold -- confirm this is intended.
ROWS_PER_FOLD = 160000  # upper bound on the training rows taken from each fold
NON_FEATURE_COLS = ["id", "breath_id", "kfold", "pressure"]

lgbm_5_fold = {}

for fold in range(5):
    # Filter the fold once and derive both the target and the features from it.
    fold_df = train_fold_df[train_fold_df.kfold == fold][:ROWS_PER_FOLD]
    lgbm_y = fold_df["pressure"]
    # A fold-local name keeps the full training feature frame `X` intact.
    fold_X = fold_df.drop(columns=NON_FEATURE_COLS)
    # The pipeline was already fitted on the full training features; only transform here.
    lgbm_X_tfm = full_pipeline.transform(fold_X)

    X_train = lgb.Dataset(lgbm_X_tfm, label=lgbm_y.values)
    lgb_clf = lgb.train(lgbm_params, X_train, num_boost_round=10000)
    lgbm_5_fold[fold] = lgb_clf

In [10]:
# Fit both boosted-tree regressors on the full transformed training matrix.
train_target = y.values
xgb_clf.fit(X_tfm, train_target)
cb_clf.fit(X_tfm, train_target)

# Inference

In [11]:
# Load the test table, remember its row ids for the submission file, and drop
# the id column from the frame.
test_csv_path = r"../input/ventilator-pressure-prediction/test.csv"
test_df = pd.read_csv(test_csv_path)
test_ids = test_df["id"]
test_df = test_df.drop("id", axis=1)

In [23]:
# Apply the preprocessing that was fitted on the training data. Calling
# `fit_transform` here would refit the imputer, scaler and encoder on the test
# set, so test features would be scaled differently from what the models saw.
X_test = full_pipeline.transform(test_df)

# Equal-weight blend of the XGBoost and CatBoost predictions.
preds = xgb_clf.predict(X_test)
preds_1 = cb_clf.predict(X_test)
preds = (preds + preds_1) / 2

In [13]:
# Collect LightGBM predictions on the test matrix. The `break` stops after the
# first stored booster, so `lgbm_preds` holds at most one prediction array.
lgbm_preds = []
for lgbm in lgbm_5_fold.values():
    # Use a loop-local name: assigning to `preds` here would overwrite the
    # XGBoost/CatBoost blend and change what later cells blend and submit.
    fold_preds = lgbm.predict(X_test)
    lgbm_preds.append(fold_preds)
    break

In [24]:
# Average (not sum) the LightGBM predictions with `preds` so the result stays
# on the same scale as the individual models' pressure predictions.
preds_final = (lgbm_preds[0] + preds) / 2

In [25]:
# Show the blended prediction vector as this cell's output.
preds_final

# Submission

In [26]:
# Pair each test row id with its predicted pressure and write the submission file.
# NOTE(review): this writes `preds`; `preds_final` computed earlier is not used
# here -- confirm which vector is meant to be submitted.
submission_columns = {"id": test_ids, "pressure": preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)