In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
print(train.head(6))

   id        date country              store             product  num_sold
0   0  2010-01-01  Canada  Discount Stickers   Holographic Goose       NaN
1   1  2010-01-01  Canada  Discount Stickers              Kaggle     973.0
2   2  2010-01-01  Canada  Discount Stickers        Kaggle Tiers     906.0
3   3  2010-01-01  Canada  Discount Stickers            Kerneler     423.0
4   4  2010-01-01  Canada  Discount Stickers  Kerneler Dark Mode     491.0
5   5  2010-01-01  Canada  Stickers for Less   Holographic Goose     300.0


In [3]:
train.dtypes

id            int64
date         object
country      object
store        object
product      object
num_sold    float64
dtype: object

In [4]:
from catboost import CatBoostRegressor
from catboost import Pool

train['num_sold'] = train['num_sold'].fillna(train['num_sold'].mean())
train['num_sold'] = np.log(train['num_sold'])

X = train.drop(columns = ['id', 'num_sold']).astype('str').astype('category')
y = train['num_sold']

x_test = test.drop(columns = 'id').astype('str').astype('category')

train_pool = Pool(data = X, label = y, cat_features = X.columns.values)
test_pool = Pool(data = x_test, cat_features = x_test.columns.values)

catboost_model = CatBoostRegressor(
                    iterations = 1000,
                    learning_rate = 0.1,
                    depth = 6,
                    cat_features = X.columns.values,
                    loss_function = 'MAE',
                    verbose = 100
)

catboost_model.fit(train_pool)

cat_train = catboost_model.predict(train_pool).reshape(-1,1)
cat_test = catboost_model.predict(test_pool).reshape(-1,1)

0:	learn: 0.9834042	total: 222ms	remaining: 3m 41s
100:	learn: 0.1664849	total: 7.03s	remaining: 1m 2s
200:	learn: 0.1597024	total: 13.5s	remaining: 53.7s
300:	learn: 0.1570992	total: 20.3s	remaining: 47.2s
400:	learn: 0.1557519	total: 26.8s	remaining: 40s
500:	learn: 0.1545580	total: 33.2s	remaining: 33s
600:	learn: 0.1537465	total: 39.6s	remaining: 26.3s
700:	learn: 0.1527565	total: 46s	remaining: 19.6s
800:	learn: 0.1524021	total: 52.9s	remaining: 13.1s
900:	learn: 0.1520118	total: 59.3s	remaining: 6.51s
999:	learn: 0.1516890	total: 1m 5s	remaining: 0us


In [5]:
import xgboost as xgb

dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
dtest = xgb.DMatrix(x_test, enable_categorical=True)

params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse',
    'seed': 42,
}
xgb_model = xgb.train(params, dtrain, 1000, verbose_eval=100)
xgb_train = xgb_model.predict(dtrain).reshape(-1,1)
xgb_test = xgb_model.predict(dtest).reshape(-1,1)

In [6]:
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.1)
lgb_model.fit(X, y)
lgb_train = lgb_model.predict(X).reshape(-1,1)
lgb_test = lgb_model.predict(x_test).reshape(-1,1)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2546
[LightGBM] [Info] Number of data points in the train set: 230130, number of used features: 4
[LightGBM] [Info] Start training from score 5.952440


In [7]:
stacked_train = np.hstack((xgb_train, cat_train, lgb_train))
stacked_test = np.hstack((xgb_test, cat_test, lgb_test))

In [8]:
from sklearn.linear_model import Ridge
meta_model = Ridge(alpha=1.0)
meta_model.fit(stacked_train, y)

In [9]:
y_pred = np.exp(meta_model.predict(stacked_test))

In [10]:
submission = pd.DataFrame({
                'id' : test['id'],
                'num_sold' : y_pred
})

submission.to_csv('submission.csv', index=False)