# Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load dataset

In [2]:
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
print('train shape:',train.shape)
print('test shape:',test.shape)

train shape: (250000, 102)
test shape: (150000, 101)


In [3]:
train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [4]:
# Train data
X=train.drop(columns = ['loss','id'])
y=train['loss'].values

# Test data
X_test=test.drop(columns = ['id'])
print('Train set:', X.shape)
print('Test set:', X_test.shape)

Train set: (250000, 100)
Test set: (150000, 100)


# Train Catboost model

In [5]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(random_state = 44,
                         thread_count = 4,
                         verbose = False,
                         loss_function = 'RMSE',
                         eval_metric = 'RMSE',
                         od_type = "Iter",
                         early_stopping_rounds = 500,
                         iterations = 10000,
                         task_type = "CPU")
model.fit(X, y, verbose=0)

<catboost.core.CatBoostRegressor at 0x7f721293c110>

# Model performance

In [6]:
from sklearn import metrics

print('R2 score: ', model.score(X, y))
predicted = model.predict(X)
rmse = metrics.mean_squared_error(y, predicted, squared=False)
print('RMSE: ', rmse)

R2 score:  0.1943145114992415
RMSE:  7.127089207837219


# Prediction

In [7]:
y_pred = model.predict(X_test)

# Submission

In [8]:
preds = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")
preds.loss = y_pred
preds.head()

Unnamed: 0,id,loss
0,250000,8.763262
1,250001,4.563811
2,250002,8.589442
3,250003,7.491645
4,250004,6.973805


In [9]:
preds.to_csv('submission_catboost_101.csv', index=False)