In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [2]:
# pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# preprocessing
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_squared_error

# tuning
from sklearn.model_selection import GridSearchCV

# validation
from sklearn.model_selection import KFold, train_test_split

# model
from xgboost import XGBRegressor 

In [3]:
# Load data
df = pd.read_csv("../input/30-days-of-ml/train.csv", index_col=0)
df

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634
2,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
3,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
4,B,B,A,C,B,D,A,E,C,K,...,0.668980,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253
6,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499993,B,B,A,A,B,D,A,E,A,I,...,0.769792,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605
499996,A,B,A,C,B,B,A,E,E,F,...,0.528056,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118
499997,B,B,A,C,B,C,A,E,G,F,...,0.688747,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755
499998,A,B,A,C,B,B,A,E,E,I,...,0.344404,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569


In [4]:
# work on a random sample to speed-up process
# df_sample = df.sample(1000, random_state=123)
# X = df_sample.drop(['target'], axis=1)
# y = df_sample['target']

X = df.drop(['target'], axis=1)
y = df['target']
print(X.shape)
print(y.shape)

(300000, 24)
(300000,)


In [5]:
# Select numerical features
numFeatures = X.select_dtypes(exclude=['object']).columns
# Select Categorical features
catFeatures = X.select_dtypes(include=['object']).columns
print(numFeatures)
print(catFeatures)

Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')
Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9'],
      dtype='object')


In [6]:
# Pipeline for transforming categorical variables
catTransformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for scaling numerical variables
numTransformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('power', PowerTransformer()),
])

# Create the preprocessing engine
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures),
    ]
)

## Find PCA's optimal component number

60 found to be optimal (with 5-fold cross-validation, using hyperparameters from previous round).

## Fine-tune

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
val_preprocessor = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA(random_state=123)),
    ]
)

model = XGBRegressor(
    tree_method='gpu_hist',
    objective='reg:squarederror',
    random_state=123
)

param_grid = {
    'model__n_estimators': [500, 1000, 1500],
    'model__max_depth': [2, 3, 4],
    'model__learning_rate': [0.1],    
    'model__subsample': [0.3],
    'model__colsample_bytree': [0.6],
    'model__booster': ['gbtree', 'gblinear', 'dart']
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA(n_components=60, random_state=123)),
        ('model', model)
    ]
)

search_xgbr = GridSearchCV(
    estimator=estimator, 
    param_grid=param_grid,
    scoring=scorer,
    cv=kf,
    verbose=2
)
search_xgbr.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3, total=  10.3s
[CV] model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s remaining:    0.0s


[CV]  model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3, total=   8.1s
[CV] model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3 
[CV]  model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3, total=   8.5s
[CV] model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3 
[CV]  model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3, total=   8.0s
[CV] model__booster=gbtree, model__colsample_bytree=0.6, model__learning_rate=0.1, model__max_depth=2, model__n_estimators=500, model__subsample=0.3 
[CV]  model__booster=gbtree, model__colsample_bytree=0.

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 422.9min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('scaler',
                                                                                          MinMaxScaler()),
                                                                                         ('power',
                                                                                          PowerTransformer())]),
                                                                         Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dt...
                                                     tree_method='gpu_hist',
                                                     validate_par

In [8]:
print("Best params")
print(search_xgbr.best_params_)
print("\nScore")
print(f'{search_xgbr.best_score_:.4f}')

Best params
{'model__booster': 'gbtree', 'model__colsample_bytree': 0.6, 'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__n_estimators': 500, 'model__subsample': 0.3}

Score
-0.5469


In [9]:
# load test data
X_test = pd.read_csv("../input/30-days-of-ml/test.csv", index_col=0)

# predict
y_pred = search_xgbr.best_estimator_.predict(X_test)
# save the predictions to a CSV file
output = pd.DataFrame({'Id': X_test.index,
                       'target': y_pred})
output.to_csv('submission.csv', index=False)