In [1]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [9]:
ROWID = ['f_0']
DATE = ['f_1']
CATEGORIES = [ f'f_{i}' for i in range(2,33) ]
BINARY = [ f'f_{i}' for i in range(33,42) ]
NUMERICAL = [ f'f_{i}' for i in range(42,80) ]
IS_CLICKED = ['is_clicked']
IS_INSTALLED =['is_installed']

In [17]:
N_THREADS = 24
N_FOLDS = 3
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 30000
TARGET_NAME = 'is_clicked'

In [3]:
DATASET_DIR = '../Data'
DATASET_NAME = 'miss_combine.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)

In [4]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [5]:
data = pd.read_csv('../Data/miss_combine.csv')
data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
0,2541275,49,30131,7152,16170,25604,25613,27941,21218,869,...,3.426729,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,1,1
1,2541956,49,20095,563,22861,25604,22651,27941,19203,869,...,0.571121,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
2,2542133,55,20095,563,22861,25604,21280,27941,21621,23218,...,0.0,0.0,0.0,1.156922,0.038564,0.0,0.0,0.0,0,1
3,2542375,57,17758,22294,29040,25604,15836,27941,21218,21533,...,5.711215,2.284486,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
4,2542442,47,11077,7152,18575,15908,16861,27941,21218,9638,...,1.142243,0.0,0.038564,1.156922,0.269948,0.0,37.384575,0.0,0,0


## Imputation

In [7]:
from tqdm import tqdm

data['f_30'].fillna(data['f_30'].mode()[0],inplace=True)
data['f_31'].fillna(data['f_31'].mode()[0],inplace=True)
fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
for f in tqdm(fmiss,desc="NUM IMPUTE"):
    data[f].fillna(data[f].mean(),inplace=True)

NUM IMPUTE: 100%|█████████████████████████████████████| 11/11 [00:00<00:00, 40.52it/s]


In [11]:
tr_data, te_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

Data splitted. Parts sizes: tr_data = (2788681, 82), te_data = (697171, 82)


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
93993,3201672,49,12577,22294,2059,21545,4018,27941,18800,869,...,5.711215,3.99785,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
976084,1079823,61,14378,22294,11449,25604,14544,27941,21218,9638,...,1.142243,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,1,0
796623,3224317,48,20095,563,31686,21545,590,27941,18800,23218,...,1.142243,1.142243,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,1
1635437,3434437,59,17769,22294,3192,19072,3715,27941,19606,6675,...,1.142243,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,1,1
2015105,667997,47,30131,22294,32544,21545,8148,27941,19606,9638,...,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0


In [18]:
task = Task('binary')

roles = {
    'target': TARGET_NAME,
    'drop': ['f_0','is_installed']
}

In [19]:
automl = TabularAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
)

In [20]:
%%time
oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 1)

[22:48:48] Stdout logging level is INFO.
[22:48:48] Task: binary

[22:48:48] Start automl preset with listed constraints:
[22:48:48] - time: 30000.00 seconds
[22:48:48] - CPU: 24 cores
[22:48:48] - memory: 16 GB

[22:48:48] [1mTrain data shape: (2788681, 82)[0m

[22:49:01] Layer [1m1[0m train process start. Time left 29987.46 secs
[22:51:41] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[22:53:52] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8390470114588647[0m
[22:53:52] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[22:53:52] Time left 29696.41 secs

[22:55:37] [1mSelector_LightGBM[0m fitting and predicting completed
[22:59:12] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[23:04:26] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8703582974179124[0m
[23:04:26] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[23:04:26] Start hyperparameters optimization for [1mLvl_0_Pipe_1_


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[23:21:14] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8666007322957252[0m
[23:21:14] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[23:21:14] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[23:27:01] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[23:27:01] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[23:30:38] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8684865328261325[0m
[23:30:38] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[23:30:38] Time left 27489.98 secs

[23:30:38] [1mLayer 1 training completed.[0m

[23:30:39] Blending: optimization starts with equal weights and score [1m0.8687050580540939[0m
[23:31:13] Blending: iteration [1m0[0m: score = [1m0.8711547960071095[0m, weights = [1m[0.         0.31884772 0.5385006  0.         0.14265174][0m
[23:31:46] Blending: iteration [1m1[0m: score = [1m0.8711591959434202[0m, weights = [1m[0.         0.36634642 0.4950008  0.         0.13865277][0m
[23:32:19] Blending: iteration [1m2[0m: score = [1m0.8711591980573493[0m, weights = [1m[0.        0.3650221 0.4955125 0.        0.1394654][0m
[23:32:51] Blending: iteration [1m3[0m: score = [1m0.8711591978256757[0m, weights = [1m[0.         0.36400992 0.4963024  0.         0.13968773][0m
[23:33:23] Blending:

In [21]:
%%time

te_pred = automl.predict(te_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

Prediction for te_data:
array([[0.0317101 ],
       [0.0729503 ],
       [0.08340527],
       ...,
       [0.02408476],
       [0.84660566],
       [0.06643729]], dtype=float32)
Shape = (697171, 1)
CPU times: user 8min 18s, sys: 3.02 s, total: 8min 21s
Wall time: 46.1 s


In [22]:
test = pd.read_csv('../Data/test/000000000000.csv',sep='\t')
test['f_30'].fillna(test['f_30'].mode()[0],inplace=True)
test['f_31'].fillna(test['f_31'].mode()[0],inplace=True)
for f in fmiss:
    test[f].fillna(test[f].mean(),inplace=True)

In [24]:
feat = ['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9',
 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18',
 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_27',
 'f_28', 'f_29', 'f_30', 'f_31', 'f_32', 'f_33', 'f_34', 'f_35', 'f_36',
 'f_37', 'f_38', 'f_39', 'f_40', 'f_41', 'f_42', 'f_43', 'f_44', 'f_45',
 'f_46', 'f_47', 'f_48', 'f_49', 'f_50', 'f_51', 'f_52', 'f_53', 'f_54',
 'f_55', 'f_56', 'f_57', 'f_58', 'f_59', 'f_60', 'f_61', 'f_62', 'f_63',
 'f_64', 'f_65', 'f_66', 'f_67', 'f_68', 'f_69', 'f_70', 'f_71', 'f_72',
 'f_73', 'f_74', 'f_75', 'f_76', 'f_77', 'f_78', 'f_79']

In [25]:
test_pred_click = automl.predict(test[feat])

In [26]:
test_pred_click.shape

(160973, 1)

In [27]:
test_pred_click

array([[0.49451855],
       [0.13921647],
       [0.31095907],
       ...,
       [0.2254554 ],
       [0.47794285],
       [0.99840707]], dtype=float32)

In [30]:
N_THREADS = 24
N_FOLDS = 3
RANDOM_STATE = 42
TEST_SIZE = 0.1
TIMEOUT = 30000
TARGET_NAME = 'is_installed'

In [31]:
tr_data, te_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

Data splitted. Parts sizes: tr_data = (3137266, 82), te_data = (348586, 82)


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
3485143,746725,55,27426,22294,24261,25604,3973,27941,19606,23218,...,1.142243,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
349435,131359,60,20095,563,22861,19475,21280,27941,18800,14659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1622325,88112,65,20095,563,22861,15908,21280,27941,18800,31372,...,1.142243,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,1
1871046,3449110,46,3346,22294,6767,25604,1159,27941,19203,14659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2020768,404023,65,27426,22294,32095,19475,16299,27941,21621,31372,...,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,0,0


In [32]:
task = Task('binary')

roles = {
    'target': TARGET_NAME,
    'drop': ['f_0','is_clicked']
}

In [33]:
automl = TabularAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
)

In [34]:
%%time
oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 1)

[23:52:58] Stdout logging level is INFO.
[23:52:58] Task: binary

[23:52:58] Start automl preset with listed constraints:
[23:52:58] - time: 30000.00 seconds
[23:52:58] - CPU: 24 cores
[23:52:58] - memory: 16 GB

[23:52:58] [1mTrain data shape: (3137266, 82)[0m

[23:53:17] Layer [1m1[0m train process start. Time left 29980.80 secs
[23:56:16] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[23:59:18] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8456481992823612[0m
[23:59:18] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[23:59:18] Time left 29620.25 secs

[00:01:44] [1mSelector_LightGBM[0m fitting and predicting completed
[00:05:56] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[00:14:49] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8821181574375545[0m
[00:14:49] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[00:14:49] Start hyperparameters optimization for [1mLvl_0_Pipe_1_

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[00:36:25] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.8768268250578473[0m
[00:36:25] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[00:36:25] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[00:41:59] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[00:41:59] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[00:45:52] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.8782814941443563[0m
[00:45:52] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[00:45:52] Time left 26826.18 secs

[00:45:52] [1mLayer 1 training completed.[0m

[00:45:53] Blending: optimization starts with equal weights and score [1m0.8797322000920119[0m
[00:46:31] Blending: iteration [1m0[0m: score = [1m0.8831237705233866[0m, weights = [1m[0.         0.34372616 0.6562738  0.         0.        ][0m
[00:47:09] Blending: iteration [1m1[0m: score = [1m0.8831334070770801[0m, weights = [1m[0.         0.40243042 0.5975696  0.         0.        ][0m
[00:47:47] Blending: iteration [1m2[0m: score = [1m0.8831334070770801[0m, weights = [1m[0.         0.40243042 0.5975696  0.         0.        ][0m
[00:47:47] Blending: no score update. Terminated

[00:47:47] [1mAutoml preset training completed in 3288.59 seconds[0m

[00:47:47] Model description:
Final prediction

In [35]:
test_install_pred = automl.predict(test[feat])

In [44]:
test_install_pred.shape,type(test_install_pred)

((160973, 1), lightautoml.dataset.np_pd_dataset.NumpyDataset)

In [45]:
test_install_pred = test_install_pred.to_numpy()
test_pred_click = test_pred_click.to_numpy()

In [50]:
test_install_pred.shape,type(test_install_pred.to_numpy)

((160973, 1), method)

In [61]:
inst = test_install_pred.data

In [63]:
inst.shape

(160973, 1)

In [58]:
np.array(inst)

AssertionError: Numpy dataset support only np.ndarray features

In [40]:
test['f_0'].values

array([64505, 64506, 64507, ..., 16242, 16243, 16244])

In [64]:
import numpy as np
result = np.vstack([test['f_0'].to_numpy(dtype=int),test_pred_click.data.reshape(-1),test_install_pred.data.reshape(-1)])

In [65]:
result.shape

(3, 160973)

In [66]:
result = result.T

In [67]:
result.shape

(160973, 3)

In [69]:
final = pd.DataFrame(result,columns=['RowId','is_clicked','is_installed'])

In [70]:
final.head()

Unnamed: 0,RowId,is_clicked,is_installed
0,64505.0,0.494519,0.399739
1,64506.0,0.139216,0.303988
2,64507.0,0.310959,0.058482
3,64508.0,0.292092,0.309709
4,64509.0,0.406814,0.102375


In [71]:
final.to_csv('../Data/LightautoML_results.csv', sep ='\t', index=False)