## Import

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Plot configuration: use the 'Malgun Gothic' font family and turn off the
# Unicode minus glyph for axis labels.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

from tqdm.auto import tqdm
from pycaret.regression import *

## Load

In [2]:
# Read the preprocessed train/test tables from the working directory.
train_df, test_df = map(
    pd.read_csv,
    ('./train_preprocessed.csv', './test_preprocessed.csv'),
)

## Without SMILES

우선 SMILES 데이터 없이 pycaret으로 학습해 보자.

In [5]:
# Work on copies of both frames with the 'FPs' column removed.
train, test = (frame.drop(columns='FPs') for frame in (train_df, test_df))

train

Unnamed: 0,id,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [6]:
# Display the test frame obtained by dropping 'FPs' from test_df.
test

Unnamed: 0,id,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,4.276,347.414,4,4,5,4.290,92.86
3,TEST_003,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,1.219,353.418,4,0,2,0.169,61.15
...,...,...,...,...,...,...,...,...
478,TEST_478,4.207,306.443,2,1,7,4.207,55.13
479,TEST_479,-0.608,335.398,5,0,1,-1.736,70.16
480,TEST_480,1.792,349.383,3,1,3,1.792,69.72
481,TEST_481,0.790,341.132,3,2,2,0.423,69.64


## Pycaret

### MLM

In [8]:
# MLM experiment: HLM (the other target) and the row id are excluded from the
# features, the three best models by RMSE are tuned, and the tuned models are blended.
MLM_setup = setup(
    data=train.drop(columns=['HLM', 'id']),
    target='MLM',
    session_id=0,
)
MLM_models = compare_models(sort='RMSE', n_select=3)
MLM_tuned_models = [
    tune_model(candidate, optimize="RMSE")
    for candidate in MLM_models
]
MLM_blended_model = blend_models(MLM_tuned_models, optimize="RMSE")

Unnamed: 0,Description,Value
0,Session id,0
1,Target,MLM
2,Target type,Regression
3,Original data shape,"(3498, 8)"
4,Transformed data shape,"(3498, 8)"
5,Transformed train set shape,"(2448, 8)"
6,Transformed test set shape,"(1050, 8)"
7,Numeric features,7
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,26.5446,1016.6533,31.8637,0.1918,1.6442,73.8416,0.209
br,Bayesian Ridge,27.2671,1035.7185,32.173,0.1761,1.6771,77.2597,0.139
ridge,Ridge Regression,27.2287,1035.7933,32.174,0.176,1.675,77.0732,0.145
lr,Linear Regression,27.2281,1035.8019,32.1741,0.176,1.675,77.0703,0.548
lasso,Lasso Regression,27.4776,1038.3422,32.2139,0.174,1.6881,78.6511,0.148
llar,Lasso Least Angle Regression,27.4776,1038.342,32.2139,0.174,1.6881,78.6509,0.136
en,Elastic Net,27.8007,1048.0846,32.3656,0.1663,1.6993,80.8391,0.148
rf,Random Forest Regressor,27.0515,1062.7147,32.5885,0.1549,1.6478,74.6264,0.351
huber,Huber Regressor,26.4508,1065.9876,32.6343,0.152,1.6059,65.9541,0.151
ada,AdaBoost Regressor,29.193,1078.8358,32.8264,0.1433,1.7911,90.2509,0.153


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.5317,1087.2591,32.9736,0.1698,1.6527,107.0983
1,26.7543,988.7838,31.4449,0.2076,1.7123,69.8116
2,25.7292,921.3906,30.3544,0.2802,1.5695,40.347
3,27.896,1077.241,32.8214,0.1162,1.7603,84.5836
4,26.7841,1004.8637,31.6996,0.2307,1.6615,76.4657
5,26.1149,989.1688,31.4511,0.1641,1.6463,52.3429
6,27.2877,1103.3756,33.2171,0.216,1.6484,84.8329
7,25.7218,931.2161,30.5158,0.2604,1.57,64.9159
8,25.7746,938.9967,30.6431,0.199,1.6171,59.7337
9,26.7632,1019.3451,31.9272,0.1577,1.7228,77.1099


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.3798,1064.2545,32.6229,0.1874,1.639,104.2685
1,27.6662,1062.2419,32.5921,0.1487,1.7495,77.4868
2,26.548,936.0677,30.5952,0.2687,1.5938,40.0906
3,27.6646,1053.8624,32.4632,0.1354,1.7588,83.282
4,27.0292,997.582,31.5845,0.2362,1.6796,78.9405
5,26.439,993.4615,31.5192,0.1604,1.6659,50.0732
6,28.3216,1123.1692,33.5137,0.2019,1.6652,92.1066
7,27.0597,1019.9011,31.9359,0.1899,1.6322,69.2772
8,26.9746,1022.2867,31.9732,0.1279,1.6494,78.8974
9,27.5885,1084.3571,32.9296,0.1039,1.7376,98.1759


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.3001,1060.5865,32.5666,0.1902,1.6372,104.6657
1,27.6533,1064.6868,32.6295,0.1467,1.7477,77.1761
2,26.5513,937.542,30.6193,0.2676,1.5927,39.8439
3,27.6273,1053.1053,32.4516,0.136,1.7574,83.1274
4,26.9774,997.1182,31.5772,0.2366,1.6766,78.8554
5,26.3757,992.0314,31.4965,0.1616,1.6616,49.8538
6,28.2989,1124.0275,33.5265,0.2013,1.664,91.4984
7,27.0236,1018.332,31.9113,0.1912,1.6312,68.8143
8,26.9183,1022.5024,31.9766,0.1278,1.6471,79.797
9,27.5874,1087.5958,32.9787,0.1013,1.7364,97.2503


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.3536,1057.7107,32.5225,0.1924,1.6407,105.3431
1,27.2712,1025.5756,32.0246,0.1781,1.7309,74.8215
2,26.1905,916.5938,30.2753,0.2839,1.5841,40.0915
3,27.5642,1044.6877,32.3216,0.1429,1.7559,83.6599
4,26.8101,986.8373,31.414,0.2445,1.668,78.0842
5,26.215,978.6414,31.2832,0.1729,1.6547,50.7397
6,27.8271,1103.5149,33.2192,0.2159,1.6548,89.4723
7,26.4263,975.3535,31.2306,0.2253,1.6177,67.6652
8,26.3621,976.1488,31.2434,0.1673,1.6408,72.798
9,27.1863,1046.1264,32.3439,0.1355,1.7332,90.8263


### HLM

In [10]:
# HLM experiment: MLM (the other target) and the row id are excluded from the
# features, the three best models by RMSE are tuned, and the tuned models are blended.
HLM_setup = setup(
    data=train.drop(columns=['MLM', 'id']),
    target='HLM',
    session_id=0,
)
HLM_models = compare_models(sort='RMSE', n_select=3)
HLM_tuned_models = [
    tune_model(candidate, optimize="RMSE")
    for candidate in HLM_models
]
HLM_blended_model = blend_models(HLM_tuned_models, optimize="RMSE")

Unnamed: 0,Description,Value
0,Session id,0
1,Target,HLM
2,Target type,Regression
3,Original data shape,"(3498, 8)"
4,Transformed data shape,"(3498, 8)"
5,Transformed train set shape,"(2448, 8)"
6,Transformed test set shape,"(1050, 8)"
7,Numeric features,7
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,27.9035,1075.3094,32.7704,0.1686,1.3545,41.3193,0.261
br,Bayesian Ridge,28.2873,1085.0279,32.9203,0.1604,1.3701,43.6282,0.195
lr,Linear Regression,28.2588,1085.0578,32.9208,0.1604,1.3686,43.4617,0.585
ridge,Ridge Regression,28.2592,1085.054,32.9208,0.1604,1.3686,43.4638,0.216
lasso,Lasso Regression,28.4875,1089.0902,32.9824,0.1573,1.3775,44.1714,0.213
llar,Lasso Least Angle Regression,28.4876,1089.0913,32.9824,0.1573,1.3775,44.1717,0.196
ada,AdaBoost Regressor,28.8894,1093.7942,33.0593,0.154,1.3864,43.4716,0.21
en,Elastic Net,28.7136,1095.936,33.0861,0.1521,1.3835,45.2021,0.197
huber,Huber Regressor,28.2466,1105.5817,33.2301,0.1444,1.3586,41.7268,0.2
lar,Least Angle Regression,28.2194,1107.431,33.2626,0.143,1.3603,42.3274,0.218


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.8391,1122.5712,33.5048,0.1769,1.3943,31.835
1,27.5561,1081.4352,32.8852,0.1542,1.4982,37.1778
2,27.4039,997.5852,31.5846,0.223,1.3152,31.7656
3,29.0076,1106.0208,33.2569,0.1808,1.4256,63.5346
4,26.4986,963.2341,31.036,0.2412,1.2918,22.5885
5,26.3464,933.6853,30.5563,0.1824,1.3095,12.4946
6,28.5685,1109.2112,33.3048,0.1684,1.3481,32.6088
7,27.6831,1055.3572,32.4863,0.1777,1.2478,67.5605
8,28.6854,1115.4704,33.3987,0.1205,1.353,54.128
9,29.2751,1176.3044,34.2973,0.1314,1.4442,69.644


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.4182,1082.0992,32.8953,0.2066,1.3764,31.7387
1,27.724,1070.9949,32.7261,0.1624,1.4977,38.6794
2,27.4113,986.0969,31.4022,0.232,1.3128,28.765
3,29.0732,1119.6936,33.4618,0.1707,1.4239,58.5411
4,26.89,999.3683,31.6128,0.2128,1.3028,23.3166
5,26.8466,981.5172,31.3292,0.1406,1.332,13.329
6,28.9644,1112.8625,33.3596,0.1657,1.3564,34.5399
7,28.6648,1110.1678,33.3192,0.135,1.2689,69.9052
8,28.9487,1168.2888,34.1802,0.0789,1.3577,59.5929
9,29.9324,1219.1885,34.9169,0.0997,1.4722,77.8748


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.3362,1078.6569,32.8429,0.2091,1.3741,31.6289
1,27.6994,1072.2018,32.7445,0.1614,1.4969,38.5045
2,27.38,987.6843,31.4274,0.2308,1.3119,28.5642
3,29.0413,1119.0553,33.4523,0.1712,1.4219,58.3818
4,26.8737,1000.5091,31.6308,0.2119,1.3018,23.2553
5,26.8244,980.6828,31.3159,0.1413,1.3299,13.2717
6,28.9314,1111.1639,33.3341,0.167,1.3547,34.3605
7,28.6769,1112.2311,33.3501,0.1334,1.2674,69.405
8,28.9181,1168.0513,34.1768,0.0791,1.3562,60.283
9,29.9069,1220.3419,34.9334,0.0989,1.471,76.9622


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.4887,1084.8929,32.9377,0.2045,1.3811,31.7336
1,27.4588,1062.6349,32.5981,0.1689,1.4963,38.1159
2,27.2872,980.1526,31.3074,0.2366,1.3127,29.696
3,28.9536,1103.6275,33.2209,0.1826,1.4238,60.1512
4,26.6856,978.5776,31.2822,0.2291,1.2972,23.0522
5,26.6162,956.8877,30.9336,0.1621,1.3219,13.0308
6,28.6773,1102.4321,33.2029,0.1735,1.3514,33.8328
7,28.1681,1078.7945,32.845,0.1595,1.256,68.9536
8,28.7425,1134.8885,33.6881,0.1052,1.357,57.999
9,29.6569,1196.6706,34.5929,0.1164,1.46,74.826


## Predict

In [12]:
# Finalize and predict through the experiment each blend was built in.
# The module-level finalize_model()/predict_model() operate on the most recently
# created experiment, which at this point is the HLM one, so calling them on the
# MLM blend would refit it against the HLM target.
# NOTE: relies on setup() returning the experiment object (PyCaret 3.x behaviour).
Fin_MLM_model = MLM_setup.finalize_model(MLM_blended_model)
Fin_HLM_model = HLM_setup.finalize_model(HLM_blended_model)

# errors='ignore' keeps this cell re-runnable once 'id' has already been dropped.
test = test.drop(columns='id', errors='ignore')

MLM_preds = MLM_setup.predict_model(Fin_MLM_model, data=test)
HLM_preds = HLM_setup.predict_model(Fin_HLM_model, data=test)

## Submission

In [14]:
# Fill the sample submission with the predicted labels and write it out.
submission = pd.read_csv('./sample_submission.csv').assign(
    MLM=MLM_preds['prediction_label'],
    HLM=HLM_preds['prediction_label'],
)
submission.to_csv('./submission.csv', index=False)

![image.png](attachment:image.png)

## With SMILES

datamol과 molfeat을 사용하여 SMILES에서 추출한 features를 추가한다.

In [16]:
!pip install datamol --user

Collecting datamol
  Downloading datamol-0.11.3-py3-none-any.whl (381 kB)
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
Collecting importlib-resources
  Using cached importlib_resources-6.0.1-py3-none-any.whl (34 kB)
Collecting selfies
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Collecting win32-setctime>=1.0.0
  Downloading win32_setctime-1.1.0-py3-none-any.whl (3.6 kB)
Installing collected packages: win32-setctime, selfies, loguru, importlib-resources, datamol
Successfully installed datamol-0.11.3 importlib-resources-6.0.1 loguru-0.7.2 selfies-2.1.1 win32-setctime-1.1.0




In [18]:
!pip install molfeat --user

Collecting molfeat
  Downloading molfeat-0.9.3-py3-none-any.whl (163 kB)
Collecting s3fs>=2021.9
  Downloading s3fs-2023.9.1-py3-none-any.whl (28 kB)
Collecting mordredcommunity
  Downloading mordredcommunity-2.0.3-py3-none-any.whl (175 kB)
Collecting pmapper
  Downloading pmapper-1.0.3-py3-none-any.whl (566 kB)
Collecting gcsfs>=2021.9
  Downloading gcsfs-2023.9.1-py2.py3-none-any.whl (33 kB)
Collecting pydantic
  Downloading pydantic-2.3.0-py3-none-any.whl (374 kB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-2.10.0-py2.py3-none-any.whl (114 kB)
Collecting fsspec>=2021.9
  Downloading fsspec-2023.9.1-py3-none-any.whl (173 kB)
Collecting aiobotocore~=2.5.4
  Downloading aiobotocore-2.5.4-py3-none-any.whl (73 kB)
Collecting botocore<1.31.18,>=1.31.17
  Downloading botocore-1.31.17-py3-none-any.whl (11.1 MB)
Collecting aioitertools<1.0.0,>=0.5.1
  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-gpu 2.6.0 requires numpy~=1.19.2, but you have numpy 1.24.3 which is incompatible.
tensorflow-gpu 2.6.0 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow-gpu 2.6.0 requires typing-extensions~=3.7.4, but you have typing-extensions 4.7.1 which is incompatible.
tensorboard 2.6.0 requires google-auth<2,>=1.6.3, but you have google-auth 2.23.0 which is incompatible.


In [29]:
import datamol as dm

from rdkit.Chem import SaltRemover
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.concat import FeatConcat
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [26]:
# Reload the raw competition files and discard the 'id' column from each.
train = pd.read_csv('./train.csv').drop(columns='id')
test = pd.read_csv('./test.csv').drop(columns='id')

In [30]:
# Tabular descriptor columns: imputed by fill_na and appended to the
# fingerprint matrix by extract_features.
ETC_COLUMNS = [
    "AlogP",
    "Molecular_Weight",
    "Num_H_Acceptors",
    "Num_H_Donors",
    "Num_RotatableBonds",
    "LogD",
    "Molecular_PolarSurfaceArea",
]

# Featurizer kinds; extract_features builds one FPVecTransformer per entry.
AVAILABLE_FPS = [
    'maccs', 'avalon', 'ecfp', 'fcfp', 'topological', 'atompair',
    'rdkit', 'pattern', 'layered', 'secfp', 'erg', 'estate',
    'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count',
    'topological-count', 'atompair-count',
    'cats2D', 'pharm2D', 'scaffoldkeys', 'skeys',
]

# Not referenced in the visible cells; presumably a run/version tag (TODO confirm).
MODEL = "v9_2"
# Passed as the imputer's random_state and as the PyCaret session_id.
SEED = 0

# Turn off RDKit log output.
dm.disable_rdkit_log()

def preprocess_mol(row):
    """Add a standardized SMILES string to one data row.

    The value in ``row["SMILES"]`` is parsed with datamol, passed through
    ``fix_mol``, ``sanitize_mol`` and ``standardize_mol``, and the resulting
    SMILES is stored under ``row["Standard_Smiles"]``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``"SMILES"`` entry.

    Returns:
        The same ``row`` object with ``"Standard_Smiles"`` set.
    """
    standardize_options = dict(
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    molecule = dm.to_mol(row["SMILES"], ordered=True)
    molecule = dm.sanitize_mol(dm.fix_mol(molecule), sanifix=True, charge_neutral=False)
    molecule = dm.standardize_mol(molecule, **standardize_options)

    # Salt stripping (rdkit SaltRemover) is not applied here.
    row["Standard_Smiles"] = dm.to_smiles(molecule)
    return row

def fill_na(df, imputer=None):
    """Impute the ETC_COLUMNS of ``df`` without modifying the caller's frame.

    Args:
        df: DataFrame containing every column listed in ETC_COLUMNS.
        imputer: An already fitted imputer to reuse. When ``None``, a new
            IterativeImputer (RandomForestRegressor estimator,
            ``random_state=SEED``) is fitted on ``df``.

    Returns:
        ``(filled_df, imputer)`` when ``imputer`` was ``None`` (fit mode),
        otherwise just ``filled_df`` (transform mode). ``filled_df`` is a new
        DataFrame; ``df`` itself is left untouched.
    """
    # Work on a copy so the input frame keeps its original (possibly missing) values.
    filled = df.copy()
    values = filled[ETC_COLUMNS].to_numpy()

    if imputer is None:
        imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1), random_state=SEED)
        filled[ETC_COLUMNS] = imputer.fit_transform(values)
        return filled, imputer

    filled[ETC_COLUMNS] = imputer.transform(values)
    return filled

def extract_features(df):
    """Build the numeric feature matrix for a frame of molecules.

    Each row is first passed through ``preprocess_mol`` to obtain a
    ``"Standard_Smiles"`` value. One FPVecTransformer per entry of
    AVAILABLE_FPS is combined with FeatConcat and applied to those SMILES;
    the ETC_COLUMNS values are appended as the last columns.

    Args:
        df: DataFrame with a ``"SMILES"`` column and the ETC_COLUMNS.

    Returns:
        A new DataFrame (default integer index and column labels) holding the
        concatenated featurizer output followed by the ETC_COLUMNS values.
    """
    standardized = df.apply(preprocess_mol, axis=1)

    transformers = [
        FPVecTransformer(kind, dtype=np.float64, n_jobs=-1)
        for kind in AVAILABLE_FPS
    ]
    concat_featurizer = FeatConcat(transformers, dtype=np.float64)

    fingerprint_matrix = concat_featurizer(standardized["Standard_Smiles"].to_list())
    descriptor_matrix = standardized[ETC_COLUMNS].to_numpy()

    combined = np.concatenate([fingerprint_matrix, descriptor_matrix], axis=1)
    return pd.DataFrame(combined)

# Impute the descriptor columns (keeping the fitted imputer), featurize the
# molecules, then attach both regression targets to the feature frame.
imputed_train, imputer = fill_na(train)
df = extract_features(imputed_train)
df["MLM"] = train["MLM"]
df["HLM"] = train["HLM"]

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32833,32834,32835,32836,32837,32838,32839,32840,MLM,HLM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.259,400.495,5.0,2.0,8.0,3.259,117.37,26.010,50.680
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.169,301.407,2.0,1.0,2.0,2.172,73.47,29.270,50.590
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.593,297.358,5.0,0.0,3.0,1.585,62.45,5.586,80.892
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.771,494.652,6.0,0.0,5.0,3.475,92.60,5.710,2.000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.335,268.310,3.0,0.0,1.0,2.337,42.43,93.270,99.990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.409,396.195,3.0,1.0,5.0,3.409,64.74,1.556,3.079
3494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.912,359.381,4.0,1.0,3.0,1.844,77.37,35.560,47.630
3495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.941,261.320,3.0,1.0,6.0,2.124,70.14,56.150,1.790
3496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.989,284.696,5.0,1.0,5.0,0.989,91.51,0.030,2.770


In [38]:
def train(df, target, train_size=0.8, include=("rf", "gbr", "lightgbm", "xgboost", "lr"), use_gpu=True):
    """Run a PyCaret regression experiment and return a blend of the tuned top-2 models.

    NOTE: this function is named ``train``, the same name the notebook uses for
    the DataFrame loaded from ./train.csv; after this cell runs, ``train``
    refers to this function and cells that expect the DataFrame can no longer
    be re-run without reloading it.

    Args:
        df: Frame passed to ``setup`` as ``data``; must contain ``target``.
        target: Name of the target column.
        train_size: Train split fraction passed to ``setup``.
        include: Model IDs that ``compare_models`` is restricted to.
        use_gpu: Passed to ``setup``.

    Returns:
        The object returned by ``blend_models`` for the two tuned models.
        It is not finalized here.
    """
    # setup() registers the experiment that the calls below operate on;
    # its return value is not needed inside this function.
    setup(
        data=df,
        target=target,
        train_size=train_size,
        session_id=SEED,
        transformation=False,
        normalize=False,
        use_gpu=use_gpu,
    )

    top_models = compare_models(sort="RMSE", include=list(include), n_select=2)
    tuned_models = [tune_model(model, optimize="RMSE") for model in top_models]
    return blend_models(tuned_models, optimize="RMSE")

# One experiment per target; the other target column is removed from the features.
MLM_model = train(df.drop(columns='HLM'), 'MLM')
HLM_model = train(df.drop(columns='MLM'), 'HLM')

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Description,Value
0,Session id,42
1,Target,MLM
2,Target type,Regression
3,Original data shape,"(3498, 32842)"
4,Transformed data shape,"(3498, 32842)"
5,Transformed train set shape,"(2798, 32842)"
6,Transformed test set shape,"(700, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,25.4562,937.5639,30.5953,0.2502,1.5954,61.6411,68.074
gbr,Gradient Boosting Regressor,25.5871,945.1339,30.7135,0.2443,1.6129,63.4071,140.009
lightgbm,Light Gradient Boosting Machine,24.9653,950.3817,30.7778,0.2404,1.5663,59.4412,16.221
xgboost,Extreme Gradient Boosting,25.7295,1040.3489,32.2155,0.1683,1.578,57.4089,39.563
lr,Linear Regression,4033.8853,38485194.8,5588.6319,-30979.7445,5.2677,9645.0277,2.308


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.4755,972.9546,31.1922,0.2497,1.4978,49.0605
1,28.5831,1160.7972,34.0705,0.1263,1.5961,37.9466
2,27.0444,994.7491,31.5396,0.2028,1.603,84.9601
3,25.2741,881.8028,29.6952,0.2717,1.6957,52.2069
4,26.7427,1023.8295,31.9973,0.1739,1.6398,80.6144
5,25.2233,889.7386,29.8285,0.2392,1.6513,47.5788
6,25.2056,913.4596,30.2235,0.2998,1.6644,116.3592
7,25.9672,910.6565,30.1771,0.2737,1.6912,63.7316
8,25.3248,896.921,29.9486,0.2907,1.591,53.5914
9,27.1028,1031.9303,32.1237,0.1366,1.7913,88.2538


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.2386,994.0771,31.529,0.2334,1.4686,34.7681
1,27.9663,1107.6752,33.2818,0.1663,1.5562,39.9902
2,25.4186,920.7368,30.3436,0.2621,1.5223,70.6787
3,25.0238,867.3012,29.45,0.2836,1.6681,51.5097
4,26.2127,1013.5346,31.8361,0.1822,1.6149,87.2511
5,24.3102,868.9675,29.4783,0.257,1.6219,42.8206
6,24.8823,918.5949,30.3083,0.2959,1.6055,108.7013
7,25.4632,891.6648,29.8608,0.2889,1.6652,36.0294
8,24.3504,857.5288,29.2836,0.3218,1.5306,52.3661
9,27.12,1034.9431,32.1705,0.134,1.759,85.1562


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,25.4894,925.1034,30.4155,0.2866,1.4608,40.1507
1,27.4419,1092.743,33.0567,0.1775,1.553,33.9115
2,25.6843,938.037,30.6274,0.2483,1.5555,73.8826
3,24.6687,847.8809,29.1184,0.2997,1.6752,48.9813
4,25.9669,1005.0582,31.7027,0.189,1.6135,86.0336
5,24.1249,851.3329,29.1776,0.2721,1.6174,43.9619
6,24.0023,862.1439,29.3623,0.3391,1.6074,109.3335
7,24.9569,851.1779,29.175,0.3212,1.6534,50.5643
8,24.836,879.5064,29.6565,0.3044,1.5577,49.7561
9,26.4879,1003.459,31.6774,0.1604,1.7511,88.6209


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Description,Value
0,Session id,42
1,Target,HLM
2,Target type,Regression
3,Original data shape,"(3498, 32842)"
4,Transformed data shape,"(3498, 32842)"
5,Transformed train set shape,"(2798, 32842)"
6,Transformed test set shape,"(700, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,27.1294,1015.4808,31.8431,0.2186,1.3296,32.9131,135.862
rf,Random Forest Regressor,27.1673,1018.7258,31.8946,0.2158,1.3211,33.8369,228.711
lightgbm,Light Gradient Boosting Machine,26.767,1035.3942,32.1494,0.2034,1.3024,32.5031,14.866
xgboost,Extreme Gradient Boosting,27.6866,1135.0909,33.6622,0.1259,1.3138,33.2122,34.938
lr,Linear Regression,2757.93,21624492.0438,3771.4643,-16410.5312,4.1095,2059.4645,10.926


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.1637,1056.0559,32.497,0.2273,1.2072,9.4555
1,29.8385,1211.5947,34.808,0.0569,1.3216,10.9955
2,27.8907,1052.2551,32.4385,0.1955,1.3758,67.9217
3,26.5844,985.6559,31.3952,0.2063,1.3123,38.5956
4,28.3509,1108.3782,33.2923,0.1857,1.4249,39.8399
5,25.4657,908.2931,30.1379,0.2454,1.3669,26.0114
6,27.7936,1063.4222,32.6102,0.1653,1.2804,35.1117
7,27.5744,1060.0667,32.5587,0.2488,1.4173,13.3952
8,24.918,848.6648,29.1319,0.3226,1.1647,18.1553
9,28.3343,1086.3991,32.9606,0.1583,1.4284,72.3529


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.9264,1030.4789,32.1011,0.246,1.2087,9.6525
1,28.597,1115.6505,33.4014,0.1316,1.3093,11.2811
2,29.0936,1112.9591,33.361,0.1491,1.4013,72.6106
3,26.1433,966.0168,31.0808,0.2221,1.3256,38.7469
4,28.3911,1093.5099,33.0683,0.1966,1.4476,38.8819
5,26.2839,924.2486,30.4015,0.2321,1.3859,24.0419
6,27.3398,1021.326,31.9582,0.1984,1.283,35.9009
7,28.3515,1073.9097,32.7706,0.239,1.4391,14.3275
8,25.7586,900.0721,30.0012,0.2816,1.2265,19.0835
9,28.0237,1052.3403,32.4398,0.1847,1.4443,80.67


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.5354,1014.5257,31.8516,0.2577,1.1941,9.1273
1,28.1049,1093.2973,33.065,0.149,1.2964,10.8249
2,28.5435,1083.6131,32.9183,0.1715,1.3865,72.7286
3,25.864,943.863,30.7224,0.24,1.3013,34.0328
4,27.8592,1082.8201,32.9062,0.2045,1.4255,39.8875
5,25.86,908.7514,30.1455,0.245,1.3648,24.2795
6,26.6977,990.3822,31.4703,0.2227,1.2591,37.5266
7,27.1411,1009.8461,31.7781,0.2844,1.4166,13.14
8,25.0774,858.3379,29.2974,0.3149,1.1924,18.6838
9,27.4396,1029.8982,32.092,0.2021,1.4091,73.492


In [39]:
# Refit each selected model with PyCaret's finalize_model before predicting.
# NOTE(review): finalize_model() operates on the currently active PyCaret
# experiment; the most recent setup summary shown in the output above has
# Target = HLM. Confirm that MLM_model is not being refit against the HLM
# target here (finalizing MLM right after its own setup would avoid that).
Fin_MLM_model = finalize_model(MLM_model)
Fin_HLM_model = finalize_model(HLM_model)

# Run the test frame through fill_na and extract_features
# (presumably the same preprocessing applied to the training data — TODO confirm).
df_test = fill_na(test, imputer)
df_test = extract_features(df_test)

MLM_preds = predict_model(Fin_MLM_model, data=df_test)
HLM_preds = predict_model(Fin_HLM_model, data=df_test)

submission = pd.read_csv('./sample_submission.csv')

# Fail loudly on a row-count mismatch instead of writing a partially filled file.
if not (len(submission) == len(MLM_preds) == len(HLM_preds)):
    raise ValueError(
        f"Row count mismatch: submission={len(submission)}, "
        f"MLM_preds={len(MLM_preds)}, HLM_preds={len(HLM_preds)}"
    )

# Assign by position rather than by index label: assigning a Series aligns on
# the index, so any index difference between the prediction frames and the
# freshly read submission would silently produce NaN or misordered rows.
submission['MLM'] = MLM_preds['prediction_label'].to_numpy()
submission['HLM'] = HLM_preds['prediction_label'].to_numpy()
submission.to_csv('./submission2.csv', index=False)

![image.png](attachment:image.png)