In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, log_loss
import optuna


class Train_Split:
    
    ROWID = ['f_0']
    DATE = ['f_1']
    CATEGORIES = [ f'f_{i}' for i in range(2,33) ]
    BINARY = [ f'f_{i}' for i in range(33,42) ]
    NUMERICAL = [ f'f_{i}' for i in range(42,80) ]
    IS_CLICKED = ['is_clicked']
    IS_INSTALLED =['is_installed']

    def __init__(self, val_type='random', class_type='binary',split_date=66,impute=True):

        print("Loading the data")
        self.data = pd.read_csv('../Data/miss_combine.csv')
        self.impute = impute
        self.val_type = val_type
        self.class_type = class_type
        self.split_date = split_date
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.impute_data()
        self.train_test_split()

    def impute_data(self):
        if self.impute:
            self.data['f_30'].fillna(self.data['f_30'].mode()[0],inplace=True)
            self.data['f_31'].fillna(self.data['f_31'].mode()[0],inplace=True)
            fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
            for f in tqdm(fmiss,desc="NUM IMPUTE"):
                self.data[f].fillna(self.data[f].mean(),inplace=True)

    def get_split(self):
        return self.X_train, self.X_test, self.y_train, self.y_test

    def train_test_split(self):
        print(f"Spliting the Data based on {self.val_type}")
        if self.val_type == 'random':
            self.random_split()
        elif self.val_type == 'time':
            self.time_split()
        elif self.val_type == 'No':
            self.final_split()
        else:
            raise Exception('Invalid validation type')
    
    # def get_label(self,data):
    #     '''
    #     data: Numpy array
    #     '''
    #     if self.class_type == 'binary':
    #         return data
    #     elif self.class_type == 'multi':
    #         labels = []
    #         for a, b in zip(data[:,0], data[:,1]):
    #             if a==0 and b==0:# None
    #                 labels.append(0)
    #             elif a==1 and b==0:# Clicked
    #                 labels.append(1)
    #             elif a==0 and b==1:# Installed
    #                 labels.append(2)
    #             elif a==1 and b==1:# Clicked and Installed
    #                 labels.append(3)
    #         return np.array(labels)

    def random_split(self):
        """
        Randomly split the data into train and test set
        """
        y = self.data[Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        X = self.data.drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    def time_split(self):
        """
        Split the data into train and test set based on Date
        """
        self.X_train = self.data[self.data[Train_Split.DATE[0]] < self.split_date ].drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.X_test = self.data[self.data[Train_Split.DATE[0]] >= self.split_date ].drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.y_train = self.data[self.data[Train_Split.DATE[0]] < self.split_date ][Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        self.y_test = self.data[self.data[Train_Split.DATE[0]] >= self.split_date ][Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        print(f"X_train:{self.X_train.shape}, X_test:{self.X_test.shape} , y_train:{self.y_train.shape} , y_test:{self.y_test.shape}")

    def final_split(self):
        self.X_train = self.data.drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1).values
        self.y_train = self.get_label(self.data[Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED].values)
        self.X_test = None
        self.y_test = None

In [2]:
train = Train_Split(val_type='time',class_type='binary',split_date=66,impute=True)
X_train, X_test, y_train, y_test = train.get_split()

Loading the data


NUM IMPUTE: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 43.21it/s]


Spliting the Data based on time
X_train:(3387880, 80), X_test:(97972, 80) , y_train:(3387880, 2) , y_test:(97972, 2)


In [83]:
X_1 = X_train[X_train['f_1']>55]
y_1 = y_train[X_train['f_1']>55]

In [85]:
X_1.shape,y_1.shape

((1543247, 80), (1543247, 2))

In [86]:
import gc
gc.collect()

505

In [87]:
use_features = Train_Split.CATEGORIES  + Train_Split.BINARY + Train_Split.NUMERICAL
#['f_24','f_2', 'f_68', 'f_59', 'f_19', 'f_40', 'f_4', 'f_22', 'f_50', 'f_37', 'f_51', 'f_70', 'f_6', 'f_20', 'f_66', 'f_69','f_1']


In [88]:
model_for = 'install'

In [93]:
def objective(trail):
    params = {
        'max_depth':trail.suggest_int('max_depth',3,8),
        'learning_rate':trail.suggest_float('learning_rate',0.05,0.3),
        'n_estimators':trail.suggest_int('n_estimators',100,650),
        # 'gamma':trail.suggest_loguniform('gamma',0.01,1),
        # 'colsample_bytree':trail.suggest_loguniform('colsample_bytree',0.7,1),
        'tree_method':'gpu_hist',
        'objective':'binary:logistic'
    }
    target = Train_Split.IS_CLICKED[0]
    if model_for == 'install':
        target = Train_Split.IS_INSTALLED[0]
    print(target)
    model = XGBClassifier(**params)
    # print("Training the model")
    model.fit(X_1[use_features],y_1[target])
    # print("Training Done")
    # print(model.feature_names_in_)
    # print(model.feature_importances_)
    y_pred = model.predict(X_test[use_features])
    score = log_loss(y_test[target],y_pred)
    # print(f"F1 Score: {f1}")
    # print(gc.collect())
    return score

In [94]:
model_for = 'install'
print(f"The MODEL is For :{model_for}")
study_install = optuna.create_study(direction='minimize',study_name='install_log_loss')
study_install.optimize(objective, n_trials=50)

[32m[I 2023-05-21 02:04:13,033][0m A new study created in memory with name: install_log_loss[0m


The MODEL is For :install
is_installed


[32m[I 2023-05-21 02:04:23,286][0m Trial 0 finished with value: 6.26214017922681 and parameters: {'max_depth': 5, 'learning_rate': 0.12604060245644888, 'n_estimators': 486}. Best is trial 0 with value: 6.26214017922681.[0m


is_installed


[32m[I 2023-05-21 02:04:32,066][0m Trial 1 finished with value: 6.221954070301745 and parameters: {'max_depth': 6, 'learning_rate': 0.10355639080242214, 'n_estimators': 342}. Best is trial 1 with value: 6.221954070301745.[0m


is_installed


[32m[I 2023-05-21 02:04:38,701][0m Trial 2 finished with value: 6.142641912554637 and parameters: {'max_depth': 7, 'learning_rate': 0.2555500890986713, 'n_estimators': 186}. Best is trial 2 with value: 6.142641912554637.[0m


is_installed


[32m[I 2023-05-21 02:04:48,248][0m Trial 3 finished with value: 6.269903612708728 and parameters: {'max_depth': 5, 'learning_rate': 0.1232437898443851, 'n_estimators': 457}. Best is trial 2 with value: 6.142641912554637.[0m


is_installed


[32m[I 2023-05-21 02:04:56,267][0m Trial 4 finished with value: 6.2579002163053365 and parameters: {'max_depth': 3, 'learning_rate': 0.08409050917510306, 'n_estimators': 474}. Best is trial 2 with value: 6.142641912554637.[0m


is_installed


[32m[I 2023-05-21 02:05:08,657][0m Trial 5 finished with value: 6.078826563006255 and parameters: {'max_depth': 8, 'learning_rate': 0.14327126075451213, 'n_estimators': 424}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:05:14,489][0m Trial 6 finished with value: 6.411273363450029 and parameters: {'max_depth': 4, 'learning_rate': 0.2704164591559542, 'n_estimators': 225}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:05:22,835][0m Trial 7 finished with value: 6.375674278305857 and parameters: {'max_depth': 7, 'learning_rate': 0.29711042307953883, 'n_estimators': 261}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:05:37,896][0m Trial 8 finished with value: 6.385901946555611 and parameters: {'max_depth': 7, 'learning_rate': 0.11218627503188201, 'n_estimators': 627}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:05:47,628][0m Trial 9 finished with value: 6.469092021870403 and parameters: {'max_depth': 5, 'learning_rate': 0.24803818004209283, 'n_estimators': 459}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:06:05,371][0m Trial 10 finished with value: 6.122182405534095 and parameters: {'max_depth': 8, 'learning_rate': 0.05022219319434426, 'n_estimators': 612}. Best is trial 5 with value: 6.078826563006255.[0m


is_installed


[32m[I 2023-05-21 02:06:23,665][0m Trial 11 finished with value: 6.077063460676576 and parameters: {'max_depth': 8, 'learning_rate': 0.055201173980057465, 'n_estimators': 650}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:06:29,269][0m Trial 12 finished with value: 6.206447256117131 and parameters: {'max_depth': 8, 'learning_rate': 0.16831146752298073, 'n_estimators': 105}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:06:45,856][0m Trial 13 finished with value: 6.171898616841584 and parameters: {'max_depth': 8, 'learning_rate': 0.05495977979478239, 'n_estimators': 565}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:06:54,859][0m Trial 14 finished with value: 6.436303440689028 and parameters: {'max_depth': 6, 'learning_rate': 0.16942626024325477, 'n_estimators': 350}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:07:10,006][0m Trial 15 finished with value: 6.288954170593218 and parameters: {'max_depth': 8, 'learning_rate': 0.20901824478961625, 'n_estimators': 552}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:07:18,889][0m Trial 16 finished with value: 6.380607971194686 and parameters: {'max_depth': 7, 'learning_rate': 0.1478692903752141, 'n_estimators': 300}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:07:28,904][0m Trial 17 finished with value: 6.211376786646413 and parameters: {'max_depth': 6, 'learning_rate': 0.07590659420875265, 'n_estimators': 394}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:07:44,578][0m Trial 18 finished with value: 6.3005881191720245 and parameters: {'max_depth': 8, 'learning_rate': 0.20221743059899544, 'n_estimators': 544}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:07:59,804][0m Trial 19 finished with value: 6.348881204836538 and parameters: {'max_depth': 7, 'learning_rate': 0.1434058889154375, 'n_estimators': 648}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:08:07,300][0m Trial 20 finished with value: 6.228283915111396 and parameters: {'max_depth': 3, 'learning_rate': 0.09401422928278289, 'n_estimators': 405}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:08:24,782][0m Trial 21 finished with value: 6.135583896292761 and parameters: {'max_depth': 8, 'learning_rate': 0.0548825499812397, 'n_estimators': 602}. Best is trial 11 with value: 6.077063460676576.[0m


is_installed


[32m[I 2023-05-21 02:08:41,516][0m Trial 22 finished with value: 6.074946841749433 and parameters: {'max_depth': 8, 'learning_rate': 0.07168758532319397, 'n_estimators': 592}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:08:54,997][0m Trial 23 finished with value: 6.2709613467873035 and parameters: {'max_depth': 7, 'learning_rate': 0.07444907345783042, 'n_estimators': 520}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:09:11,248][0m Trial 24 finished with value: 6.213147577099021 and parameters: {'max_depth': 8, 'learning_rate': 0.09738295517139547, 'n_estimators': 588}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:09:24,561][0m Trial 25 finished with value: 6.273428854312351 and parameters: {'max_depth': 7, 'learning_rate': 0.07120921070118662, 'n_estimators': 509}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:09:42,201][0m Trial 26 finished with value: 6.102448156314094 and parameters: {'max_depth': 8, 'learning_rate': 0.08954295479639861, 'n_estimators': 643}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:09:52,689][0m Trial 27 finished with value: 6.305506476622447 and parameters: {'max_depth': 6, 'learning_rate': 0.11555941619302589, 'n_estimators': 434}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:10:03,238][0m Trial 28 finished with value: 6.206080664813805 and parameters: {'max_depth': 4, 'learning_rate': 0.07468875745736338, 'n_estimators': 577}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:10:18,188][0m Trial 29 finished with value: 6.230775654098192 and parameters: {'max_depth': 8, 'learning_rate': 0.13380145247881564, 'n_estimators': 531}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:10:31,092][0m Trial 30 finished with value: 6.2099776068130605 and parameters: {'max_depth': 7, 'learning_rate': 0.11118990524610242, 'n_estimators': 507}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:10:48,762][0m Trial 31 finished with value: 6.165910903351814 and parameters: {'max_depth': 8, 'learning_rate': 0.08796769475518135, 'n_estimators': 645}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:11:05,431][0m Trial 32 finished with value: 6.150750807244073 and parameters: {'max_depth': 8, 'learning_rate': 0.09750888729232449, 'n_estimators': 599}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:11:23,519][0m Trial 33 finished with value: 6.134884845034379 and parameters: {'max_depth': 8, 'learning_rate': 0.0648972157707115, 'n_estimators': 644}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:11:37,886][0m Trial 34 finished with value: 6.286131008160509 and parameters: {'max_depth': 7, 'learning_rate': 0.10529475248363981, 'n_estimators': 578}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:11:55,138][0m Trial 35 finished with value: 6.081650337550663 and parameters: {'max_depth': 8, 'learning_rate': 0.08339819711481322, 'n_estimators': 622}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:12:05,223][0m Trial 36 finished with value: 6.204680856545772 and parameters: {'max_depth': 7, 'learning_rate': 0.059921580692561496, 'n_estimators': 331}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:12:14,653][0m Trial 37 finished with value: 6.208209207676821 and parameters: {'max_depth': 4, 'learning_rate': 0.12584843703854695, 'n_estimators': 489}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:12:21,071][0m Trial 38 finished with value: 6.343575460608036 and parameters: {'max_depth': 6, 'learning_rate': 0.08112557026326543, 'n_estimators': 181}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:12:36,318][0m Trial 39 finished with value: 6.231829136040838 and parameters: {'max_depth': 7, 'learning_rate': 0.06550221034908366, 'n_estimators': 611}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:12:46,634][0m Trial 40 finished with value: 6.257203376810556 and parameters: {'max_depth': 5, 'learning_rate': 0.10399017830520296, 'n_estimators': 479}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:13:03,735][0m Trial 41 finished with value: 6.144758049953912 and parameters: {'max_depth': 8, 'learning_rate': 0.08975945890214311, 'n_estimators': 620}. Best is trial 22 with value: 6.074946841749433.[0m


is_installed


[32m[I 2023-05-21 02:13:21,229][0m Trial 42 finished with value: 6.065784603862669 and parameters: {'max_depth': 8, 'learning_rate': 0.0855917644049105, 'n_estimators': 629}. Best is trial 42 with value: 6.065784603862669.[0m


is_installed


[32m[I 2023-05-21 02:13:38,022][0m Trial 43 finished with value: 6.033698844547735 and parameters: {'max_depth': 8, 'learning_rate': 0.05129104038630489, 'n_estimators': 558}. Best is trial 43 with value: 6.033698844547735.[0m


is_installed


[32m[I 2023-05-21 02:13:54,352][0m Trial 44 finished with value: 6.037928997359063 and parameters: {'max_depth': 8, 'learning_rate': 0.06276070047783043, 'n_estimators': 555}. Best is trial 43 with value: 6.033698844547735.[0m


is_installed


[32m[I 2023-05-21 02:14:11,271][0m Trial 45 finished with value: 6.04603798182488 and parameters: {'max_depth': 8, 'learning_rate': 0.05326058267617299, 'n_estimators': 563}. Best is trial 43 with value: 6.033698844547735.[0m


is_installed


[32m[I 2023-05-21 02:14:25,999][0m Trial 46 finished with value: 6.237116565887342 and parameters: {'max_depth': 7, 'learning_rate': 0.05170757587779564, 'n_estimators': 555}. Best is trial 43 with value: 6.033698844547735.[0m


is_installed


[32m[I 2023-05-21 02:14:41,949][0m Trial 47 finished with value: 6.116545270505635 and parameters: {'max_depth': 8, 'learning_rate': 0.0630518214418549, 'n_estimators': 541}. Best is trial 43 with value: 6.033698844547735.[0m


is_installed


[32m[I 2023-05-21 02:14:56,486][0m Trial 48 finished with value: 6.009722330866276 and parameters: {'max_depth': 8, 'learning_rate': 0.050657372906177604, 'n_estimators': 452}. Best is trial 48 with value: 6.009722330866276.[0m


is_installed


[32m[I 2023-05-21 02:15:10,597][0m Trial 49 finished with value: 6.010431543178843 and parameters: {'max_depth': 8, 'learning_rate': 0.06333207093595607, 'n_estimators': 450}. Best is trial 48 with value: 6.009722330866276.[0m


In [91]:
study_install.best_params

{'max_depth': 8, 'learning_rate': 0.07243965921786173, 'n_estimators': 396}

In [30]:
imp = "0.17838217 0.00943216 0.01728662 0.00554243 0.01393594 0. 0.00235423 0.0077137  0.00542087 0.00716849 0.00452244 0.00160236 0.00750723 0.00121989 0.00470682 0.00147401 0.0022138  0.0241547 0.01138127 0.00751544 0.01613538 0.00981167 0.2898561  0. 0. 0. 0. 0. 0.00346196 0.00097365 0.0010202 0.00064199 0.00651024 0.00695342 0.00122922 0.01494968 0.00134197 0.00089351 0.02034765 0.00220879 0.0081035  0.00887755 0.00097518 0.00109279 0.00096075 0.00162583 0.00512414 0.00503206 0.01502846 0.01483065 0.00102621 0.00079396 0.00116319 0.00185047 0.00402559 0.0090543  0.00873815 0.06055596 0.00610948 0.00142578 0.00089418 0.00159682 0.00822201 0.00551445 0.01136009 0.0050891 0.06939875 0.01096893 0.01427525 0.00089395 0.00434032 0.00372613 0.00209797 0.00791273 0.00101883 0.00032105 0.00482761 0.00127782"
out = re.sub('\s+',' ',imp)

In [32]:
imp = np.array([float(x) for x in out.split()])

In [33]:
imp

array([0.17838217, 0.00943216, 0.01728662, 0.00554243, 0.01393594,
       0.        , 0.00235423, 0.0077137 , 0.00542087, 0.00716849,
       0.00452244, 0.00160236, 0.00750723, 0.00121989, 0.00470682,
       0.00147401, 0.0022138 , 0.0241547 , 0.01138127, 0.00751544,
       0.01613538, 0.00981167, 0.2898561 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00346196, 0.00097365,
       0.0010202 , 0.00064199, 0.00651024, 0.00695342, 0.00122922,
       0.01494968, 0.00134197, 0.00089351, 0.02034765, 0.00220879,
       0.0081035 , 0.00887755, 0.00097518, 0.00109279, 0.00096075,
       0.00162583, 0.00512414, 0.00503206, 0.01502846, 0.01483065,
       0.00102621, 0.00079396, 0.00116319, 0.00185047, 0.00402559,
       0.0090543 , 0.00873815, 0.06055596, 0.00610948, 0.00142578,
       0.00089418, 0.00159682, 0.00822201, 0.00551445, 0.01136009,
       0.0050891 , 0.06939875, 0.01096893, 0.01427525, 0.00089395,
       0.00434032, 0.00372613, 0.00209797, 0.00791273, 0.00101

In [34]:
feat = [f'f_{i}' for i in range(2,80)]

In [36]:
feat_imp = []
for x,y in zip(feat,imp):
    feat_imp.append([x,y])

In [37]:
feat_imp.sort(key=lambda x:x[1],reverse=True)

In [42]:
use_feat = []
feat_imp

[['f_24', 0.2898561],
 ['f_2', 0.17838217],
 ['f_68', 0.06939875],
 ['f_59', 0.06055596],
 ['f_19', 0.0241547],
 ['f_40', 0.02034765],
 ['f_4', 0.01728662],
 ['f_22', 0.01613538],
 ['f_50', 0.01502846],
 ['f_37', 0.01494968],
 ['f_51', 0.01483065],
 ['f_70', 0.01427525],
 ['f_6', 0.01393594],
 ['f_20', 0.01138127],
 ['f_66', 0.01136009],
 ['f_69', 0.01096893],
 ['f_23', 0.00981167],
 ['f_3', 0.00943216],
 ['f_57', 0.0090543],
 ['f_43', 0.00887755],
 ['f_58', 0.00873815],
 ['f_64', 0.00822201],
 ['f_42', 0.0081035],
 ['f_75', 0.00791273],
 ['f_9', 0.0077137],
 ['f_21', 0.00751544],
 ['f_14', 0.00750723],
 ['f_11', 0.00716849],
 ['f_35', 0.00695342],
 ['f_34', 0.00651024],
 ['f_60', 0.00610948],
 ['f_5', 0.00554243],
 ['f_65', 0.00551445],
 ['f_10', 0.00542087],
 ['f_48', 0.00512414],
 ['f_67', 0.0050891],
 ['f_49', 0.00503206],
 ['f_78', 0.00482761],
 ['f_16', 0.00470682],
 ['f_12', 0.00452244],
 ['f_72', 0.00434032],
 ['f_56', 0.00402559],
 ['f_73', 0.00372613],
 ['f_30', 0.00346196],


In [45]:
sel_feat = [a[0] for a in feat_imp if a[1]>0.01]

In [46]:
sel_feat

['f_24',
 'f_2',
 'f_68',
 'f_59',
 'f_19',
 'f_40',
 'f_4',
 'f_22',
 'f_50',
 'f_37',
 'f_51',
 'f_70',
 'f_6',
 'f_20',
 'f_66',
 'f_69']

In [28]:
out

["'f_2'",
 "'f_3'",
 "'f_4'",
 "'f_5'",
 "'f_6'",
 "'f_7'",
 "'f_8'",
 "'f_9'",
 "'f_10'",
 "'f_11'",
 "'f_12'",
 "'f_13'",
 "'f_14'",
 "'f_15'",
 "'f_16'",
 "'f_17'",
 "'f_18'",
 "'f_19'",
 "'f_20'",
 "'f_21'",
 "'f_22'",
 "'f_23'",
 "'f_24'",
 "'f_25'",
 "'f_26'",
 "'f_27'",
 "'f_28'",
 "'f_29'",
 "'f_30'",
 "'f_31'",
 "'f_32'",
 "'f_33'",
 "'f_34'",
 "'f_35'",
 "'f_36'",
 "'f_37'",
 "'f_38'",
 "'f_39'",
 "'f_40'",
 "'f_41'",
 "'f_42'",
 "'f_43'",
 "'f_44'",
 "'f_45'",
 "'f_46'",
 "'f_47'",
 "'f_48'",
 "'f_49'",
 "'f_50'",
 "'f_51'",
 "'f_52'",
 "'f_53'",
 "'f_54'",
 "'f_55'",
 "'f_56'",
 "'f_57'",
 "'f_58'",
 "'f_59'",
 "'f_60'",
 "'f_61'",
 "'f_62'",
 "'f_63'",
 "'f_64'",
 "'f_65'",
 "'f_66'",
 "'f_67'",
 "'f_68'",
 "'f_69'",
 "'f_70'",
 "'f_71'",
 "'f_72'",
 "'f_73'",
 "'f_74'",
 "'f_75'",
 "'f_76'",
 "'f_77'",
 "'f_78'",
 "'f_79'"]

In [29]:
import re
s = "0.21497796 0.01153717 0.02135733 0.00629702 0.01811617 0. 0.00287854 0.00986653 0.00634902 0.00833008 0.00577091 0.00193968 0.00845783 0.00142394 0.00575012 0.00178409 0.00252029 0.03162423 0.01419182 0.0109225  0.01585338 0.01828695 0.14107488 0. 0. 0. 0. 0. 0.0042021 0.00139228 0.00099792 0.00072804 0.00859207 0.00801966 0.00114426 0.01613013 0.00143669 0.00126554 0.02456514 0.00217038 0.00993669 0.00967928 0.00120302 0.00108286 0.00124789 0.00214677 0.00535804 0.00665624 0.01935415 0.01779572 0.00145307 0.00087535 0.00132587 0.00218426 0.00468974 0.01121161 0.01066885 0.07310767 0.00645644 0.00164837 0.00104059 0.0018528  0.00998066 0.00652136 0.01387253 0.00636589 0.0793817  0.0146549  0.01643551 0.00099639 0.00538003 0.00438112 0.00196106 0.00995385 0.00109404 0. 0.00632514 0.00176593"
help(re.sub)

Help on function sub in module re:

sub(pattern, repl, string, count=0, flags=0)
    Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the Match object and must return
    a replacement string to be used.



In [19]:
out[0].split()

['1.9438429e-01',
 '1.6285684e-02',
 '2.5130723e-02',
 '6.5109739e-03',
 '2.3423744e-02',
 '0.0000000e+00',
 '2.3214186e-03',
 '1.1480905e-02',
 '5.3082770e-03',
 '1.1682530e-02',
 '6.9747460e-03',
 '1.9848947e-03',
 '1.3027168e-02',
 '1.4560192e-03',
 '4.5677680e-03',
 '2.2417831e-03',
 '4.3391953e-03',
 '2.0857105e-02',
 '1.3643338e-02',
 '1.4366889e-02',
 '2.3184188e-02',
 '3.0816399e-02',
 '9.7623985e-04',
 '0.0000000e+00',
 '0.0000000e+00',
 '0.0000000e+00',
 '0.0000000e+00',
 '0.0000000e+00',
 '2.8751746e-03',
 '0.0000000e+00',
 '1.6775664e-03',
 '1.1767709e-04',
 '1.0563515e-02',
 '1.1739828e-02',
 '4.0201025e-04',
 '1.8141903e-02',
 '5.7640596e-04',
 '1.0956990e-03',
 '2.5143741e-02',
 '1.9552805e-03',
 '1.4696764e-02',
 '1.4745467e-02',
 '1.1228217e-02',
 '5.7176698e-04',
 '3.0037931e-03',
 '4.1578928e-04',
 '3.9065001e-03',
 '6.0447785e-03',
 '8.6019915e-03',
 '2.0537341e-02',
 '3.4810763e-02',
 '1.6497660e-03',
 '3.2469802e-04',
 '1.3316880e-03',
 '3.9778287e-03',
 '8.402911

In [8]:
use_features

['f_2',
 'f_3',
 'f_4',
 'f_5',
 'f_6',
 'f_7',
 'f_8',
 'f_9',
 'f_10',
 'f_11',
 'f_12',
 'f_13',
 'f_14',
 'f_15',
 'f_16',
 'f_17',
 'f_18',
 'f_19',
 'f_20',
 'f_21',
 'f_22',
 'f_23',
 'f_24',
 'f_25',
 'f_26',
 'f_27',
 'f_28',
 'f_29',
 'f_30',
 'f_31',
 'f_32',
 'f_33',
 'f_34',
 'f_35',
 'f_36',
 'f_37',
 'f_38',
 'f_39',
 'f_40',
 'f_41']

In [81]:
def objective_click(trail):
    params = {
        'max_depth':trail.suggest_int('max_depth',3,8),
        'learning_rate':trail.suggest_float('learning_rate',0.02,0.3),
        'n_estimators':trail.suggest_int('n_estimators',100,1000),
        # 'gamma':trail.suggest_loguniform('gamma',0.01,1),
        # 'colsample_bytree':trail.suggest_loguniform('colsample_bytree',0.7,1),
        'tree_method':'gpu_hist',
        # 'alpha': trail.suggest_float('alpha',0.1,10),
        'objective':'binary:logistic'
    }
    target = Train_Split.IS_CLICKED[0]
    if model_for == 'install':
        target = Train_Split.IS_INSTALLED[0]
    print(target)
    model = XGBClassifier(**params)
    model.fit(X_train[use_features],y_train[target])
    y_pred = model.predict(X_test[use_features])
    score = log_loss(y_test[target],y_pred)
    return score

In [21]:
best_till_now = {'max_depth': 4, 'learning_rate': 0.08751617649545007, 'n_estimators': 549}

In [22]:
model_for = 'click'
study_click = optuna.create_study(direction='minimize',study_name=model_for)
study_click.optimize(objective_click, n_trials=30)

[32m[I 2023-05-20 19:50:21,911][0m A new study created in memory with name: click[0m


is_clicked


[32m[I 2023-05-20 19:50:36,009][0m Trial 0 finished with value: 9.248499371972907 and parameters: {'max_depth': 7, 'learning_rate': 0.08927040039177586, 'n_estimators': 502, 'alpha': 6.267046892953481}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:50:41,002][0m Trial 1 finished with value: 9.754369844503136 and parameters: {'max_depth': 3, 'learning_rate': 0.0893329752430829, 'n_estimators': 237, 'alpha': 4.738167132530829}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:50:54,940][0m Trial 2 finished with value: 9.49351958649039 and parameters: {'max_depth': 3, 'learning_rate': 0.13873625099855783, 'n_estimators': 902, 'alpha': 2.7818675624926223}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:51:05,323][0m Trial 3 finished with value: 9.320420922919254 and parameters: {'max_depth': 6, 'learning_rate': 0.13557493508044577, 'n_estimators': 407, 'alpha': 1.3946565150043084}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:51:14,860][0m Trial 4 finished with value: 9.462828159084165 and parameters: {'max_depth': 3, 'learning_rate': 0.07281143990400513, 'n_estimators': 572, 'alpha': 5.807544394345558}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:51:24,220][0m Trial 5 finished with value: 9.330288822870736 and parameters: {'max_depth': 6, 'learning_rate': 0.21027758767234792, 'n_estimators': 358, 'alpha': 3.357160160786229}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:51:49,817][0m Trial 6 finished with value: 9.397282402132259 and parameters: {'max_depth': 8, 'learning_rate': 0.2131245780537924, 'n_estimators': 861, 'alpha': 5.8015764585297855}. Best is trial 0 with value: 9.248499371972907.[0m


is_clicked


[32m[I 2023-05-20 19:52:02,326][0m Trial 7 finished with value: 9.230521091940933 and parameters: {'max_depth': 7, 'learning_rate': 0.17695514226413217, 'n_estimators': 435, 'alpha': 8.81591202288747}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:52:15,838][0m Trial 8 finished with value: 9.278467598273895 and parameters: {'max_depth': 5, 'learning_rate': 0.26087941596360414, 'n_estimators': 663, 'alpha': 0.8189691066080612}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:52:31,525][0m Trial 9 finished with value: 9.295037990163424 and parameters: {'max_depth': 6, 'learning_rate': 0.16243939548160383, 'n_estimators': 662, 'alpha': 5.584500757468629}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:52:38,189][0m Trial 10 finished with value: 9.328178382190998 and parameters: {'max_depth': 8, 'learning_rate': 0.27208403001394743, 'n_estimators': 162, 'alpha': 9.473122511133917}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:52:51,656][0m Trial 11 finished with value: 9.24425931927505 and parameters: {'max_depth': 7, 'learning_rate': 0.03144281820139695, 'n_estimators': 429, 'alpha': 8.375457313968043}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:02,487][0m Trial 12 finished with value: 9.319347967663123 and parameters: {'max_depth': 7, 'learning_rate': 0.03566164016611359, 'n_estimators': 327, 'alpha': 9.342497607834302}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:17,412][0m Trial 13 finished with value: 9.248138918109536 and parameters: {'max_depth': 7, 'learning_rate': 0.027652444660004383, 'n_estimators': 485, 'alpha': 8.027671037831631}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:33,911][0m Trial 14 finished with value: 9.452954619543569 and parameters: {'max_depth': 5, 'learning_rate': 0.02240421358446873, 'n_estimators': 727, 'alpha': 7.90180723164347}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:39,196][0m Trial 15 finished with value: 9.236151795715818 and parameters: {'max_depth': 8, 'learning_rate': 0.17554590512542023, 'n_estimators': 107, 'alpha': 9.89484730810572}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:44,991][0m Trial 16 finished with value: 9.457921252203233 and parameters: {'max_depth': 8, 'learning_rate': 0.1894060862600023, 'n_estimators': 125, 'alpha': 9.794405166267465}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:53:54,094][0m Trial 17 finished with value: 9.35004185983912 and parameters: {'max_depth': 8, 'learning_rate': 0.23581568470512554, 'n_estimators': 253, 'alpha': 7.0736961027955125}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:54:12,217][0m Trial 18 finished with value: 9.260489383533836 and parameters: {'max_depth': 4, 'learning_rate': 0.2871477119118311, 'n_estimators': 994, 'alpha': 9.848235826639572}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:54:20,509][0m Trial 19 finished with value: 9.355327314605208 and parameters: {'max_depth': 7, 'learning_rate': 0.1718323874514926, 'n_estimators': 251, 'alpha': 8.768265859782922}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:54:25,898][0m Trial 20 finished with value: 9.27422778225923 and parameters: {'max_depth': 8, 'learning_rate': 0.12791932239059522, 'n_estimators': 111, 'alpha': 7.078702265269721}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:54:38,327][0m Trial 21 finished with value: 9.380710296329974 and parameters: {'max_depth': 7, 'learning_rate': 0.18032009629679302, 'n_estimators': 423, 'alpha': 8.387495659767191}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:54:55,662][0m Trial 22 finished with value: 9.319021808377812 and parameters: {'max_depth': 7, 'learning_rate': 0.14924260871155415, 'n_estimators': 624, 'alpha': 8.608256651136369}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:55:04,996][0m Trial 23 finished with value: 9.293273231051415 and parameters: {'max_depth': 6, 'learning_rate': 0.11822517451258291, 'n_estimators': 331, 'alpha': 9.944615346436024}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:55:20,670][0m Trial 24 finished with value: 9.303506375660493 and parameters: {'max_depth': 8, 'learning_rate': 0.19234511520715927, 'n_estimators': 480, 'alpha': 8.94256919463412}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:55:28,157][0m Trial 25 finished with value: 9.32571391890146 and parameters: {'max_depth': 7, 'learning_rate': 0.16549194880834517, 'n_estimators': 213, 'alpha': 7.526979734665835}. Best is trial 7 with value: 9.230521091940933.[0m


is_clicked


[32m[I 2023-05-20 19:55:44,820][0m Trial 26 finished with value: 9.18222057932115 and parameters: {'max_depth': 5, 'learning_rate': 0.15074878324525845, 'n_estimators': 779, 'alpha': 8.906377941906895}. Best is trial 26 with value: 9.18222057932115.[0m


is_clicked


[32m[I 2023-05-20 19:56:00,850][0m Trial 27 finished with value: 9.16176902975268 and parameters: {'max_depth': 4, 'learning_rate': 0.15388003689584331, 'n_estimators': 840, 'alpha': 9.449337038741552}. Best is trial 27 with value: 9.16176902975268.[0m


is_clicked


[32m[I 2023-05-20 19:56:16,274][0m Trial 28 finished with value: 9.354970500766068 and parameters: {'max_depth': 4, 'learning_rate': 0.1526583543881208, 'n_estimators': 800, 'alpha': 8.921848422062654}. Best is trial 27 with value: 9.16176902975268.[0m


is_clicked


[32m[I 2023-05-20 19:56:31,227][0m Trial 29 finished with value: 9.307727044821249 and parameters: {'max_depth': 4, 'learning_rate': 0.10489409554436074, 'n_estimators': 769, 'alpha': 6.766526109004945}. Best is trial 27 with value: 9.16176902975268.[0m


In [61]:
X = pd.concat([X_train,X_test])

In [62]:
y = pd.concat([y_train,y_test])

In [65]:
len(use_features)

78

In [92]:
params =  {'max_depth': 4, 'learning_rate': 0.08751617649545007, 'n_estimators': 549}
params['tree_method']= 'gpu_hist'
params['objective'] = 'binary:logistic'
clk_model = XGBClassifier(**params)
clk_model.fit(X[use_features],y[Train_Split.IS_CLICKED])

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.08751617649545007,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=549, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [None]:
X_1

In [66]:
params = {'max_depth': 8, 'learning_rate': 0.07243965921786173, 'n_estimators': 396}
params['tree_method']= 'gpu_hist'
params['objective'] = 'binary:logistic'
install_model = XGBClassifier(**params)
install_model.fit(X[use_features],y[Train_Split.IS_INSTALLED])

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1177894283061201,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=332, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [67]:
test = pd.read_csv('../Data/test/000000000000.csv',sep='\t')

In [68]:
test['f_30'].fillna(test['f_30'].mode()[0],inplace=True)
test['f_31'].fillna(test['f_31'].mode()[0],inplace=True)

In [69]:
fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
for f in fmiss:
    test[f].fillna(X[f].mean(),inplace=True)

In [70]:
y_click_test = clk_model.predict_proba(test[use_features])

In [71]:
y_install_test = install_model.predict_proba(test[use_features])

In [72]:
import numpy as np
result = np.vstack([test['f_0'].to_numpy(dtype=int),y_click_test[:,1],y_install_test[:,1]]).T

In [73]:
result.shape

(160973, 3)

In [74]:
final = pd.DataFrame(result,columns=['RowId','is_clicked','is_installed'])

In [75]:
final.head()

Unnamed: 0,RowId,is_clicked,is_installed
0,64505.0,0.353222,0.203283
1,64506.0,0.082859,0.255049
2,64507.0,0.213517,0.050327
3,64508.0,0.142553,0.213543
4,64509.0,0.22259,0.264679


In [76]:
final['RowId']=final['RowId'].astype('int')

In [77]:
from datetime import datetime
now = datetime.now()
final.to_csv(f'../Data/final_results/xgb_optuna_log_loss_{now}.csv', sep ='\t', index=False)