In [1]:
from tqdm import tqdm
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Column groups of the dataset, addressed by their `f_<n>` names.
# Each constant is a list so the groups can be concatenated directly.
ROWID, DATE = ['f_0'], ['f_1']
CATEGORIES = list(map('f_{}'.format, range(2, 33)))    # f_2  .. f_32
BINARY = list(map('f_{}'.format, range(33, 42)))       # f_33 .. f_41
NUMERICAL = list(map('f_{}'.format, range(42, 80)))    # f_42 .. f_79
# Target columns.
IS_CLICKED, IS_INSTALLED = ['is_clicked'], ['is_installed']

# Load the table used for training further down (path is relative to the
# notebook's working directory; read with the default comma separator).
# NOTE(review): the name `missing` / `miss_combine.csv` suggests this is a
# pre-combined table that still contains missing values -- confirm provenance.
missing = pd.read_csv('../Data/miss_combine.csv')

## Imputation

In [2]:
# Fill the gaps in the training table.
#
# The result of `fillna` is assigned back to the column instead of calling
# `missing[col].fillna(..., inplace=True)`: the in-place form operates on a
# column selection (chained assignment), which raises a FutureWarning on
# pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.

# f_30 / f_31: replace missing entries with the most frequent value.
for col in ('f_30', 'f_31'):
    missing[col] = missing[col].fillna(missing[col].mode()[0])

# Numeric columns listed here: replace missing entries with the column mean.
# `fmiss` is reused later to impute the test set.
fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
for f in tqdm(fmiss, desc="NUM IMPUTE"):
    missing[f] = missing[f].fillna(missing[f].mean())

NUM IMPUTE: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 42.75it/s]


## Data Split

In [3]:
# Both names are bound to the very same DataFrame object as `missing`
# (no copy is made), so a change through one name is visible through all.
click_train = install_train = missing

## Features Used

In [4]:
# Both feature lists currently hold every categorical, binary and numerical
# column, in that order. They are built separately so each stays its own list.
# Previously tried subset for clicks:
#   ['f_43','f_48','f_50','f_66','f_68','f_69','f_70','f_72','f_73']
click_feat_list = [*CATEGORIES, *BINARY, *NUMERICAL]
# Previously tried subset for installs: ['f_58','f_59','f_50','f_68']
install_feat_list = [*CATEGORIES, *BINARY, *NUMERICAL]

## Multiclass Labeling

In [8]:
from tqdm import tqdm

def get_label(df, clicked_col=None, installed_col=None):
    """Encode the (clicked, installed) flag pair of every row as one class id.

    Encoding (``clicked + 2 * installed``):

    ======= ========= =====
    clicked installed label
    ======= ========= =====
    0       0         0
    1       0         1
    0       1         2
    1       1         3
    ======= ========= =====

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two flag columns.
    clicked_col, installed_col : str, optional
        Names of the flag columns. Default to ``IS_CLICKED[0]`` and
        ``IS_INSTALLED[0]`` respectively.

    Returns
    -------
    numpy.ndarray
        Integer labels, one per row of ``df`` and in row order.

    Raises
    ------
    ValueError
        If any flag is not 0 or 1 (NaN included). Such rows used to be
        skipped silently, which made the label array shorter than ``df`` and
        misaligned it with the feature matrix.
    """
    clicked_col = IS_CLICKED[0] if clicked_col is None else clicked_col
    installed_col = IS_INSTALLED[0] if installed_col is None else installed_col

    # Work on the raw arrays: this is positional, so it is independent of the
    # index (duplicates included) and avoids a Python-level loop over rows.
    clicked = df[clicked_col].to_numpy()
    installed = df[installed_col].to_numpy()

    valid = np.isin(clicked, (0, 1)) & np.isin(installed, (0, 1))
    if not valid.all():
        raise ValueError(
            f"{int((~valid).sum())} row(s) have a value other than 0/1 in "
            f"{clicked_col!r} or {installed_col!r}"
        )

    return clicked.astype(np.int64) + 2 * installed.astype(np.int64)

# Feature matrix: the click feature columns of the training table.
X = missing.loc[:, click_feat_list]
# Target: one class id per row, derived from the two flag columns.
y = get_label(missing.loc[:, IS_CLICKED + IS_INSTALLED])

In [9]:
# Row counts of features and labels; the two numbers must be equal.
X.shape[0], y.shape[0]

(3485852, 3485852)

## Modeling

In [10]:
from xgboost import XGBClassifier
# `y` carries four classes (codes 0-3 from get_label), so the objective is the
# multi-class one. The previous `binary:logistic` setting was silently replaced
# by the scikit-learn wrapper during fit (the fitted estimator reports
# `multi:softprob`); stating it here keeps the code and the model in agreement.
# NOTE(review): `tree_method='gpu_hist'` needs a CUDA-capable GPU; XGBoost >= 2.0
# deprecates it in favour of tree_method='hist' with device='cuda'.
clk_install_classifier = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    objective='multi:softprob',
    verbosity=2,
    tree_method='gpu_hist',
)

# Trained on the full table; no rows are held out for validation here.
clk_install_classifier.fit(X, y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [11]:
# Load one test file (path relative to the notebook's working directory).
# Unlike the training CSV, this file is read as tab-separated.
test = pd.read_csv('../Data/test/000000000000.csv',sep='\t')

## Imputation

In [12]:
# f_30 / f_31: replace missing entries with the most frequent value.
# Assigned back rather than `test[col].fillna(..., inplace=True)`, which is a
# chained in-place call that pandas warns about and that does not modify the
# frame under Copy-on-Write.
# NOTE(review): the fill value is computed from the test file itself, not
# from the training table -- confirm this is intended.
for col in ('f_30', 'f_31'):
    test[col] = test[col].fillna(test[col].mode()[0])

In [13]:
# Numeric columns in `fmiss`: replace missing entries with the column mean.
# Assigned back rather than `test[f].fillna(..., inplace=True)`, which is a
# chained in-place call that pandas warns about and that does not modify the
# frame under Copy-on-Write.
# NOTE(review): the mean is computed from the test file itself, not from the
# training table -- confirm this is intended.
for f in fmiss:
    test[f] = test[f].fillna(test[f].mean())

In [14]:
# Per-row class probabilities for the test file, using the same feature
# columns the model was fitted on.
# Presumably column k is the probability of label code k from get_label
# (0: neither, 1: click only, 2: install only, 3: both), which would give
# P(click) = col 1 + col 3 and P(install) = col 2 + col 3 -- TODO confirm
# against clk_install_classifier.classes_.
result = clk_install_classifier.predict_proba(test[click_feat_list])

In [16]:
# Peek at the first ten rows of predicted probabilities (all columns).
result[:10]

array([[0.4052128 , 0.34118497, 0.07511163, 0.17849064],
       [0.6783799 , 0.11592828, 0.19982924, 0.00586255],
       [0.76791996, 0.11383574, 0.0057997 , 0.11244455],
       [0.58538187, 0.25127017, 0.14965762, 0.01369032],
       [0.5868619 , 0.16496325, 0.00642839, 0.24174644],
       [0.8238815 , 0.09498648, 0.00100242, 0.08012962],
       [0.09273871, 0.26796114, 0.17327672, 0.46602347],
       [0.67384773, 0.12378673, 0.17808606, 0.02427943],
       [0.4265508 , 0.37442064, 0.05048933, 0.14853925],
       [0.5353052 , 0.12203039, 0.30681863, 0.03584579]], dtype=float32)