In [1]:
from tqdm import tqdm
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

ROWID = ['f_0']
DATE = ['f_1']
CATEGORIES = [ f'f_{i}' for i in range(2,33) ]
BINARY = [ f'f_{i}' for i in range(33,42) ]
NUMERICAL = [ f'f_{i}' for i in range(42,80) ]
IS_CLICKED = ['is_clicked']
IS_INSTALLED =['is_installed']

In [2]:
missing = pd.read_csv('../Data/miss_combine.csv')

## Imputation

In [3]:
missing['f_30'].fillna(missing['f_30'].mode()[0],inplace=True)
missing['f_31'].fillna(missing['f_31'].mode()[0],inplace=True)
fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
for f in tqdm(fmiss,desc="NUM IMPUTE"):
    missing[f].fillna(missing[f].mean(),inplace=True)

NUM IMPUTE: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 48.45it/s]


## Data Split

In [4]:
click_train = missing
install_train = missing

## Feature Used

In [5]:
click_feat_list = CATEGORIES + BINARY + NUMERICAL#['f_43','f_48','f_50','f_66','f_68','f_69','f_70','f_72','f_73']
install_feat_list = CATEGORIES + BINARY + NUMERICAL#['f_58','f_59','f_50','f_68']

## Modelling

In [6]:
clk_inst = click_train[IS_CLICKED[0]] | click_train[IS_INSTALLED[0]] 

In [7]:
clk_inst.value_counts()

0    2360893
1    1124959
dtype: int64

In [8]:
2360893/1124959

2.098648039617444

In [9]:
y_train = clk_inst
X_train = click_train[click_feat_list]
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,stratify=y)

In [10]:
clk_install_classifier = XGBClassifier(learning_rate=0.1, n_estimators=500, objective='binary:logistic',
                            verbosity=2,tree_method='gpu_hist',scale_pos_weight=2.09)

clk_install_classifier.fit(X_train,y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [11]:
y_clk_inst_pred_prob = clk_install_classifier.predict_proba(X_test)

In [13]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(~y_test+2, y_clk_inst_pred_prob[:,0])

In [14]:
len(thresholds)

325279

In [15]:
f1 = 2*precision*recall/(precision+recall)

In [16]:
np.max(f1)

0.8856548856548858

In [17]:
idx = 207522
print("TH: ",thresholds[idx],"Recall: ",recall[idx],"Precision: ",precision[idx],"F1: ",f1[idx])

TH:  0.755195 Recall:  0.47357787284510144 Precision:  0.9298807365391972 F1:  0.6275510204081634


In [20]:
y_clk_inst_pred_prob[:,0]>0.755

array([False, False, False, ...,  True,  True, False])

In [13]:
y_clk_inst_pred_prob = clk_install_classifier.predict_proba(X_train)

In [14]:
X_train_filter = X_train[y_clk_inst_pred_prob[:,0]<0.755]

In [16]:
len(X_train),len(X_train_filter)

(3485852, 2290514)

In [17]:
y_train_filter = y_train[y_clk_inst_pred_prob[:,0]<0.755]

In [18]:
len(y_train_filter)

2290514

In [19]:
y_clicked = click_train[y_clk_inst_pred_prob[:,0]<0.755][IS_CLICKED]
y_installed = click_train[y_clk_inst_pred_prob[:,0]<0.755][IS_INSTALLED]

In [20]:
len(y_clicked),len(y_installed)

(2290514, 2290514)

In [21]:
clk_classifier = XGBClassifier(learning_rate=0.1, n_estimators=500, objective='binary:logistic',
                            verbosity=2,tree_method='gpu_hist')

clk_classifier.fit(X_train_filter,y_clicked)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [25]:
install_classifier = XGBClassifier(learning_rate=0.1, n_estimators=500, objective='binary:logistic',
                            verbosity=2,tree_method='gpu_hist')

install_classifier.fit(X_train_filter,y_installed)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [23]:
test = pd.read_csv('../Data/test/000000000000.csv',sep='\t')

In [24]:
test['f_30'].fillna(test['f_30'].mode()[0],inplace=True)
test['f_31'].fillna(test['f_31'].mode()[0],inplace=True)
for f in fmiss:
    test[f].fillna(test[f].mean(),inplace=True)

In [26]:
out = clk_install_classifier.predict_proba(test[click_feat_list])

In [27]:
len(out)

160973

In [28]:
import numpy as np
result = np.vstack([test['f_0'].to_numpy(dtype=int),np.zeros(len(out)),np.zeros(len(out))])

In [30]:
results = result.T

In [32]:
results = pd.DataFrame(results,columns=['RowId','is_clicked','is_installed'])

In [33]:
results.head()

Unnamed: 0,RowId,is_clicked,is_installed
0,64505.0,0.0,0.0
1,64506.0,0.0,0.0
2,64507.0,0.0,0.0
3,64508.0,0.0,0.0
4,64509.0,0.0,0.0


In [34]:
test_filter = test[out[:,0]<0.755]

In [36]:
click_pred = clk_classifier.predict_proba(test_filter[click_feat_list])

In [37]:
install_pred = install_classifier.predict_proba(test_filter[click_feat_list])

In [40]:
len(click_pred[:,1])

138490

In [41]:
len(results[out[:,0]<0.755][IS_CLICKED])

138490

In [43]:
results.loc[out[:,0]<0.755,IS_CLICKED].shape

(138490, 1)

In [44]:
click_pred[:,1].reshape(-1,1)

array([[0.44466478],
       [0.08684335],
       [0.28133684],
       ...,
       [0.16697733],
       [0.36568078],
       [0.99873596]], dtype=float32)

In [45]:
results.loc[out[:,0]<0.755,IS_CLICKED] = click_pred[:,1].reshape(-1,1)

In [46]:
results.head()

Unnamed: 0,RowId,is_clicked,is_installed
0,64505.0,0.444665,0.0
1,64506.0,0.086843,0.0
2,64507.0,0.281337,0.0
3,64508.0,0.178738,0.0
4,64509.0,0.594419,0.0


In [47]:
results.loc[out[:,0]<0.755,IS_INSTALLED] = install_pred[:,1].reshape(-1,1)

In [52]:
results.to_csv('../Data/Hierachical_results.csv', sep ='\t', index=False)