In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import re
import math
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb
import catboost as cab

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

数据处理

In [2]:
# Read the tab-separated test split from disk.
test_data = pd.read_csv(
    'data/test/000000000000.csv',
    sep='\t',
)
# Trailing expression: display (rows, columns) of the loaded frame.
test_data.shape

(160973, 80)

In [3]:
# Read the tab-separated training split from disk.
train_data = pd.read_csv(
    './data/train.csv',
    sep='\t',
)
# Trailing expression: display (rows, columns) of the loaded frame.
train_data.shape

(3485852, 82)

In [4]:
def _zero_fill_non_finite(frame):
    """Return a new frame in which +inf, -inf and NaN entries are replaced by 0."""
    without_inf = frame.replace([np.inf, -np.inf], np.nan)
    return without_inf.fillna(0)


# Apply the same cleanup to both splits before they are combined.
train_data = _zero_fill_non_finite(train_data)
test_data = _zero_fill_non_finite(test_data)

In [5]:
# Stack the training rows on top of the test rows so feature engineering is
# applied to both at once. Columns that exist only in train_data are NaN for
# the test rows; a later cell uses that NaN in 'is_clicked' to split the frame
# back into train and test.
# ignore_index=True gives the combined frame unique 0..n-1 row labels instead
# of repeating each input frame's own labels.
data = pd.concat([train_data, test_data], ignore_index=True)

In [6]:
def _f_names(start, stop):
    """Return the column names 'f_<start>' .. 'f_<stop - 1>'."""
    return ['f_{}'.format(idx) for idx in range(start, stop)]


# Column groups used by the feature-engineering cells.
cat_features = _f_names(1, 42)    # f_1 .. f_41
bin_features = _f_names(33, 42)   # f_33 .. f_41 (also contained in cat_features)
num_features = _f_names(42, 80)   # f_42 .. f_79
date_features = ['f_1']

In [7]:
# Replace every categorical column by integer codes. A fresh encoder is fitted
# per column on the combined train+test frame.
for cat_col in cat_features:
    data[cat_col] = LabelEncoder().fit_transform(data[cat_col])

In [8]:
# Number of entries equal to 0 in each column.
zero_count = (data == 0).sum()
# Fraction of rows that are 0 in each column.
zero_ratio = zero_count / data.shape[0]
print(zero_ratio)

f_0             0.000000
f_1             0.038446
f_2             0.005263
f_3             0.290339
f_4             0.002065
                  ...   
f_77            0.998190
f_78            0.973566
f_79            0.990726
is_clicked      0.745728
is_installed    0.789522
Length: 82, dtype: float64


In [9]:
# Display (rows, columns) of the combined train+test frame.
data.shape 

(3646825, 82)

In [10]:
# Show the training row(s) whose f_0 equals 64505.
train_data[train_data['f_0'] == 64505]

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
2596150,64505,58,3346,22294,6767,25604,5040,27941,19203,31372,...,0.571121,0.0,0.0,0.269948,0.0,0.0,0.0,0.0,0,0


In [11]:
# Show the test row(s) whose f_0 equals 64505 (the same value looked up in
# train_data) — presumably to check whether f_0 values repeat across splits.
test_data[test_data['f_0'] == 64505]

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79
0,64505,67,26325,7152,21563,19475,31440,27941,21621,14659,...,1.519085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Preview the first rows of the combined frame after label encoding.
data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
0,2541188,12,114,4,191,4,155,0,1,4,...,2.855607,2.284486,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0
1,2541440,15,21,4,390,3,559,0,2,3,...,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0
2,2541480,20,131,4,228,3,3261,0,1,6,...,0.571121,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,1.0,0.0
3,2541780,18,73,2,317,3,3843,0,2,0,...,0.0,0.0,0.0,0.347077,0.0,0.0,0.0,0.0,0.0,0.0
4,2541833,15,44,4,519,3,5179,0,4,3,...,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0


In [13]:
# zero_ratio = zero_ratio.sort_values(ascending=False)
# zero_ratio.apply(lambda x: round(x, 5))
# zero_ratio.to_csv('./output/zero.csv')

In [14]:
# Keep only the numeric columns in which fewer than 40% of the rows are 0.
num_features_select = [
    col for col in num_features
    if zero_ratio.loc[col] < 0.4
]

In [15]:
# Display the numeric columns that passed the zero-ratio filter.
num_features_select

['f_42',
 'f_43',
 'f_51',
 'f_55',
 'f_57',
 'f_58',
 'f_59',
 'f_64',
 'f_65',
 'f_66',
 'f_67',
 'f_70',
 'f_74',
 'f_75',
 'f_76']

In [16]:
# For every unordered pair of selected numeric columns add four derived
# columns: sum, difference, product and ratio.
for pos, left in enumerate(tqdm(num_features_select, total=len(num_features_select))):
    for right in num_features_select[pos + 1:]:
        left_col = data[left]
        right_col = data[right]
        data[f'{left}+{right}'] = left_col + right_col
        data[f'{left}-{right}'] = left_col - right_col
        data[f'{left}*{right}'] = left_col * right_col
        # The small constant is added to the denominator before dividing.
        data[f'{left}/{right}'] = left_col / (right_col + 1e-7)
data.shape

100%|██████████| 15/15 [00:42<00:00,  2.85s/it]


(3646825, 502)

In [17]:
# For every unordered pair of binary columns add the bitwise AND, OR and XOR
# of the two columns as new features.
for pos, left in enumerate(tqdm(bin_features, total=len(bin_features))):
    for right in bin_features[pos + 1:]:
        left_col = data[left]
        right_col = data[right]
        data[f'{left}and{right}'] = left_col & right_col
        data[f'{left}or{right}'] = left_col | right_col
        data[f'{left}xor{right}'] = left_col ^ right_col
data.shape

100%|██████████| 9/9 [00:17<00:00,  1.99s/it]


(3646825, 610)

In [18]:
# Cache the augmented frame. np.save stores only the array values, so the
# column names are written to a sidecar file as well; without them a reloaded
# array cannot serve the later cells that rely on data.columns.
# Rebuild with:
#   pd.DataFrame(np.load('data/data_aug_0dot4.npy'),
#                columns=np.load('data/data_aug_0dot4_columns.npy'))
np.save('data/data_aug_0dot4.npy', data)
np.save('data/data_aug_0dot4_columns.npy', np.array(data.columns, dtype=str))

In [19]:
# data = np.load('data/data_aug_0dot4.npy')
# data.shape 

In [52]:
# Every column is a model input except the two targets, f_0 and a possible
# 'label' column.
non_feature_columns = ('is_clicked', 'is_installed', 'f_0', 'label')
features = [column for column in data.columns if column not in non_feature_columns]
len(features)

607

In [21]:
# Rows with a non-missing 'is_clicked' form the training set; rows where it is
# NaN form the test set.
has_click_label = data['is_clicked'].notna()
train = data[has_click_label]
test = data[~has_click_label]

# Integer targets for the two prediction tasks.
is_clicked_label = train['is_clicked'].astype('int')
is_installed_label = train['is_installed'].astype('int')

train.shape, test.shape, is_clicked_label.shape, is_installed_label.shape

((3485852, 610), (160973, 610), (3485852,), (3485852,))

In [22]:
# Random seed reused by the fold splitters and model parameters, and the number
# of cross-validation folds.
seed, K = 42, 5

In [23]:
# LightGBM settings passed to lgb.train by the cross-validation cells.
# 'num_boost_round' and 'early_stopping_rounds' are also passed explicitly in
# those lgb.train calls.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    # Open question left by the author: use 'auc' instead of 'binary_logloss'?
    metric='binary_logloss',
    learning_rate=0.05,
    num_leaves=2 ** 6,
    max_depth=8,
    tree_learner='serial',
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.8,
    num_boost_round=3000,
    max_bin=255,
    verbose=-1,
    nthread=-1,
    seed=seed,
    bagging_seed=seed,
    feature_fraction_seed=seed,
    early_stopping_rounds=100,
    # Disabled GPU options:
    # device='gpu',        # enable GPU acceleration
    # gpu_platform_id=0,   # GPU platform id
    # gpu_device_id=0,     # GPU device id
)

is_clicked

In [13]:
# Select the click target as the active label (the next cell repeats this assignment).
label = is_clicked_label

In [14]:
# K-fold stratified cross-validation of a LightGBM model for the click target.
# Produces:
#   oof_lgb_cli          out-of-fold predictions for every training row
#   predictions_lgb_cli  test predictions averaged over the K fold models
#   feat_imp_lgb_cli     one feature-importance array per fold
label = is_clicked_label

KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_lgb_cli = list()

oof_lgb_cli = np.zeros(len(train))
predictions_lgb_cli = np.zeros((len(test)))
print(len(features))

# Upper bound on boosting rounds; identical for every fold, so set once.
num_round = 3000

# StratifiedKFold derives the folds from y; X only supplies the sample count.
# A zero placeholder avoids materialising the whole training matrix via
# train.values just to split it.
split_placeholder = np.zeros(len(train))

# Model training
for fold_, (trn_idx, val_idx) in enumerate(KF.split(split_placeholder, label.values)):
    print("fold n°{}".format(fold_))
    X_val = train.iloc[val_idx][features]  # sliced once, reused for predict below
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=label.iloc[trn_idx])
    val_data = lgb.Dataset(X_val, label=label.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds keyword arguments are
    # only accepted by older LightGBM releases — confirm the installed version.
    clf = lgb.train(
        lgb_params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=300,
        early_stopping_rounds=100,
    )

    oof_lgb_cli[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
    # Each fold contributes 1/K of the final test prediction.
    predictions_lgb_cli[:] += clf.predict(test[features], num_iteration=clf.best_iteration) / K
    feat_imp_lgb_cli.append(clf.feature_importance())

79
fold n°0
Training until validation scores don't improve for 100 rounds
[300]	training's binary_logloss: 0.326966	valid_1's binary_logloss: 0.327212
[600]	training's binary_logloss: 0.315277	valid_1's binary_logloss: 0.317277
[900]	training's binary_logloss: 0.310265	valid_1's binary_logloss: 0.314412
[1200]	training's binary_logloss: 0.306852	valid_1's binary_logloss: 0.313149
[1500]	training's binary_logloss: 0.303901	valid_1's binary_logloss: 0.312309
[1800]	training's binary_logloss: 0.301191	valid_1's binary_logloss: 0.31171
[2100]	training's binary_logloss: 0.298683	valid_1's binary_logloss: 0.311242
[2400]	training's binary_logloss: 0.296324	valid_1's binary_logloss: 0.310938
[2700]	training's binary_logloss: 0.294051	valid_1's binary_logloss: 0.310649
[3000]	training's binary_logloss: 0.291819	valid_1's binary_logloss: 0.310385
Did not meet early stopping. Best iteration is:
[3000]	training's binary_logloss: 0.291819	valid_1's binary_logloss: 0.310385
fold n°1
Training until 

In [15]:
# Out-of-fold evaluation of the click model.
# Hard 0/1 predictions at a 0.5 threshold, computed once with a vectorised
# comparison instead of rebuilding a Python list for every metric.
pred_lgb_cli_hard = (oof_lgb_cli >= 0.5).astype(int)

# logloss  : log loss of the thresholded 0/1 predictions
# logloss2 : log loss of the predicted probabilities
logloss = metrics.log_loss(label, pred_lgb_cli_hard, labels=[0, 1])
logloss2 = metrics.log_loss(label, oof_lgb_cli)

# NOTE: despite its name, `acc` holds the ROC AUC (it is printed as "AUC").
acc = metrics.roc_auc_score(label, oof_lgb_cli)
precision = metrics.precision_score(label, pred_lgb_cli_hard)
recall = metrics.recall_score(label, pred_lgb_cli_hard)
f1 = metrics.f1_score(label, pred_lgb_cli_hard)

print(f"Logloss: {logloss:.4f}, {logloss2:.4f}, AUC: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Logloss: 4.0459, 0.3113, AUC: 0.8730, Precision: 0.8972, Recall: 0.5276, F1 Score: 0.6644


In [21]:
# One row per fold, one column per feature.
imp_by_fold_cli = pd.DataFrame(feat_imp_lgb_cli, columns=features)
# Mean importance of each feature across folds, most important first.
mean_imp_cli = imp_by_fold_cli.apply(np.mean, axis=0)
avg_imp_cli = mean_imp_cli.sort_values(ascending=False)
avg_imp_cli.describe()

count       79.000000
mean      2390.579747
std       2535.470191
min          0.000000
25%        309.900000
50%       1537.000000
75%       3903.800000
max      10211.200000
dtype: float64

In [22]:
# Display the fold-averaged click-model importances (sorted descending).
avg_imp_cli

f_4     10211.2
f_11     9546.8
f_15     9148.8
f_42     8856.2
f_6      8218.0
         ...   
f_29        0.0
f_7         0.0
f_28        0.0
f_27        0.0
f_26        0.0
Length: 79, dtype: float64

# is_installed

In [24]:
# Switch the active label to the install target for the cells that follow.
label = is_installed_label

lightgbm

In [25]:
# LightGBM settings for the install model. This dict is identical to the
# lgb_params defined in the earlier cell of this notebook.
# 'num_boost_round' and 'early_stopping_rounds' are also passed explicitly to
# lgb.train in the training cell.
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    # 'metric': 'binary_logloss',  # open question: use auc instead?
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 2 ** 6,
    'max_depth': 8,
    'tree_learner': 'serial',
    'colsample_bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    'num_boost_round': 3000,
    'max_bin': 255,
    'verbose': -1,
    'nthread' : -1,
    'seed': seed,
    'bagging_seed': seed,
    'feature_fraction_seed': seed,
    'early_stopping_rounds': 100,
    # 'device': 'gpu',  # enable GPU acceleration
    # 'gpu_platform_id': 0,  # GPU platform id
    # 'gpu_device_id': 0  # GPU device id
}

In [49]:
# Restrict the model inputs to the LightGBM-selected feature subset.
# NOTE(review): select_fea_lgb is assigned in a later cell of this notebook
# (the importance-threshold cell), so on a fresh top-to-bottom run this line
# raises NameError; that cell has to be executed first.
features = select_fea_lgb

In [43]:
# K-fold stratified cross-validation of a LightGBM model for the install target.
# Produces:
#   oof_lgb_ins          out-of-fold predictions for every training row
#   predictions_lgb_ins  test predictions averaged over the K fold models
#   feat_imp_lgb_ins     one feature-importance array per fold
label = is_installed_label

KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_lgb_ins = list()

oof_lgb_ins = np.zeros(len(train))
predictions_lgb_ins = np.zeros((len(test)))
print(len(features))

# Upper bound on boosting rounds; identical for every fold, so set once.
num_round = 3000

# StratifiedKFold derives the folds from y; X only supplies the sample count.
# A zero placeholder avoids materialising the whole training matrix via
# train.values just to split it.
split_placeholder = np.zeros(len(train))

# Model training
for fold_, (trn_idx, val_idx) in enumerate(KF.split(split_placeholder, label.values)):
    print("fold n°{}".format(fold_))
    X_val = train.iloc[val_idx][features]  # sliced once, reused for predict below
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=label.iloc[trn_idx])
    val_data = lgb.Dataset(X_val, label=label.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds keyword arguments are
    # only accepted by older LightGBM releases — confirm the installed version.
    clf = lgb.train(
        lgb_params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=300,
        early_stopping_rounds=100,
    )

    oof_lgb_ins[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
    # Each fold contributes 1/K of the final test prediction.
    predictions_lgb_ins[:] += clf.predict(test[features], num_iteration=clf.best_iteration) / K
    feat_imp_lgb_ins.append(clf.feature_importance())

152
fold n°0
Training until validation scores don't improve for 100 rounds
[300]	training's binary_logloss: 0.300029	valid_1's binary_logloss: 0.302257
[600]	training's binary_logloss: 0.292585	valid_1's binary_logloss: 0.297561
[900]	training's binary_logloss: 0.287908	valid_1's binary_logloss: 0.295758
[1200]	training's binary_logloss: 0.284132	valid_1's binary_logloss: 0.294829
[1500]	training's binary_logloss: 0.280837	valid_1's binary_logloss: 0.294326
[1800]	training's binary_logloss: 0.277738	valid_1's binary_logloss: 0.293933
[2100]	training's binary_logloss: 0.27487	valid_1's binary_logloss: 0.293705
[2400]	training's binary_logloss: 0.272139	valid_1's binary_logloss: 0.293521
[2700]	training's binary_logloss: 0.269412	valid_1's binary_logloss: 0.293293
[3000]	training's binary_logloss: 0.266835	valid_1's binary_logloss: 0.293169
Did not meet early stopping. Best iteration is:
[3000]	training's binary_logloss: 0.266835	valid_1's binary_logloss: 0.293169
fold n°1
Training until

In [44]:
# Out-of-fold evaluation of the LightGBM install model.
# Hard 0/1 predictions at a 0.5 threshold, computed once with a vectorised
# comparison instead of rebuilding a Python list for every metric.
pred_lgb_ins_hard = (oof_lgb_ins >= 0.5).astype(int)

# Log loss is computed on the predicted probabilities (the variant on
# thresholded predictions was disabled by the author).
logloss = metrics.log_loss(label, oof_lgb_ins)
# NOTE: despite its name, `acc` holds the ROC AUC (it is printed as "AUC").
acc = metrics.roc_auc_score(label, oof_lgb_ins)
precision = metrics.precision_score(label, pred_lgb_ins_hard)
recall = metrics.recall_score(label, pred_lgb_ins_hard)
f1 = metrics.f1_score(label, pred_lgb_ins_hard)

print(f"Logloss: {logloss:.4f}, AUC: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Logloss: 0.2932, AUC: 0.8809, Precision: 0.7924, Recall: 0.3800, F1 Score: 0.5137


In [50]:
# Submission with LightGBM install predictions; the click column is filled
# with uniform random numbers from np.random.random.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": predictions_lgb_ins,
})
# The number of features in use is embedded in the file name.
out_path = './output/lgb_42_sep_train_logloss_float_{}.csv'.format(len(features))
submission.to_csv(out_path, index=False, sep='\t')

In [38]:
# One row per fold, one column per feature.
imp_by_fold_lgb_ins = pd.DataFrame(feat_imp_lgb_ins, columns=features)
# Mean importance of each feature across folds, most important first.
mean_imp_lgb_ins = imp_by_fold_lgb_ins.apply(np.mean, axis=0)
avg_imp_lgb_ins = mean_imp_lgb_ins.sort_values(ascending=False)
avg_imp_lgb_ins.describe()

count     607.000000
mean      301.620099
std       490.934147
min         0.000000
25%        51.400000
50%       196.800000
75%       357.000000
max      5365.600000
dtype: float64

In [39]:
# Keep the features whose fold-averaged LightGBM importance is at or above the
# 75th percentile. The cutoff is computed from the data instead of being
# hard-coded (the previous literal 357.0 equals the 75% row of the describe()
# output shown for avg_imp_lgb_ins), so it stays valid when the feature set or
# the model changes.
importance_cutoff_lgb = avg_imp_lgb_ins.quantile(0.75)
# avg_imp_lgb_ins is sorted descending, so the selection keeps that order.
select_fea_lgb = avg_imp_lgb_ins[avg_imp_lgb_ins >= importance_cutoff_lgb].index.tolist()
print(len(select_fea_lgb))
print(select_fea_lgb)

152
['f_11', 'f_15', 'f_4', 'f_6', 'f_17', 'f_61', 'f_1', 'f_12', 'f_62', 'f_14', 'f_63', 'f_2', 'f_13', 'f_16', 'f_18', 'f_54', 'f_72', 'f_42/f_57', 'f_5', 'f_56', 'f_8', 'f_42/f_55', 'f_42/f_74', 'f_9', 'f_10', 'f_55/f_57', 'f_52', 'f_42*f_51', 'f_42*f_57', 'f_42+f_51', 'f_42/f_59', 'f_55-f_57', 'f_20', 'f_42*f_64', 'f_42-f_51', 'f_42/f_64', 'f_42*f_65', 'f_42*f_55', 'f_42/f_51', 'f_57/f_74', 'f_42*f_59', 'f_42/f_76', 'f_42/f_65', 'f_57+f_58', 'f_53', 'f_19', 'f_32', 'f_57+f_59', 'f_57-f_59', 'f_55+f_58', 'f_42/f_70', 'f_42/f_75', 'f_57+f_67', 'f_57-f_67', 'f_57-f_58', 'f_55-f_58', 'f_42/f_58', 'f_55+f_59', 'f_42*f_58', 'f_42*f_70', 'f_55+f_67', 'f_51/f_57', 'f_51*f_57', 'f_55-f_59', 'f_55-f_67', 'f_73', 'f_64/f_65', 'f_51/f_55', 'f_57*f_64', 'f_43-f_66', 'f_42*f_74', 'f_68', 'f_57*f_65', 'f_42*f_43', 'f_51/f_64', 'f_42+f_59', 'f_69', 'f_51*f_55', 'f_51+f_57', 'f_57/f_64', 'f_57+f_70', 'f_58/f_59', 'f_43+f_57', 'f_55/f_74', 'f_55*f_64', 'f_59-f_67', 'f_57-f_70', 'f_75/f_76', 'f_42/f_

xgboost

In [31]:
# First XGBoost parameter set. The next cell assigns xgb_params again, so this
# dict is superseded whenever that cell is run afterwards.
# NOTE(review): 'n_estimators' is presumably not used by xgb.train, which the
# training cell calls with an explicit round count — confirm.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': seed,
    'tree_method': 'hist',
    'booster': 'gbtree',
    'n_estimators': 1000, 
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 12, 
    'subsample': 0.8,
    'colsample_bytree': 1.0, 
    'alpha': 0.7,
    'lambda': 0.1, 
    'gamma': 0.1, 
    "nthread": -1
}

In [51]:
# Second XGBoost parameter set; it replaces the xgb_params dict assigned in the
# previous cell. The output file name of the XGBoost submission cell carries
# the suffix "newparam".
xgb_params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'gamma': 1,
    'min_child_weight': 1.5,
    'max_depth': 6,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.05,
    'tree_method': 'exact',
    'seed': seed,
    'nthread': -1
}

In [58]:
# Restrict the model inputs to the XGBoost-selected feature subset.
# NOTE(review): select_fea_xgb is assigned in a later cell of this notebook
# (the importance-threshold cell), so on a fresh top-to-bottom run this line
# raises NameError; that cell has to be executed first.
features = select_fea_xgb

In [59]:
# K-fold stratified cross-validation of an XGBoost model for the install target.
# Produces:
#   oof_xgb_ins          out-of-fold predictions for every training row
#   predictions_xgb_ins  test predictions averaged over the K fold models
#   feat_imp_xgb_ins     one get_score() dict per fold
label = is_installed_label

KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_xgb_ins = list()

oof_xgb_ins = np.zeros(len(train))
predictions_xgb_ins = np.zeros((len(test)))
print(len(features))

# Loop-invariant work hoisted out of the fold loop: the round limit and the
# test DMatrix (previously rebuilt for every fold).
num_round = 3000
test_dmatrix = xgb.DMatrix(test[features])

# StratifiedKFold derives the folds from y; X only supplies the sample count.
# A zero placeholder avoids materialising the whole training matrix via
# train.values just to split it.
split_placeholder = np.zeros(len(train))

# Model training
for fold_, (trn_idx, val_idx) in enumerate(KF.split(split_placeholder, label.values)):
    print("fold n°{}".format(fold_))
    X_train, X_val = train.iloc[trn_idx][features], train.iloc[val_idx][features]
    y_train, y_val = label.iloc[trn_idx], label.iloc[val_idx]
    trn_data = xgb.DMatrix(X_train, label=y_train)
    val_data = xgb.DMatrix(X_val, label=y_val)

    clf = xgb.train(
        xgb_params,
        trn_data,
        num_round,
        evals = [(trn_data, 'train'), (val_data, 'val')],
        verbose_eval=300,
        early_stopping_rounds=100,
    )

    # iteration_range is half-open [begin, end) and best_iteration is a
    # zero-based index, so the end must be best_iteration + 1; otherwise the
    # best tree itself is left out of the prediction.
    best_range = (0, clf.best_iteration + 1)
    oof_xgb_ins[val_idx] = clf.predict(val_data, iteration_range=best_range)
    # Each fold contributes 1/K of the final test prediction.
    predictions_xgb_ins[:] += clf.predict(test_dmatrix, iteration_range=best_range) / K
    feat_imp_xgb_ins.append(clf.get_score())

147
fold n°0
[0]	train-logloss:0.66521	val-logloss:0.66521
[300]	train-logloss:0.30761	val-logloss:0.30914
[600]	train-logloss:0.29922	val-logloss:0.30242
[900]	train-logloss:0.29474	val-logloss:0.29952
[1200]	train-logloss:0.29183	val-logloss:0.29817
[1500]	train-logloss:0.28935	val-logloss:0.29729
[1800]	train-logloss:0.28724	val-logloss:0.29671
[2100]	train-logloss:0.28535	val-logloss:0.29633
[2400]	train-logloss:0.28354	val-logloss:0.29602
[2700]	train-logloss:0.28178	val-logloss:0.29573
[2999]	train-logloss:0.28021	val-logloss:0.29558
fold n°1
[0]	train-logloss:0.66521	val-logloss:0.66522
[300]	train-logloss:0.30789	val-logloss:0.30923
[600]	train-logloss:0.29972	val-logloss:0.30265
[900]	train-logloss:0.29526	val-logloss:0.29973
[1200]	train-logloss:0.29212	val-logloss:0.29816
[1500]	train-logloss:0.28965	val-logloss:0.29727
[1800]	train-logloss:0.28742	val-logloss:0.29655
[2100]	train-logloss:0.28540	val-logloss:0.29599
[2400]	train-logloss:0.28362	val-logloss:0.29568
[2700]	tra

In [60]:
# Out-of-fold evaluation of the XGBoost install model.
# Hard 0/1 predictions at a 0.5 threshold, computed once with a vectorised
# comparison instead of rebuilding a Python list for every metric.
pred_xgb_ins_hard = (oof_xgb_ins >= 0.5).astype(int)

# Log loss is computed on the predicted probabilities (the variant on
# thresholded predictions was disabled by the author).
logloss = metrics.log_loss(label, oof_xgb_ins)
# NOTE: despite its name, `acc` holds the ROC AUC (it is printed as "AUC").
acc = metrics.roc_auc_score(label, oof_xgb_ins)
precision = metrics.precision_score(label, pred_xgb_ins_hard)
recall = metrics.recall_score(label, pred_xgb_ins_hard)
f1 = metrics.f1_score(label, pred_xgb_ins_hard)

print(f"Logloss: {logloss:.4f}, AUC: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Logloss: 0.2956, AUC: 0.8785, Precision: 0.7938, Recall: 0.3696, F1 Score: 0.5043


In [61]:
# Submission with XGBoost install predictions; the click column is filled
# with uniform random numbers from np.random.random.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": predictions_xgb_ins,
})
# The number of features in use is embedded in the file name.
out_path = './output/xgb_42_sep_train_logloss_float_{}_newparam.csv'.format(len(features))
submission.to_csv(out_path, index=False, sep='\t')

In [56]:
# One row per fold, one column per feature; each fold entry is the dict
# returned by clf.get_score().
# NOTE(review): a feature missing from a fold's dict becomes NaN here and is
# presumably skipped by the mean rather than counted as 0 — confirm this is
# the intended treatment.
imp_by_fold_xgb_ins = pd.DataFrame(feat_imp_xgb_ins, columns=features)
# Mean importance of each feature across folds, most important first.
mean_imp_xgb_ins = imp_by_fold_xgb_ins.apply(np.mean, axis=0)
avg_imp_xgb_ins = mean_imp_xgb_ins.sort_values(ascending=False)
avg_imp_xgb_ins.describe()

count     587.000000
mean      284.692930
std       433.657853
min         1.000000
25%        56.700000
50%       199.000000
75%       348.400000
max      4775.200000
dtype: float64

In [57]:
# Keep the features whose fold-averaged XGBoost importance is at or above the
# 75th percentile. The cutoff is computed from the data instead of being
# hard-coded (the previous literal 348.4 equals the 75% row of the describe()
# output shown for avg_imp_xgb_ins), so it stays valid when the feature set or
# the model changes. NaN importances never satisfy the comparison.
importance_cutoff_xgb = avg_imp_xgb_ins.quantile(0.75)
# avg_imp_xgb_ins is sorted descending, so the selection keeps that order.
select_fea_xgb = avg_imp_xgb_ins[avg_imp_xgb_ins >= importance_cutoff_xgb].index.tolist()
print(len(select_fea_xgb))
print(select_fea_xgb)

147
['f_11', 'f_4', 'f_6', 'f_15', 'f_17', 'f_12', 'f_1', 'f_14', 'f_61', 'f_62', 'f_2', 'f_16', 'f_13', 'f_63', 'f_42/f_57', 'f_72', 'f_18', 'f_5', 'f_42/f_74', 'f_42/f_55', 'f_8', 'f_10', 'f_54', 'f_56', 'f_9', 'f_42*f_51', 'f_42*f_57', 'f_42-f_51', 'f_42+f_51', 'f_42*f_64', 'f_42/f_51', 'f_42/f_59', 'f_52', 'f_55/f_57', 'f_57-f_59', 'f_42*f_59', 'f_42*f_65', 'f_57+f_59', 'f_42*f_55', 'f_42/f_76', 'f_42+f_65', 'f_42/f_70', 'f_42/f_64', 'f_57+f_58', 'f_42-f_65', 'f_42/f_65', 'f_42/f_75', 'f_57-f_58', 'f_42*f_43', 'f_55+f_58', 'f_42*f_70', 'f_55-f_59', 'f_57-f_67', 'f_42/f_58', 'f_20', 'f_55-f_58', 'f_55+f_59', 'f_55+f_67', 'f_51+f_57', 'f_42*f_66', 'f_42*f_58', 'f_57+f_67', 'f_51-f_57', 'f_19', 'f_55-f_67', 'f_32', 'f_51*f_57', 'f_42/f_43', 'f_42+f_59', 'f_51/f_57', 'f_57/f_74', 'f_42-f_59', 'f_55-f_57', 'f_51/f_55', 'f_42/f_66', 'f_57*f_65', 'f_53', 'f_57+f_70', 'f_57*f_59', 'f_43-f_57', 'f_43+f_57', 'f_57-f_70', 'f_57*f_64', 'f_51+f_55', 'f_51*f_55', 'f_73', 'f_43-f_66', 'f_43+f_55'

catboost

In [34]:
# CatBoost settings unpacked into CatBoostClassifier(**cbc_params) by the
# training cell.
cbc_params = dict(
    random_state=seed,
    loss_function='Logloss',
    eval_metric='Logloss',
    verbose=False,
    learning_rate=0.05,
    depth=5,
    rsm=0.2020238568794654,
    # Options the author left disabled:
    # min_data_in_leaf=255,
    # l2_leaf_reg=5,
    # subsample=0.7,
    # use_best_model=True,
    # max_leaves=12,
    # metric_period=500,
    n_estimators=3000,
    thread_count=-1,
)

In [35]:
# K-fold stratified cross-validation of a CatBoost model for the install target.
# Produces:
#   oof_cbc_ins          out-of-fold predictions for every training row
#   predictions_cbc_ins  test predictions averaged over the K fold models
#   feat_imp_cbc_ins     one feature-importance array per fold
# The target is set explicitly, as in the LightGBM and XGBoost install cells,
# so this cell does not depend on whichever `label` was assigned last.
label = is_installed_label

KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_cbc_ins = list()

oof_cbc_ins = np.zeros(len(train))
predictions_cbc_ins = np.zeros((len(test)))
print(len(features))

model = CatBoostClassifier(**cbc_params)

# Declare only the categorical columns that are actually part of `features`;
# other cells rebind `features` to a subset that may not contain all of them.
fold_cat_features = [name for name in cat_features if name in features]

# StratifiedKFold derives the folds from y; X only supplies the sample count.
# A zero placeholder avoids materialising the whole training matrix via
# train.values just to split it.
split_placeholder = np.zeros(len(train))

# Model training
for fold_, (trn_idx, val_idx) in enumerate(KF.split(split_placeholder, label.values)):
    print("fold n°{}".format(fold_))

    X_train, X_val = train.iloc[trn_idx][features], train.iloc[val_idx][features]
    y_train, y_val = label.iloc[trn_idx], label.iloc[val_idx]

    model.fit(X_train, y_train, eval_set=(X_val, y_val),
              cat_features=fold_cat_features,
              early_stopping_rounds=100, verbose=300, use_best_model=True)

    # Column 1 of predict_proba is used as the positive-class probability.
    oof_cbc_ins[val_idx] += (model.predict_proba(X_val)[:, 1])
    # Each fold contributes 1/K of the final test prediction.
    predictions_cbc_ins += (model.predict_proba(test[features])[:, 1]) / K
    feat_imp_cbc_ins.append(model.feature_importances_)

607
fold n°0
0:	learn: 0.6491594	test: 0.6491298	best: 0.6491298 (0)	total: 1.82s	remaining: 1h 31m 12s
300:	learn: 0.3001315	test: 0.2990618	best: 0.2990618 (300)	total: 6m 27s	remaining: 57m 55s
600:	learn: 0.2937823	test: 0.2928453	best: 0.2928453 (600)	total: 12m 59s	remaining: 51m 53s


KeyboardInterrupt: 

In [None]:
# Out-of-fold evaluation of the CatBoost install model.
# Hard 0/1 predictions at a 0.5 threshold, computed once with a vectorised
# comparison instead of rebuilding a Python list for every metric.
pred_cbc_ins_hard = (oof_cbc_ins >= 0.5).astype(int)

# Log loss is computed on the predicted probabilities (the variant on
# thresholded predictions was disabled by the author).
logloss = metrics.log_loss(label, oof_cbc_ins)
# NOTE: despite its name, `acc` holds the ROC AUC (it is printed as "AUC").
acc = metrics.roc_auc_score(label, oof_cbc_ins)
precision = metrics.precision_score(label, pred_cbc_ins_hard)
recall = metrics.recall_score(label, pred_cbc_ins_hard)
f1 = metrics.f1_score(label, pred_cbc_ins_hard)

print(f"Logloss: {logloss:.4f}, AUC: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

NameError: name 'metrics' is not defined

In [None]:
# Submission with CatBoost install predictions; the click column is filled
# with uniform random numbers from np.random.random.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": predictions_cbc_ins,
})
# The number of features in use is embedded in the file name.
out_path = './output/cbc_42_sep_train_logloss_float_{}.csv'.format(len(features))
submission.to_csv(out_path, index=False, sep='\t')

NameError: name 'pd' is not defined

In [None]:
# One row per fold, one column per feature.
imp_by_fold_cbc_ins = pd.DataFrame(feat_imp_cbc_ins, columns=features)
# Mean importance of each feature across folds, most important first.
mean_imp_cbc_ins = imp_by_fold_cbc_ins.apply(np.mean, axis=0)
avg_imp_cbc_ins = mean_imp_cbc_ins.sort_values(ascending=False)
avg_imp_cbc_ins.describe()

In [None]:
# Keep the features whose fold-averaged CatBoost importance is at or above the
# 75th percentile, computed from the data.
# NOTE(review): the previous hard-coded cutoff 1068.65 does not correspond to
# any output shown in this notebook. CatBoost's default importances are
# presumably normalised to sum to 100, in which case that cutoff selects
# nothing. The 75th-percentile rule is assumed from the LightGBM and XGBoost
# selection cells, whose literals match the 75% row of their describe()
# outputs — confirm.
importance_cutoff_cbc = avg_imp_cbc_ins.quantile(0.75)
# avg_imp_cbc_ins is sorted descending, so the selection keeps that order.
select_fea_cbc = avg_imp_cbc_ins[avg_imp_cbc_ins >= importance_cutoff_cbc].index.tolist()
print(len(select_fea_cbc))
print(select_fea_cbc)

# 模型融合

In [38]:
# Submission files to blend and the weight of each; weights pair with files by
# position (xgb -> 1, cbc -> 2, lgb -> 3).
file_list = ['xgb_42_sep_train_logloss_float', 'cbc_42_sep_train_logloss_float', 'lgb_42_sep_train_logloss_float']
weight_list = [1, 2, 3]
# Read the 'is_installed' column of every submission as a NumPy array.
res_list = [
    pd.read_csv('./output/{}.csv'.format(name), sep='\t')['is_installed'].values
    for name in file_list
]

In [44]:
# Weighted average of the per-model predictions: sum(w_i * pred_i) / sum(w_i).
weighted_total = sum(pred * weight for pred, weight in zip(res_list, weight_list))
res_wei = weighted_total / sum(weight_list)

In [45]:
# Display the per-model prediction arrays that feed the blend.
res_list

[array([0.35779583, 0.67529753, 0.05848108, ..., 0.02568051, 0.11943987,
        0.1113366 ]),
 array([0.34246523, 0.49886412, 0.12743884, ..., 0.20235799, 0.24234735,
        0.1216858 ]),
 array([0.35230845, 0.45921298, 0.08272543, ..., 0.02216217, 0.14859903,
        0.06889433])]

In [46]:
# Display the blended (weighted-average) predictions.
res_wei

array([0.34994194, 0.50844412, 0.09358917, ..., 0.08281383, 0.17498861,
       0.0935652 ])

In [47]:
# Submission with the blended install predictions; the click column is filled
# with uniform random numbers from np.random.random.
# NOTE(review): the file name reads "xgb_lgb_cbc_1_2_3", but the blend cell
# pairs weight_list with file_list by position, i.e. xgb=1, cbc=2, lgb=3 —
# confirm which weighting was intended.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": res_wei,
})
submission.to_csv('./output/xgb_lgb_cbc_1_2_3.csv', index=False, sep='\t')

# 输出

In [26]:
# Display the random seed in use.
seed

42

In [33]:
# Submission with CatBoost install predictions; the click column is filled
# with uniform random numbers from np.random.random. This file name is one of
# the inputs listed in the blending cell's file_list.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": predictions_cbc_ins,
})
submission.to_csv('./output/cbc_42_sep_train_logloss_float.csv', index=False, sep='\t')

In [24]:
# Submission with LightGBM predictions for both targets. This file name is one
# of the inputs listed in the blending cell's file_list.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": predictions_lgb_cli,
    "is_installed": predictions_lgb_ins,
})
submission.to_csv('./output/lgb_42_sep_train_logloss_float.csv', index=False, sep='\t')

In [26]:
# Same LightGBM submission for both targets, with every probability rounded to
# 5 decimal places.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.round(predictions_lgb_cli, 5),
    "is_installed": np.round(predictions_lgb_ins, 5),
})
submission.to_csv('./output/lgb_42_sep_train_logloss_float_round5.csv', index=False, sep='\t')

In [28]:
# Submission with LightGBM install predictions; the click column is filled
# with uniform random numbers from np.random.random.
submission = pd.DataFrame({
    "RowId": test_data["f_0"],
    "is_clicked": np.random.random((test_data.shape[0])),
    "is_installed": predictions_lgb_ins,
})
submission.to_csv('./output/lgb_42_sep_train_logloss_float_ins_pred_cli_rand.csv', index=False, sep='\t')

In [27]:
# Display the most recently built submission frame.
submission

Unnamed: 0,RowId,is_clicked,is_installed
0,64505,0.54069,0.35231
1,64506,0.20442,0.45921
2,64507,0.63765,0.08273
3,64508,0.30246,0.33364
4,64509,0.91326,0.30514
...,...,...,...
160968,16240,0.67619,0.42696
160969,16241,0.09272,0.00923
160970,16242,0.22014,0.02216
160971,16243,0.50215,0.14860
