In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import re
import math
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb
import catboost as cab

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

import warnings

warnings.filterwarnings('ignore')

In [2]:
test_data = pd.read_csv('data/test/000000000000.csv', sep='\t')

In [3]:
test_data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79
0,64505,67,26325,7152,21563,19475,31440,27941,21621,14659,...,1.519085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,64506,67,20095,563,22861,19475,21280,27941,19203,14659,...,0.0,0.0,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0
2,64507,67,890,22294,18294,21545,20210,27941,18800,9638,...,0.982995,0.0,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0
3,64508,67,20095,563,22861,25604,21280,27941,18800,14659,...,0.0,0.0,2.284486,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0
4,64509,67,27426,22294,11338,19475,23855,27941,21218,9638,...,0.98604,0.0,0.0,0.0,0.077128,0.077128,0.077128,0.0,0.0,0.0


In [4]:
test_data.shape 

(160973, 80)

In [6]:
# train = []
# files = os.listdir('data/train/')
# for f in files:
#     df = pd.read_csv('data/train/{}'.format(f), sep='\t')
#     train.append(df)

In [7]:
# train_data = pd.concat(train)
# train_data.shape 

(3485852, 82)

In [27]:
# train_data.to_csv('./data/train.csv', index=False, sep='\t')

In [5]:
train_data = pd.read_csv('./data/train.csv', sep='\t')

In [6]:
train_data.shape 

(3485852, 82)

In [7]:
train_data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
0,2541188,57,26325,22294,9018,25604,943,27941,19203,21533,...,2.855607,2.284486,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
1,2541440,60,5156,22294,18971,21545,3448,27941,19606,14659,...,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,0,0
2,2541480,65,30256,22294,11104,21545,20366,27941,19203,31372,...,0.571121,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,1,0
3,2541780,63,17216,7152,15742,21545,23877,27941,19606,869,...,0.0,0.0,0.0,0.347077,0.0,0.0,0.0,0.0,0,0
4,2541833,60,9317,22294,26866,21545,32370,27941,21218,14659,...,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0


In [8]:
train_data['f_0'].unique().shape 

(3485852,)

In [9]:
data = pd.concat([train_data, test_data])

In [10]:
data.shape 

(3646825, 82)

In [11]:
data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
0,2541188,57,26325,22294,9018,25604,943,27941,19203,21533,...,2.855607,2.284486,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0
1,2541440,60,5156,22294,18971,21545,3448,27941,19606,14659,...,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0
2,2541480,65,30256,22294,11104,21545,20366,27941,19203,31372,...,0.571121,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,1.0,0.0
3,2541780,63,17216,7152,15742,21545,23877,27941,19606,869,...,0.0,0.0,0.0,0.347077,0.0,0.0,0.0,0.0,0.0,0.0
4,2541833,60,9317,22294,26866,21545,32370,27941,21218,14659,...,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0


In [12]:
train_data['f_1'].unique(), test_data['f_1'].unique()

(array([57, 60, 65, 63, 52, 46, 58, 55, 50, 62, 47, 56, 51, 48, 45, 64, 49,
        61, 53, 59, 54, 66]),
 array([67]))

In [13]:
len(train_data['f_1'].unique())

22

In [14]:
pd.value_counts(train_data.dtypes)

int64      42
float64    40
dtype: int64

In [15]:
train_data.isna().any()

f_0             False
f_1             False
f_2             False
f_3             False
f_4             False
                ...  
f_77            False
f_78            False
f_79            False
is_clicked      False
is_installed    False
Length: 82, dtype: bool

In [41]:
columns = train_data.columns
for col in columns:
    if train_data[col].isna().any():
        print(col, train_data[col].isna().sum())

f_30 1666968
f_31 1666968
f_43 181427
f_51 181427
f_58 181427
f_59 181427
f_64 181427
f_65 181427
f_66 181427
f_67 181427
f_68 181427
f_69 181427
f_70 181427


In [45]:
train_data[train_data['f_43'].isna()][['f_0', 'f_1', 'f_51', 'f_58']]

Unnamed: 0,f_0,f_1,f_51,f_58
11,2543681,55,,
22,2330527,45,,
25,2331092,46,,
64,2156779,45,,
116,485177,62,,
...,...,...,...,...
115784,1367997,50,,
115793,1370838,52,,
115835,2588476,65,,
115850,1778587,45,,


In [47]:
tmp = train_data[train_data['f_43'].isna()]
for col in columns:
    if tmp[col].isna().any():
        print(col, tmp[col].isna().sum())

f_30 91259
f_31 91259
f_43 181427
f_51 181427
f_58 181427
f_59 181427
f_64 181427
f_65 181427
f_66 181427
f_67 181427
f_68 181427
f_69 181427
f_70 181427


In [40]:
for col in test_data.columns:
    if test_data[col].isna().any():
        print(col, test_data[col].isna().sum())

f_30 43867
f_31 43867
f_43 9767
f_51 9767
f_58 9767
f_59 9767
f_64 9767
f_65 9767
f_66 9767
f_67 9767
f_68 9767
f_69 9767
f_70 9767


In [49]:
tmp = test_data[test_data['f_43'].isna()]
for col in tmp.columns:
    if tmp[col].isna().any():
        print(col, tmp[col].isna().sum())

f_30 2506
f_31 2506
f_43 9767
f_51 9767
f_58 9767
f_59 9767
f_64 9767
f_65 9767
f_66 9767
f_67 9767
f_68 9767
f_69 9767
f_70 9767


In [18]:
train_data.shape 

(3485852, 82)

In [19]:
train_data.nunique()

f_0             3485852
f_1                  22
f_2                 136
f_3                   5
f_4                 633
                 ...   
f_77                  4
f_78                 13
f_79                  7
is_clicked            2
is_installed          2
Length: 82, dtype: int64

In [20]:
for col in columns:
    print(col, train_data[col].nunique())

f_0 3485852
f_1 22
f_2 136
f_3 5
f_4 633
f_5 6
f_6 5167
f_7 1
f_8 6
f_9 7
f_10 3
f_11 24
f_12 26
f_13 329
f_14 19
f_15 5801
f_16 10
f_17 49
f_18 901
f_19 19
f_20 55
f_21 34
f_22 24
f_23 4
f_24 4
f_25 3
f_26 2
f_27 2
f_28 2
f_29 2
f_30 2
f_31 2
f_32 4
f_33 2
f_34 2
f_35 2
f_36 2
f_37 2
f_38 2
f_39 2
f_40 2
f_41 2
f_42 8808
f_43 1721
f_44 21
f_45 23
f_46 11
f_47 27
f_48 25
f_49 18
f_50 33
f_51 1741
f_52 169
f_53 98
f_54 209
f_55 379
f_56 217
f_57 506
f_58 1724
f_59 1520
f_60 394
f_61 817
f_62 1383
f_63 402
f_64 1635
f_65 1636
f_66 1722
f_67 1636
f_68 446
f_69 368
f_70 1665
f_71 4
f_72 11
f_73 8
f_74 4
f_75 31
f_76 8
f_77 4
f_78 13
f_79 7
is_clicked 2
is_installed 2


In [21]:
for col in test_data.columns:
    print(col, test_data[col].nunique())

f_0 160973
f_1 1
f_2 94
f_3 5
f_4 298
f_5 6
f_6 2210
f_7 1
f_8 6
f_9 2
f_10 3
f_11 24
f_12 18
f_13 103
f_14 17
f_15 2741
f_16 12
f_17 40
f_18 262
f_19 16
f_20 43
f_21 25
f_22 17
f_23 3
f_24 3
f_25 2
f_26 2
f_27 2
f_28 2
f_29 2
f_30 2
f_31 2
f_32 4
f_33 2
f_34 2
f_35 2
f_36 2
f_37 2
f_38 2
f_39 2
f_40 2
f_41 2
f_42 4258
f_43 86
f_44 11
f_45 12
f_46 6
f_47 12
f_48 16
f_49 12
f_50 17
f_51 88
f_52 96
f_53 62
f_54 124
f_55 252
f_56 157
f_57 343
f_58 87
f_59 80
f_60 98
f_61 340
f_62 610
f_63 187
f_64 83
f_65 83
f_66 86
f_67 83
f_68 21
f_69 18
f_70 84
f_71 4
f_72 11
f_73 8
f_74 4
f_75 31
f_76 8
f_77 3
f_78 6
f_79 5


In [22]:
for col in columns:
    print(col, data[col].nunique())

f_0 3485852
f_1 23
f_2 139
f_3 5
f_4 638
f_5 6
f_6 5234
f_7 1
f_8 6
f_9 7
f_10 3
f_11 24
f_12 26
f_13 331
f_14 19
f_15 5854
f_16 12
f_17 49
f_18 924
f_19 19
f_20 57
f_21 35
f_22 26
f_23 4
f_24 4
f_25 3
f_26 2
f_27 2
f_28 2
f_29 2
f_30 2
f_31 2
f_32 4
f_33 2
f_34 2
f_35 2
f_36 2
f_37 2
f_38 2
f_39 2
f_40 2
f_41 2
f_42 8882
f_43 1806
f_44 21
f_45 23
f_46 11
f_47 27
f_48 27
f_49 20
f_50 34
f_51 1828
f_52 171
f_53 102
f_54 212
f_55 389
f_56 220
f_57 516
f_58 1810
f_59 1593
f_60 402
f_61 825
f_62 1396
f_63 407
f_64 1717
f_65 1718
f_66 1807
f_67 1718
f_68 465
f_69 379
f_70 1748
f_71 4
f_72 11
f_73 8
f_74 4
f_75 31
f_76 8
f_77 4
f_78 13
f_79 7
is_clicked 2
is_installed 2


In [23]:
train_data['f_30'].unique()

array([ 0., nan,  1.])

In [24]:
train_data['f_31'].unique()

array([ 0., nan,  1.])

In [29]:
dates = data['f_1'].unique().tolist()

In [30]:
dates

[57,
 60,
 65,
 63,
 52,
 46,
 58,
 55,
 50,
 62,
 47,
 56,
 51,
 48,
 45,
 64,
 49,
 61,
 53,
 59,
 54,
 66,
 67]

In [33]:
for da in dates:
    print(da, data[data['f_1'] == da].shape)

57 (179008, 82)
60 (134746, 82)
65 (147011, 82)
63 (136202, 82)
52 (165538, 82)
46 (234077, 82)
58 (121408, 82)
55 (159477, 82)
50 (214404, 82)
62 (210448, 82)
47 (220135, 82)
56 (187460, 82)
51 (136566, 82)
48 (142226, 82)
45 (140206, 82)
64 (136762, 82)
49 (191145, 82)
61 (136206, 82)
53 (121879, 82)
59 (153996, 82)
54 (118980, 82)
66 (97972, 82)
67 (160973, 82)


In [34]:
len(dates)


23

In [53]:
for i in range(2, 33):
    f = 'f_{}'.format(i)
    print(f, data[f].nunique())

f_2 139
f_3 5
f_4 638
f_5 6
f_6 5234
f_7 1
f_8 6
f_9 7
f_10 3
f_11 24
f_12 26
f_13 331
f_14 19
f_15 5854
f_16 12
f_17 49
f_18 924
f_19 19
f_20 57
f_21 35
f_22 26
f_23 4
f_24 4
f_25 3
f_26 2
f_27 2
f_28 2
f_29 2
f_30 2
f_31 2
f_32 4


In [50]:
for i in range(33, 42):
    f = 'f_{}'.format(i)
    print(f, data[f].unique())

f_33 [1 0]
f_34 [1 0]
f_35 [1 0]
f_36 [1 0]
f_37 [1 0]
f_38 [1 0]
f_39 [0 1]
f_40 [0 1]
f_41 [0 1]


In [51]:
for i in range(42, 80):
    f = 'f_{}'.format(i)
    print(f, data[f].unique())

f_42 [ 90.20135611  39.56673423   5.97743061 ... 401.49051667 231.6543592
 227.41231167]
f_43 [2.28287701 1.5812269  2.4092228  ... 0.19113174 3.02705843 0.73536886]
f_44 [ 0.          0.57112147  1.71336441  1.14224294  2.28448589  3.9978503
  2.85560736  3.42672883  7.42457913  6.28233618  5.71121471  4.56897177
 11.42242943  5.14009324  6.85345766  9.70906501 10.28018648  8.56682207
  7.9957006  13.13579384 12.56467237]
f_45 [ 0.          0.57112147  1.71336441  1.14224294  2.28448589  2.85560736
  3.9978503   3.42672883  7.42457913  6.28233618  5.71121471  4.56897177
 11.42242943  6.85345766  5.14009324  9.70906501 10.28018648  8.56682207
  9.13794354  7.9957006  13.13579384 12.56467237 14.27803678]
f_46 [0.         0.57112147 1.14224294 2.28448589 1.71336441 3.9978503
 2.85560736 5.71121471 3.42672883 4.56897177 5.14009324]
f_47 [ 0.          0.57112147  1.71336441  1.14224294  2.28448589  2.85560736
  3.9978503   4.56897177  3.42672883  8.56682207  6.28233618  6.85345766
  5.1400

In [52]:
f_col = []
for i in range(42, 80):
    f = 'f_{}'.format(i)
    f_col.append(f)

data[f_col].describe()

Unnamed: 0,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,f_50,f_51,...,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79
count,3646825.0,3455631.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3455631.0,...,3455631.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0,3646825.0
mean,18.876,1.084403,0.007365268,0.009394906,0.003266683,0.01258501,0.02191617,0.01214932,0.03404248,9.676367,...,1.041742,0.04324664,0.8554965,0.2504329,0.06511,1.002406,0.2249009,0.06870399,1.075348,0.3605268
std,42.60112,0.915397,0.08579235,0.09854993,0.0521373,0.1203414,0.1397989,0.09934114,0.1820592,8.557606,...,1.094155,0.1840033,1.288103,0.5601684,0.05480815,0.3558279,0.0944249,1.626345,6.895088,3.814547
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.166491e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.930869,0.0007527682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.472436,...,0.0001559339,0.0,0.0,0.0,0.0,1.156922,0.2699485,0.0,0.0,0.0
50%,10.37373,1.157403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.783773,...,0.8758587,0.0,0.5711215,0.0,0.1156922,1.156922,0.2699485,0.0,0.0,0.0
75%,24.41106,1.739014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.79169,...,1.456695,0.0,1.142243,0.5711215,0.1156922,1.156922,0.2699485,0.0,0.0,0.0
max,23481.82,117.6025,13.13579,14.27804,5.711215,47.40308,15.42028,12.56467,28.55607,30.72614,...,74.957,1.713364,5.711215,3.99785,0.1156922,1.156922,0.2699485,112.1537,485.9995,224.3075


In [56]:
train_data = train_data[~train_data['f_43'].isna()]
test_data = test_data[~test_data['f_43'].isna()]

train_data.shape, test_data.shape 

((3304425, 82), (151206, 80))

数据处理

In [16]:
train_data = train_data.replace([np.inf, -np.inf], np.nan).fillna(0)
test_data = test_data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [17]:
train_data['label'] = (train_data['is_clicked'] * 2 + train_data['is_installed']).astype(int)

In [18]:
data = pd.concat([train_data, test_data])
data.shape 

(3646825, 83)

In [19]:
columns = data.columns
for col in columns:
    if data[col].isna().any():
        print(col, data[col].isna().sum())

is_clicked 160973
is_installed 160973
label 160973


In [20]:
data.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed,label
0,2541188,57,26325,22294,9018,25604,943,27941,19203,21533,...,2.284486,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0,0.0
1,2541440,60,5156,22294,18971,21545,3448,27941,19606,14659,...,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0,0.0
2,2541480,65,30256,22294,11104,21545,20366,27941,19203,31372,...,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,1.0,0.0,2.0
3,2541780,63,17216,7152,15742,21545,23877,27941,19606,869,...,0.0,0.0,0.347077,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2541833,60,9317,22294,26866,21545,32370,27941,21218,14659,...,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
cat_features = ['f_{}'.format(i) for i in range(1, 42)]
num_features = ['f_{}'.format(i) for i in range(42, 80)]
date_features = ['f_1']

In [22]:
for f in cat_features:
    le = LabelEncoder()
    data[f] = le.fit_transform(data[f])

In [23]:
features = [fe for fe in data.columns if fe not in ['is_clicked', 'is_installed', 'f_0', 'label']]

In [24]:
len(features)

79

In [25]:
train = data[~data['label'].isna()]
test = data[data['label'].isna()]
label = train['label'].astype('int')

train.shape, test.shape, label.shape

((3485852, 83), (160973, 83), (3485852,))

In [26]:
seed = 42
K = 5
num_class = 4

In [27]:
lgb_params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_class': num_class,  # 类别数量
    'n_jobs': 30,
    'learning_rate': 0.05,
    'num_leaves': 2 ** 6,
    'max_depth': 8,
    'tree_learner': 'serial',
    'colsample_bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    'num_boost_round': 5000,
    'max_bin': 255,
    'verbose': -1,
    'seed': seed,
    'bagging_seed': seed,
    'feature_fraction_seed': seed,
    'early_stopping_rounds': 100,
    # 'device': 'gpu',  # 设置使用 GPU 加速
    # 'gpu_platform_id': 0,  # 设置 GPU 平台 id
    # 'gpu_device_id': 0  # 设置 GPU 设备 id
}

In [None]:
# 按天f_1划分数据？？？

In [28]:
KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_lgb = list()
oof_lgb = np.zeros((len(train), num_class))
predictions_lgb = np.zeros((len(test), num_class))
print(len(features))

# 模型训练
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, label.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=label.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=label.iloc[val_idx])
    
    num_round = 3000
    clf = lgb.train(
        lgb_params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=300,
        early_stopping_rounds=100,
    )

    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(test[features], num_iteration=clf.best_iteration) / K
    feat_imp_lgb.append(clf.feature_importance())

79
fold n°0


KeyboardInterrupt: 

In [138]:
y_pred_train = np.argmax(oof_lgb, axis=1)
y_pred_test = np.argmax(predictions_lgb, axis=1)

In [139]:
acc = metrics.accuracy_score(label, y_pred_train)
precision = metrics.precision_score(label, y_pred_train, average='macro')
recall = metrics.recall_score(label, y_pred_train, average='macro')
f1 = metrics.f1_score(label, y_pred_train, average='macro')

print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Accuracy: 0.8058, Precision: 0.7572, Recall: 0.5580, F1 Score: 0.6219


In [144]:
y_pred_test.mean(), y_pred_train.mean(), label.mean()

(1.4892435377361422, 0.35223583789558477, 0.6136892788334101)

In [183]:
feat_imp_lgb

[array([21506, 14961,  3061, 31024, 10274, 32233,     0, 10191, 11289,
         6086, 34042, 14120, 15038, 16292, 44874, 11286, 18844, 14763,
         5632,  7587,  4257,  2022,   396,   203,    20,     0,     0,
            0,     0,   449,   153,  7027,  1519,  2440,  2201,  1784,
         1361,   846,   381,  1907,   955, 41214, 14499,   797,   832,
          694,  1346,  2001,  1660,  2985, 18339, 11450,  8760, 15981,
        21580, 17891, 28263, 20359, 20325,  3452, 24231, 22731, 18306,
        16815, 17017, 11065, 18680,  9234,  9531, 14831,  1645, 11525,
         5636,  3553,  9514,  3712,   111,  1182,   344], dtype=int32),
 array([20954, 14370,  2931, 30292,  9780, 30565,     0,  9605, 10858,
         5782, 32250, 13594, 14427, 15260, 42853, 10781, 18120, 14147,
         5464,  7488,  4068,  1941,   336,   180,    27,     1,     0,
            0,     0,   441,   134,  6665,  1556,  2370,  2043,  1670,
         1380,   760,   319,  1794,   867, 39240, 14047,   918,   898,
     

In [184]:
avg_imp = pd.DataFrame(feat_imp_lgb, columns = features).apply(np.mean, axis = 0).sort_values(ascending=False)
avg_imp.describe()

count       79.000000
mean      9688.367089
std      10113.997509
min          0.000000
25%       1343.500000
50%       6767.000000
75%      15598.200000
max      43919.200000
dtype: float64

In [170]:
is_installed = y_pred_test % 2
is_clicked = (y_pred_test - is_installed) / 2

In [172]:
submission = pd.DataFrame()
submission["RowId"] = test_data["f_0"]
submission["is_clicked"] = is_clicked.astype('int')
submission["is_installed"] = is_installed.astype('int')
submission.to_csv('./output/lgb_42_init.csv', index=False, sep='\t')

In [185]:
avg_imp.head(10)

f_15    43919.2
f_42    39977.0
f_11    33137.2
f_6     31465.2
f_4     30623.4
f_57    27446.8
f_61    23441.6
f_62    22207.8
f_1     21177.2
f_55    21004.0
dtype: float64

In [186]:
avg_imp

f_15    43919.2
f_42    39977.0
f_11    33137.2
f_6     31465.2
f_4     30623.4
         ...   
f_26        0.8
f_7         0.0
f_29        0.0
f_28        0.0
f_27        0.0
Length: 79, dtype: float64

In [None]:
print("AUC score: {}".format(metrics.roc_auc_score(label, oof_lgb)))
print("F1 score: {}".format(metrics.f1_score(label, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(metrics.precision_score(label, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(metrics.recall_score(label, [1 if i >= 0.5 else 0 for i in oof_lgb])))

In [140]:
xgb_params = {  # baseline-finetune
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'random_state': seed,
    'booster': 'gbtree',
    'n_estimators': 3000, 
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 7, 
    'subsample': 0.8,
    'colsample_bytree': 0.6, 
    'colsample_bylevel': 1.0,
    'alpha': 0.8,
    'lambda': 13, 
    'gamma': 0.9, 
    'tree_method': 'exact',
}

In [141]:
KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_xgb = list()

oof_xgb = np.zeros((len(train), num_class))
predictions_xgb = np.zeros((len(test), num_class))
print(len(features))

# 模型训练
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, label.values)):
    print("fold n°{}".format(fold_))
    X_train, X_val = train.iloc[trn_idx][features], train.iloc[val_idx][features]
    y_train, y_val = label.iloc[trn_idx], label.iloc[val_idx]
    trn_data = xgb.DMatrix(X_train, label=y_train)
    val_data = xgb.DMatrix(X_val, label=y_val)

    num_round = 3000
    clf = xgb.train(
        xgb_params,
        trn_data,
        num_round,
        evals = [(trn_data, 'train'), (val_data, 'val')],
        verbose_eval=300,
        early_stopping_rounds=100,
    )

    oof_xgb[val_idx] = clf.predict(val_data, iteration_range=(0, clf.best_iteration))
    predictions_xgb[:] += clf.predict(xgb.DMatrix(test[features]), iteration_range=(0, clf.best_iteration)) / K
    feat_imp_xgb.append(clf.get_score())

79
fold n°0
Parameters: { "n_estimators" } are not used.



XGBoostError: [12:37:16] ../src/objective/regression_obj.cu:148: label must be in [0,1] for logistic regression
Stack trace:
  [bt] (0) /home/panda/anaconda3/envs/libcityng/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x674193) [0x7ff7e7dbf193]
  [bt] (1) /home/panda/anaconda3/envs/libcityng/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x68d7b8) [0x7ff7e7dd87b8]
  [bt] (2) /home/panda/anaconda3/envs/libcityng/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x2e0492) [0x7ff7e7a2b492]
  [bt] (3) /home/panda/anaconda3/envs/libcityng/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7ff7e78875f0]
  [bt] (4) /home/panda/anaconda3/envs/libcityng/lib/python3.9/lib-dynload/../../libffi.so.7(+0x69dd) [0x7ff8a39c59dd]
  [bt] (5) /home/panda/anaconda3/envs/libcityng/lib/python3.9/lib-dynload/../../libffi.so.7(+0x6067) [0x7ff8a39c5067]
  [bt] (6) /home/panda/anaconda3/envs/libcityng/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x140f6) [0x7ff8a39df0f6]
  [bt] (7) /home/panda/anaconda3/envs/libcityng/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x1073e) [0x7ff8a39db73e]
  [bt] (8) /home/panda/anaconda3/envs/libcityng/bin/python(_PyObject_MakeTpCall+0x37f) [0x56168da129ef]



In [None]:
cbc_params = {
    'random_state': seed,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'verbose': False,
    'learning_rate': 0.05,
    'depth': 5,
    'rsm': 0.2020238568794654,
    # 'min_data_in_leaf': 255,
    # 'l2_leaf_reg': 5,
    # 'subsample': 0.7,
    # 'use_best_model': True,
    # 'max_leaves': 12,
    'metric_period': 500,
    'n_estimators': 3000,
}

In [None]:
KF = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)
feat_imp_cbc = list()

oof_cbc = np.zeros((len(train), num_class))
predictions_cbc = np.zeros((len(test), num_class))
print(len(features))

model = CatBoostClassifier(**cbc_params)

# 模型训练
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, label.values)):
    print("fold n°{}".format(fold_))

    X_train, X_val = train.iloc[trn_idx][features], train.iloc[val_idx][features]
    y_train, y_val = label.iloc[trn_idx], label.iloc[val_idx]
    
    model.fit(X_train, y_train, eval_set=(X_val, y_val), 
              cat_features=cat_features,
              early_stopping_rounds=500, verbose=5000, use_best_model=True)

    oof_cbc[val_idx] += (model.predict_proba(X_val)[:, 1])
    predictions_cbc += (model.predict_proba(test[features])[:, 1]) / K
    feat_imp_cbc.append(model.feature_importances_)