In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from datetime import timedelta
import time as datetime
from gensim.models import Word2Vec
# from feature_selector import FeatureSelector
from gensim.models import Word2Vec

from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer


from deepctr.models import xDeepFM
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
import warnings
warnings.filterwarnings('ignore')

import gc
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
# data = pd.read_pickle('../data/train_ctr_test.pickle')

data = pd.read_pickle('../data/train67_test.pickle')


In [None]:
data.columns

In [None]:
data.head()

In [None]:
# 内存压缩
def data_compression(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
train = data[~data['label'].isna()]
test = data[data['label'].isna()]

# Target Encoder


In [6]:
#  对ID特征进行 Target Encoder
def kfold_stats_feature(train,test,feats,k):
    folds = StratifiedKFold(n_splits=k,shuffle=True,random_state=2020)
    train['fold'] = None
    for fold_,(trn_idx, val_idx) in enumerate(folds.split(train,train['label'])):
        train.loc[val_idx,'fold'] = fold_
        
    kfold_features = []
    for feat in tqdm(feats):
        nums_columns = ['label']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_,(trn_idx,val_idx) in enumerate(folds.split(train,train['label'])):
                order_label = train.iloc[trn_idx].groupby([feat])[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, [feat]][feat].map(order_label)
                
                # fillna
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)
            
            
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            test[colname] = test[colname].astype(float)        
    del train['fold']
    return train,test

In [7]:
target_encode_cols = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 
                  	  'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',]
train,test =kfold_stats_feature(train,test,target_encode_cols,5)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:39<00:00, 51.99s/it]


In [9]:
len(train),len(test)

(12027161, 1000000)

（1）组合点击率特征
（2）slot_id的分布，
（3）下一次点击网络变化
df_feature['next_netmodel'] = df_feature.groupby(['deviceid'])[ 'netmodel'].shift(-1)
（4） 对uid 进行分组，得到每个task_id对应的多个u_id，word2vec,得到每个广告的vec表示形式
（5) 每个gender age  career  city city_rank device_name点击的广告行业的均值

In [10]:
#  对ID特征进行 Target Encoder
def mix_kfold_stats_feature(train,test,mix_1,mix_2,k):
    folds = StratifiedKFold(n_splits=k,shuffle=True,random_state=2020)
    train['fold'] = None
    for fold_,(trn_idx, val_idx) in enumerate(folds.split(train,train['label'])):
        train.loc[val_idx,'fold'] = fold_
        
    kfold_features = []
    for feat_1 in tqdm(mix_1):
        for feat_2 in mix_2:
            nums_columns = ['label']
            for f in nums_columns:
                mix1_mix_2_col = feat_1 +'_'+ feat_2
                colname = feat_1 +'_'+ feat_2 + '_' + f + '_kfold_mean'
                kfold_features.append(colname)
                train[colname] = None
                train[mix1_mix_2_col] =  train[feat_1].astype(str) + '_'+ train[feat_2].astype(str)
                test[mix1_mix_2_col] =  test[feat_1].astype(str) + '_'+ test[feat_2].astype(str)

                for fold_,(trn_idx,val_idx) in enumerate(folds.split(train,train['label'])):
                    order_label = train[[mix1_mix_2_col,f]].iloc[trn_idx].groupby([mix1_mix_2_col])[f].mean()
#                     print(order_label.reset_index())
                    train.loc[train.fold == fold_, colname] =\
                            train.loc[train.fold == fold_, [mix1_mix_2_col]][mix1_mix_2_col].map(order_label)

#                     # fillna
                    global_mean = train[f].mean()
                    train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
                train[colname] = train[colname].astype(float)


            for f in nums_columns:
                colname = feat_1 +'_'+ feat_2 + '_' + f + '_kfold_mean'
                test[colname] = None
                order_label = train[[mix1_mix_2_col,f]].groupby([mix1_mix_2_col])[f].mean()
                test[colname] = test[mix1_mix_2_col].map(order_label)
                test[colname] = test[colname].astype(float)
            del train[mix1_mix_2_col]
            del test[mix1_mix_2_col]
    del train['fold']
    del order_label
    gc.collect()
    return train,test

In [11]:
# 组合交叉点击率
# task_id(广告id) / ndu_name(广告行业) /adv_prim_id(广告主) 与gender age  career  city city_rank device_name
mix_1_traget_encode_cols = ['task_id','adv_id','indu_name','adv_prim_id','spread_app_id','tags','app_first_class','app_second_class']
mix_2_traget_encode_cols = ['age','gender','career','city','city_rank','device_name','device_size','residence','his_app_size']
train,test = mix_kfold_stats_feature(train,test,mix_1_traget_encode_cols,mix_2_traget_encode_cols,5)


100%|█████████████████████████████████████████████████████████████████████████████████| 8/8 [1:19:01<00:00, 592.70s/it]


In [15]:
data = pd.concat([train,test],sort=False).reset_index(drop=True)

In [16]:
data.shape

(13027161, 118)

In [23]:
data.tail()

Unnamed: 0,label,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,...,app_first_class_his_app_size_label_kfold_mean,app_second_class_age_label_kfold_mean,app_second_class_gender_label_kfold_mean,app_second_class_career_label_kfold_mean,app_second_class_city_label_kfold_mean,app_second_class_city_rank_label_kfold_mean,app_second_class_device_name_label_kfold_mean,app_second_class_device_size_label_kfold_mean,app_second_class_residence_label_kfold_mean,app_second_class_his_app_size_label_kfold_mean
13027156,,1759244,2847,6231,6,112,60,3,17,78,...,0.020524,0.017155,0.019356,0.018434,0.017828,0.02203,0.015815,0.014989,0.019632,0.019863
13027157,,1009170,3382,5695,6,209,60,3,11,78,...,0.020524,0.0175,0.019356,0.018434,0.02069,0.02203,0.016237,0.019806,0.023406,0.019863
13027158,,2052980,4190,4425,8,142,60,5,12,80,...,0.020524,0.018463,0.025812,0.026042,0.028051,0.026682,0.020186,0.022671,0.025631,0.023153
13027159,,1721002,5362,6801,7,104,37,5,17,50,...,0.040085,0.038182,0.041787,0.0517,0.039499,0.03906,0.033092,0.032404,0.041509,0.037365
13027160,,2019155,1397,6167,7,162,60,5,17,70,...,0.020524,0.033163,0.040778,0.033345,0.035259,0.042065,0.033304,0.038918,0.035626,0.01677


In [22]:
del train
del test 
gc.collect()

56

#  Count Encoder

In [24]:
# Count Encoder
to_count = [['task_id'], ['adv_id'], ['creat_type_cd'], ['adv_prim_id'], 
            ['dev_id'], ['inter_type_cd'], ['slot_id'], ['spread_app_id'], ['tags'], ['app_first_class'],
            ['app_second_class'], ['age'], ['city'], ['city_rank'], ['device_name'], ['device_size'],
            ['career'], ['gender'], ['net_type'], ['residence'], ['his_app_size'], ['his_on_shelf_time'],
            ['app_score'], ['emui_dev'], ['list_time'], ['device_price'], ['up_life_duration'], ['up_membership_grade'],
            ['membership_life_duration'], ['consume_purchase'], ['communication_onlinerate'], ['communication_avgonline_30d'],
            ['indu_name']]

for i in tqdm(to_count):
    data['{}_count'.format('_'.join(i))] = data[i].groupby(i)[i].transform('count')
    

100%|██████████████████████████████████████████████████████████████████████████████████| 33/33 [03:06<00:00,  5.65s/it]


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 151 entries, label to indu_name_count
dtypes: float64(83), int64(67), object(1)
memory usage: 14.7+ GB


In [27]:
# 压缩内存
data = data_compression(data)

  0%|                                                                                          | 0/151 [00:00<?, ?it/s]

Memory usage of dataframe is 15007.79 MB


100%|████████████████████████████████████████████████████████████████████████████████| 151/151 [05:58<00:00,  2.38s/it]

Memory usage after optimization is: 4248.99 MB
Decreased by 71.7%





In [30]:
# data.isna().sum()
gc.collect()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 151 entries, label to indu_name_count
dtypes: category(1), float16(83), int16(5), int32(34), int8(28)
memory usage: 4.1 GB


In [31]:
# task_id分组（交叉） Count Encoder
to_count_1 = ['adv_id','creat_type_cd','adv_prim_id','dev_id','spread_app_id','tags','app_first_class','app_second_class','indu_name','inter_type_cd']
for col in tqdm(to_count_1):
    data["{}_count".format("_".join(i))] = data[['task_id',col]].groupby(['task_id'])[col].transform('count')


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.60it/s]


In [32]:
# u_id 分组（交叉） Count Encoder
# 同一个广告推荐给用户的次数
data['uid_adv_id_cnt'] = data[['uid','adv_id']].groupby(['uid','adv_id'])['adv_id'].transform('count')

In [33]:
# 每个广告在不同地区，age的推荐次数
mix_1_count_cols = ['task_id','adv_id','indu_name','adv_prim_id','spread_app_id','tags','app_first_class','app_second_class']
mix_2_count_cols = ['age','gender','career','city','city_rank','device_name','device_size','residence','his_app_size']
for i in tqdm(mix_1_count_cols):
    for j in mix_2_count_cols:
         data[i+'_'+j+'_'+'cnt'] = data[[i,j]].groupby([i,j])[j].transform('count')

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [05:50<00:00, 43.79s/it]


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 224 entries, label to app_second_class_his_app_size_cnt
dtypes: category(1), float16(83), int16(5), int32(33), int64(74), int8(28)
memory usage: 11.3 GB


In [35]:
data.to_feather('../data/tmp/1.feathher')

In [38]:
data.columns[32]

'communication_onlinerate'

In [39]:
del data['communication_onlinerate']

In [40]:
# 压缩内存
data = data_compression(data)

  0%|                                                                                          | 0/223 [00:00<?, ?it/s]

Memory usage of dataframe is 11529.16 MB


100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [02:56<00:00,  1.26it/s]

Memory usage after optimization is: 7665.40 MB
Decreased by 33.5%





In [41]:
data.to_feather('../data/tmp/1_reduce.feathher')

In [43]:
data.columns

Index(['label', 'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id',
       ...
       'COUNT-2order_uid_app_first_class', 'STAT_uid_city_nunique_1',
       'STAT_uid_city_nunique_2', 'COUNT-2order_uid_city',
       'STAT_uid_device_name_nunique_1', 'STAT_uid_device_name_nunique_2',
       'COUNT-2order_uid_device_name', 'STAT_uid_net_type_nunique_1',
       'STAT_uid_net_type_nunique_2', 'COUNT-2order_uid_net_type'],
      dtype='object', length=253)

# Nunique Encoder

In [None]:
# Count Encoder
# ['uid','communication_onlinerate'],
to_group = [
    ['uid','task_id'], ['uid','adv_id'], ['uid','adv_prim_id'], ['uid','dev_id'], ['uid','slot_id'],
    ['uid','spread_app_id'], ['uid','app_first_class'], ['uid','city'], ['uid','device_name'], ['uid', 'net_type'],
     ['uid','list_time']
]

feature = pd.DataFrame()
for i in tqdm(to_group):
    data["STAT_{}_nunique_1".format("_".join(i))] = data[i].groupby(i[1])[i[0]].transform('nunique')
    data["STAT_{}_nunique_2".format("_".join(i))] = data[i].groupby(i[0])[i[1]].transform('nunique')
    data["COUNT-2order_{}".format("_".join(i))] = data[i].groupby(i)[i[0]].transform("count")

In [45]:
data.columns 

Index(['label', 'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id',
       ...
       'COUNT-2order_uid_city', 'STAT_uid_device_name_nunique_1',
       'STAT_uid_device_name_nunique_2', 'COUNT-2order_uid_device_name',
       'STAT_uid_net_type_nunique_1', 'STAT_uid_net_type_nunique_2',
       'COUNT-2order_uid_net_type', 'STAT_uid_list_time_nunique_1',
       'STAT_uid_list_time_nunique_2', 'COUNT-2order_uid_list_time'],
      dtype='object', length=256)

In [47]:
# 压缩内存
data = data_compression(data)


  0%|                                                                                          | 0/256 [00:00<?, ?it/s][A

Memory usage of dataframe is 9491.68 MB



  0%|▎                                                                                 | 1/256 [00:00<00:57,  4.47it/s][A
  1%|▋                                                                                 | 2/256 [00:00<00:51,  4.96it/s][A
  2%|█▎                                                                                | 4/256 [00:00<00:42,  5.92it/s][A
  2%|█▉                                                                                | 6/256 [00:00<00:35,  7.01it/s][A
  3%|██▌                                                                               | 8/256 [00:00<00:30,  8.26it/s][A
  4%|███▏                                                                             | 10/256 [00:01<00:26,  9.43it/s][A
  5%|███▊                                                                             | 12/256 [00:01<00:23, 10.48it/s][A
  5%|████▍                                                                            | 14/256 [00:01<00:21, 11.26it/s][A
  6%|█████     

 59%|███████████████████████████████████████████████▌                                | 152/256 [00:26<00:14,  7.41it/s][A
 60%|███████████████████████████████████████████████▊                                | 153/256 [00:26<00:14,  7.24it/s][A
 60%|████████████████████████████████████████████████▏                               | 154/256 [00:26<00:14,  7.12it/s][A
 61%|████████████████████████████████████████████████▊                               | 156/256 [00:27<00:13,  7.47it/s][A
 62%|█████████████████████████████████████████████████▍                              | 158/256 [00:27<00:12,  7.71it/s][A
 62%|██████████████████████████████████████████████████                              | 160/256 [00:27<00:12,  7.87it/s][A
 63%|██████████████████████████████████████████████████▎                             | 161/256 [00:27<00:12,  7.50it/s][A
 63%|██████████████████████████████████████████████████▋                             | 162/256 [00:28<00:13,  7.08it/s][A
 64%|███████████

Memory usage after optimization is: 8622.03 MB
Decreased by 9.2%





In [5]:
# data.to_feather('../data/tmp/2_reduce.feathher')

74

In [2]:
%time data = pd.read_feather('../data/tmp/2_reduce.feathher')

Wall time: 8.67 s
Wall time: 8.67 s
Wall time: 8.67 s


# 统计特征

In [None]:
# to_group = [['task_id'], ['dev_id'], ['adv_prim_id'], ['adv_id'], ['inter_type_cd'], ['slot_id'], ['tags'], ['app_first_class']]
# to_inter =  [ 'age', 'city_rank', 'career', 'his_app_size', 'his_on_shelf_time', 'app_score', 'emui_dev', 'device_price', 'up_life_duration', 'communication_avgonline_30d']
# # to_calc = ['std','mean','min','max',lambda x:np.std(np.fft.fft(x))]
# to_calc = ['std','mean','min','max']

# for i in tqdm(to_group):
#     for j in to_inter:
#         for k in to_calc:
#             data['STAT_{}_{}_{}'.format('_'.join(i),j,k)] = data[i + [j]].groupby(i)[j].transform(k)



In [3]:
to_group = [['task_id'], ['dev_id'], ['adv_prim_id'], ['adv_id']]
to_inter =  [ 'age', 'city_rank', 'career', 'his_app_size', 'his_on_shelf_time', 'app_score', 'emui_dev', 'device_price', 'up_life_duration', 'communication_avgonline_30d']
# to_calc = ['std','mean','min','max',lambda x:np.std(np.fft.fft(x))]
to_calc = ['std','mean']

for i in tqdm(to_group):
    for j in to_inter:
        for k in to_calc:
            data['STAT_{}_{}_{}'.format('_'.join(i),j,k)] = data[i + [j]].groupby(i)[j].transform(k)



100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:36<00:00, 99.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:36<00:00, 99.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:36<00:00, 99.21s/it]


In [6]:
# 压缩
data = data_compression(data)
import gc
gc.collect()

  0%|                                                                                          | 0/336 [00:00<?, ?it/s]

Memory usage of dataframe is 16399.24 MB
Memory usage of dataframe is 16399.24 MB
Memory usage of dataframe is 16399.24 MB


100%|████████████████████████████████████████████████████████████████████████████████| 336/336 [04:09<00:00,  1.35it/s]




Memory usage after optimization is: 10584.97 MB
Decreased by 35.5%
Memory usage after optimization is: 10584.97 MB
Decreased by 35.5%


0

Memory usage after optimization is: 10584.97 MB
Decreased by 35.5%


0

0

In [8]:
data.to_feather('../data/tmp/3_reducer.feather')

In [9]:
to_group = [['inter_type_cd'], ['slot_id'], ['tags'], ['app_first_class']]
to_inter =  [ 'age', 'city_rank', 'career', 'his_app_size', 'his_on_shelf_time', 'app_score', 'emui_dev', 'device_price', 'up_life_duration', 'communication_avgonline_30d']
# to_calc = ['std','mean','min','max',lambda x:np.std(np.fft.fft(x))]
to_calc = ['std','mean']

for i in tqdm(to_group):
    for j in to_inter:
        for k in to_calc:
            data['STAT_{}_{}_{}'.format('_'.join(i),j,k)] = data[i + [j]].groupby(i)[j].transform(k)



100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [06:43<00:00, 103.13s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [06:43<00:00, 100.79s/it]



In [10]:
# 压缩
data = data_compression(data)
import gc
gc.collect()

  0%|                                                                                          | 0/416 [00:00<?, ?it/s]

Memory usage of dataframe is 18536.11 MB
Memory usage of dataframe is 18536.11 MB
Memory usage of dataframe is 18536.11 MB


100%|████████████████████████████████████████████████████████████████████████████████| 416/416 [05:32<00:00,  1.25it/s]




Memory usage after optimization is: 12572.75 MB
Decreased by 32.2%
Memory usage after optimization is: 12572.75 MB
Decreased by 32.2%


15

Memory usage after optimization is: 12572.75 MB
Decreased by 32.2%


15

15

In [12]:
data.head()

Unnamed: 0,label,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,...,STAT_app_first_class_app_score_std,STAT_app_first_class_app_score_mean,STAT_app_first_class_emui_dev_std,STAT_app_first_class_emui_dev_mean,STAT_app_first_class_device_price_std,STAT_app_first_class_device_price_mean,STAT_app_first_class_up_life_duration_std,STAT_app_first_class_up_life_duration_mean,STAT_app_first_class_communication_avgonline_30d_std,STAT_app_first_class_communication_avgonline_30d_mean
0,0.0,2162947,5936,2934,6,126,19,5,18,57,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
1,0.0,1802604,4189,6509,7,122,18,5,16,56,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
2,0.0,2221433,1689,2069,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
3,1.0,2100689,2926,4168,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
4,0.0,2101862,1086,1219,5,142,36,5,18,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625


Unnamed: 0,label,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,...,STAT_app_first_class_app_score_std,STAT_app_first_class_app_score_mean,STAT_app_first_class_emui_dev_std,STAT_app_first_class_emui_dev_mean,STAT_app_first_class_device_price_std,STAT_app_first_class_device_price_mean,STAT_app_first_class_up_life_duration_std,STAT_app_first_class_up_life_duration_mean,STAT_app_first_class_communication_avgonline_30d_std,STAT_app_first_class_communication_avgonline_30d_mean
0,0.0,2162947,5936,2934,6,126,19,5,18,57,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
1,0.0,1802604,4189,6509,7,122,18,5,16,56,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
2,0.0,2221433,1689,2069,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
3,1.0,2100689,2926,4168,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
4,0.0,2101862,1086,1219,5,142,36,5,18,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625


Unnamed: 0,label,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,...,STAT_app_first_class_app_score_std,STAT_app_first_class_app_score_mean,STAT_app_first_class_emui_dev_std,STAT_app_first_class_emui_dev_mean,STAT_app_first_class_device_price_std,STAT_app_first_class_device_price_mean,STAT_app_first_class_up_life_duration_std,STAT_app_first_class_up_life_duration_mean,STAT_app_first_class_communication_avgonline_30d_std,STAT_app_first_class_communication_avgonline_30d_mean
0,0.0,2162947,5936,2934,6,126,19,5,18,57,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
1,0.0,1802604,4189,6509,7,122,18,5,16,56,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
2,0.0,2221433,1689,2069,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
3,1.0,2100689,2926,4168,3,142,36,5,12,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625
4,0.0,2101862,1086,1219,5,142,36,5,18,80,...,0.0,2.0,4.507812,19.1875,1.168945,3.376953,9.117188,12.34375,1.620117,11.15625


In [14]:
# 特征选择
data_len = len(data)
drop_feat = ['label','id','uid','pt_d']
feature_name = [i for i in data.columns if i not in drop_feat]
useless_col = []
for col in tqdm(feature_name):
    if (data[col].value_counts().iloc[0] / data_len) > 0.95:
        useless_col.append(col)

100%|████████████████████████████████████████████████████████████████████████████████| 413/413 [01:43<00:00,  3.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 413/413 [01:43<00:00,  3.99it/s]



In [15]:
useless_col

['membership_life_duration',
 'membership_life_duration_count',
 'STAT_uid_city_nunique_2',
 'STAT_uid_device_name_nunique_2',
 'STAT_uid_list_time_nunique_2',
 'STAT_task_id_his_app_size_std',
 'STAT_task_id_his_on_shelf_time_std',
 'STAT_task_id_app_score_std',
 'STAT_dev_id_his_on_shelf_time_std',
 'STAT_dev_id_app_score_std',
 'STAT_adv_id_his_app_size_std',
 'STAT_adv_id_his_on_shelf_time_std',
 'STAT_adv_id_app_score_std',
 'STAT_app_first_class_app_score_std']

['membership_life_duration',
 'membership_life_duration_count',
 'STAT_uid_city_nunique_2',
 'STAT_uid_device_name_nunique_2',
 'STAT_uid_list_time_nunique_2',
 'STAT_task_id_his_app_size_std',
 'STAT_task_id_his_on_shelf_time_std',
 'STAT_task_id_app_score_std',
 'STAT_dev_id_his_on_shelf_time_std',
 'STAT_dev_id_app_score_std',
 'STAT_adv_id_his_app_size_std',
 'STAT_adv_id_his_on_shelf_time_std',
 'STAT_adv_id_app_score_std',
 'STAT_app_first_class_app_score_std']

['membership_life_duration',
 'membership_life_duration_count',
 'STAT_uid_city_nunique_2',
 'STAT_uid_device_name_nunique_2',
 'STAT_uid_list_time_nunique_2',
 'STAT_task_id_his_app_size_std',
 'STAT_task_id_his_on_shelf_time_std',
 'STAT_task_id_app_score_std',
 'STAT_dev_id_his_on_shelf_time_std',
 'STAT_dev_id_app_score_std',
 'STAT_adv_id_his_app_size_std',
 'STAT_adv_id_his_on_shelf_time_std',
 'STAT_adv_id_app_score_std',
 'STAT_app_first_class_app_score_std']

In [18]:
data.drop(columns=useless_col,inplace=True)

In [None]:
# 回调 随机游走

嵌入特征
['uid', 'task_id'] ->[adv_id，slot_id,tags]
逆向思维：

In [20]:
data.to_feather('../data/tmp/feat_all_reduce.feather')

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 402 entries, label to STAT_app_first_class_communication_avgonline_30d_mean
dtypes: float16(233), int16(22), int32(110), int8(37)
memory usage: 12.0 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 402 entries, label to STAT_app_first_class_communication_avgonline_30d_mean
dtypes: float16(233), int16(22), int32(110), int8(37)
memory usage: 12.0 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 402 entries, label to STAT_app_first_class_communication_avgonline_30d_mean
dtypes: float16(233), int16(22), int32(110), int8(37)
memory usage: 12.0 GB


# train and test

In [2]:
%time data=pd.read_feather('../data/tmp/feat_all_reduce.feather')

Wall time: 51.5 s


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 402 entries, label to STAT_app_first_class_communication_avgonline_30d_mean
dtypes: float16(233), int16(22), int32(110), int8(37)
memory usage: 12.0 GB


In [2]:
# i_sparse_features = ['uid','task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
#                    'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
#                    'app_first_class', 'app_second_class','city', 'city_rank',
#                    'device_name','career', 'gender', 'net_type', 'residence', 
#                    'consume_purchase','indu_name','label']
# %time sparse_data = pd.read_feather('../data/data_feat_all_reduce.feather',columns = i_sparse_features)
# %time dense_feature_scaler = pd.read_feather('../data/dense_feature_scaler.feather')
# pt_d  = pd.read_feather('../data/data_feat_all_reduce.feather',columns = ['pt_d'])
# data = pd.concat([sparse_data,dense_feature_scaler],axis=1)
# data = pd.concat([data,pt_d],axis=1)

# sparse_features = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
#                    'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
#                    'app_first_class', 'app_second_class','city', 'city_rank',
#                    'device_name','career', 'gender', 'net_type', 'residence', 
#                    'consume_purchase','indu_name',]
# sparse_data[sparse_features] = sparse_data[sparse_features].fillna('-1', )
# types_map = sparse_data.dtypes
# for feat in sparse_features:
#     sparse_data[feat] = LabelEncoder().fit_transform(sparse_data[feat]).astype(types_map[feat])

Wall time: 1.35 s
Wall time: 13.1 s


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6302870 entries, 0 to 6302869
Columns: 454 entries, uid to pt_d
dtypes: float16(432), int16(4), int32(1), int8(17)
memory usage: 5.2 GB


In [4]:
drop_feat = ['label','id','uid','pt_d','communication_onlinerate']
feature_name = [i for i in data.columns if i not in drop_feat]

In [5]:

# membership_life_duration communication_onlinerate
sparse_features = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
                   'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
                   'app_first_class', 'app_second_class','city', 'city_rank',
                   'device_name','career', 'gender', 'net_type', 'residence', 
                   'consume_purchase','indu_name',]
dense_features_1 = ['age', 'device_size',  'his_app_size', 'his_on_shelf_time', 'app_score',
                   'emui_dev', 'list_time', 'device_price', 'up_life_duration',
                  'communication_avgonline_30d']
dense_features_2 = [i for i in feature_name if i not in (sparse_features+dense_features_1)]
dense_features = dense_features_2 + dense_features_1
target = ['label']


In [8]:
print('begin fillna....')
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0,)

print('begin labelEncoder....')
types_map = data.dtypes
for feat in sparse_features:
    data[feat] = LabelEncoder().fit_transform(data[feat]).astype(types_map[feat])
 



begin fillna....


In [None]:
tmp_feat = pd.DataFrame()
for feat in tqdm(dense_features):
    tmp_feat[feat] = MinMaxScaler(feature_range=(0,1)).fit_transform(data[[feat]]).astype(np.float16)[:,0]
    del data[feat]
    gc.collect()

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [05:09<00:00,  6.18s/it]


In [13]:
gc.collect()

272

In [31]:
data = pd.concat([data,tmp_feat],axis=1)

In [33]:
# data.to_feather('../data/tmp/feat_all_reduce_scale.feather')

# 此处开始


In [3]:
%time data = pd.read_feather('../data/tmp/feat_all_reduce_scale.feather')

Wall time: 1min 27s


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027161 entries, 0 to 13027160
Columns: 402 entries, label to communication_avgonline_30d
dtypes: float16(330), float64(50), int16(4), int32(1), int8(17)
memory usage: 13.2 GB


In [6]:
fixlen_feature_columns  = [SparseFeat(feat,vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features]\
            + [DenseFeat(feat,1,) for feat in dense_features]

In [7]:
# 生成特征列
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [8]:
# train = data[data['pt_d'] == 6].reset_index(drop=True)
# val = data[data['pt_d'] == 7].reset_index(drop=True)
train = data[data['pt_d'] != 8].reset_index(drop=True)
test = data[data['pt_d'] == 8].reset_index(drop=True)

In [9]:
del data 
import gc
gc.collect()

47

In [10]:
train_model_input = {name: train[name].values for name in feature_names}
# val_model_input = {name: val[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

In [13]:
model = xDeepFM(linear_feature_columns,dnn_feature_columns,task='binary')
model.compile('adam','binary_crossentropy',metrics=['binary_crossentropy','accuracy'])

history = model.fit(train_model_input,train[target].values,batch_size=4096,epochs=2,
                    validation_data = (val_model_input,val[target].values))


Train on 6017985 samples, validate on 6009176 samples
Epoch 1/2
Epoch 2/2


In [14]:
pred_ans = model.predict(val_model_input,batch_size=4096*2)
roc_auc_score(val[target].values,pred_ans)

0.7950158895811508

In [11]:
# 第二次训练，验证集作为训练
model = xDeepFM(linear_feature_columns,dnn_feature_columns,task='binary')
model.compile('adam','binary_crossentropy',metrics=['binary_crossentropy','accuracy'])

history = model.fit(train_model_input,train[target].values,batch_size=4096,epochs=2,)

Train on 12027161 samples
Epoch 1/2
Epoch 2/2


In [25]:
pred_ans = model.predict(train_model_input,batch_size=4096*2)
roc_auc_score(train[target].values,pred_ans)

0.7949279315859

In [13]:
pred_ans = model.predict(test_model_input,batch_size=4096*4)

In [14]:
pred_ans

array([[0.01364288],
       [0.10171694],
       [0.3447391 ],
       ...,
       [0.02585912],
       [0.01625967],
       [0.01853165]], dtype=float32)

In [15]:
submit = pd.DataFrame()
submit['id'] = range(1,len(test)+1)
submit['probability'] = pred_ans


In [30]:
submit.head()

Unnamed: 0,id,probability
0,1,0.00938
1,2,0.05944
2,3,0.259231
3,4,0.018994
4,5,0.059031


In [16]:
submit.to_csv('../submit/0907_3/submission.csv',index=False)

In [28]:
pd.read_csv('../submit/0907_1/submission.csv').head()

Unnamed: 0,id,probability
0,1,0.011877
1,2,0.102997
2,3,0.282427
3,4,0.023081
4,5,0.074156


In [17]:
# 用前一天预测效果最好

In [17]:
test_pred_prob

array([0.01327293, 0.15979464, 0.23355472, ..., 0.15256163, 0.08342812,
       0.07438161])

In [18]:
submit = pd.DataFrame()
submit['id'] = range(1,len(test)+1)
submit['probability'] = test_pred_prob


In [20]:
submit.to_csv('../submit/0903_2/submission.csv',index=False)

In [21]:
prob_oof

array([0.01163306, 0.06300333, 0.02498697, ..., 0.01603606, 0.01725756,
       0.02520344])

In [18]:
t1 = pd.DataFrame({'id':[1,2,3],'to':[2,3,4],'label':[9,6,7]})
t1

Unnamed: 0,id,to
0,1,2
1,2,3
2,3,4


In [19]:
t2 = pd.DataFrame({'ids':[1,2,3],'tos':[2,3,4]})
t2

Unnamed: 0,ids,tos
0,1,2
1,2,3
2,3,4


In [20]:
t2[['ids','tos']].map(t1)

AttributeError: 'DataFrame' object has no attribute 'map'