In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
# import pickle
import time
from tqdm import tqdm

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
basics_cols = [
    'item_price_level', 'item_sales_level', 'item_collected_level',
    'item_pv_level', 'user_gender_id', 'user_age_level', 'user_occupation_id',
    'user_star_level', 'context_page_id', 'shop_review_num_level',
    'shop_review_positive_rate', 'shop_star_level', 'shop_score_service',
    'shop_score_delivery', 'shop_score_description'
]

zero_feas = [
    # 基础特征
    basics_cols,
    #id转vector
    [
        'item_id_idx', 'user_id_idx', 'shop_id_idx', 'item_brand_id_idx',
        'item_city_id_idx'
    ],
    # 时间特征 !!
    [
        'day', 'hour', 'minute', 'yesterday', 'yesterhour', 'next_time_sub',
        'last_time_sub', 'user_item_next_time_sub', 'user_item_last_time_sub',
        'user_shop_next_time_sub', 'user_shop_last_time_sub',
        'user_count_first_time_sub', 'user_item_count_first_time_sub',
        'user_shop_count_first_time_sub', 'is_last_click',
        'is_last_user_item_click', 'max_click_time_sub',
        'max_user_item_click_time_sub'
    ],
    #shop、item得分 !!
    ['item_score', 'item_score2', 'item_score3', 'shop_score'],
    # 查询次数 !
    [
        'user_day_query', 'user_yesterday_query', 'user_hour_query',
        'user_yesterhour_query', 'user_minute_query', 'item_day_query',
        'item_hour_query', 'user_item_day_query', 'user_item_hour_query',
        'item_minute_query', 'user_item_minute_query'
    ],
    # user在一个时间段出现的次数序号 !
    ['user_count', 'user_day_count', 'user_hour_count', 'user_minute_count'],
    # item与pred类型特征
    [
        'item_category_split_count', 'item_property_split_count',
        'pred_category_split_count', 'pred_property_split_count',
        'item_pred_category_score', 'item_pred_property_score',
        'item_pred_category_score_item%', 'item_pred_property_score_item%',
        'item_pred_category_score_pred%', 'item_pred_property_score_pred%'
    ],
    # 统计特征
    [
        'user_shop_count', 'user_item_count', 'user_context_count',
        'user_shop_trade_count', 'user_item_trade_count',
        'user_context_trade_count', 'user_brand_count', 'user_city_count',
        'user_brand_trade_count', 'user_city_trade_count', 'shop_item_count',
        'item_occupation_count', 'item_age_count', 'item_gender_count',
        'item_occupation_trade_count', 'item_age_trade_count',
        'item_gender_trade_count', 'brand_item_count', 'city_brand_count',
        'city_item_count', 'user_item_num', 'user_shop_num',
        'user_item_day_num', 'user_shop_day_num', 'min_time'
    ],
    # 火热程度（搜索次数） --
    [
        'item_hot', 'user_hot', 'shop_hot', 'brand_hot', 'occupation_hot',
        'city_hot', 'item_trade_hot', 'user_trade_hot', 'shop_trade_hot',
        'brand_trade_hot', 'occupation_trade_hot', 'city_trade_hot'
    ],
    # user购买力、成交率 --
    [
        'user_sell_power', 'user_sell_trade_power', 'user_sell_power_mean',
        'user_sell_trade_power_mean', 'user_sell_power_mean_item_minus',
        'user_sell_trade_power_mean_item_minus', 'is_home'
    ],
    # 有没有在历史中出现过 --
    [
        'user_is_his', 'brand_is_his', 'user_is_trade_his',
        'item_is_trade_his', 'brand_is_trade_his', 'item_is_his'
    ],
    # 成交率 --
    [
        'user_trade_percent',
        'occupation_trade_percent',
        'brand_trade_percent',
        'item_trade_percent',
    ],
    # 历史数据中出现次数 --
    [
        'city_brand_count_his', 'city_item_count_his', 'shop_item_count_his',
        'brand_item_count_his'
    ],
    # star用户喜爱程度
    [],
    # 平均值编码
    [],
    # 贝叶斯平滑
    [],
    # 分级特征
    [],
    # 最大最小特征/局部最优特征
    []
]

cjf_feas = [
    # 基础特征
    [],
    # 时间特征
    [],
    #shop、item得分
    [],
    # 查询次数
    [],
    # user在一个时间段出现的次数序号
    [],
    # item与pred类型特征
    [
        'category_1',
        'category_2',
        'property_0',
        'property_1',
        'property_2',
        'property_3',
        'property_4',
        'property_5',
        'property_6',
        'property_7',
        'property_8',
        'query_col_0',
        'query_col_1',
        'query_col_2',
        'query_col_3',
        'query_col_4',
        'query_col_5',
        'query_col_6',
    ],
    # 统计特征
    [],
    # 火热程度（搜索次数）
    [],
    # user购买力、成交率
    [],
    # 有没有在历史中出现过
    [],
    # 成交率
    [],
    # 历史数据中出现次数
    [],
    # star用户喜爱程度
    [],
    # 平均值编码
    [],
    # 贝叶斯平滑
    [
        'item_pv_stair_shop_id_bayes_rate_2',
        'item_city_id_shop_id_bayes_rate_2',
        'item_brand_id_shop_review_num_level_bayes_rate_1',
        'item_sales_stair_user_age_stair_bayes_rate_1',
        'time_slice_shop_review_num_level_bayes_rate_1',
        'item_collected_level_bayes_rate_1'
    ],
    # 分级特征
    [
        'gender_filled',
        'user_age_stair',
        'user_occupation_stair',
        'user_star_stair',
        'context_page_stair',
        'hour_stair',
        'item_price_stair',
        'item_sales_stair',
    ],
    # 最大最小特征/局部最优特征
    [
        'user_query_cheapest',
        'user_query_maxsell',
        'user_query_maxcollect',
        'user_query_maxpv',
        'user_query_best_service',
        'user_query_best_delivery',
        'user_query_best_description',
        'user_query_maxshopstar',
        'user_query_maxreview',
        'user_query_maxqueryitem_c_similarity',
        'user_query_maxqueryitem_p_similarity',
    ]
]

cjy_feas = [
    # 基础特征
    [],
    # 时间特征
    [],
    #shop、item得分
    [],
    # 查询次数
    [],
    # user在一个时间段出现的次数序号
    [],
    # item与pred类型特征
    [
        'item_cat_vec_0', 'item_cat_vec_1', 'item_cat_vec_2', 'item_cat_vec_3',
        'item_cat_vec_4', 'item_cat_vec_5', 'item_cat_vec_6', 'item_cat_vec_7',
        'item_cat_vec_8', 'item_cat_vec_9', 'item_cat_vec_10',
        'item_cat_vec_11', 'item_cat_vec_12', 'item_cat_vec_13',
        'item_cat_vec_14', 'item_cat_vec_15', 'item_cat_vec_16',
        'item_cat_vec_17', 'item_cat_vec_18', 'item_cat_vec_19',
        'item_cat_vec_20', 'item_cat_vec_21', 'item_cat_vec_22',
        'item_cat_vec_23', 'pred_cat_mean_0', 'pred_cat_mean_1',
        'pred_cat_mean_2', 'pred_cat_mean_3', 'pred_cat_mean_4',
        'pred_cat_mean_5', 'pred_cat_mean_6', 'pred_cat_mean_7',
        'pred_cat_mean_8', 'pred_cat_mean_9', 'pred_cat_mean_10',
        'pred_cat_mean_11', 'pred_cat_mean_12', 'pred_cat_mean_13',
        'pred_cat_mean_14', 'pred_cat_mean_15', 'pred_cat_mean_16',
        'pred_cat_mean_17', 'pred_cat_mean_18', 'pred_cat_mean_19',
        'pred_cat_mean_20', 'pred_cat_mean_21', 'pred_cat_mean_22',
        'pred_cat_mean_23', 'item_pred_cat_cos', 'item_pred_ppt_cos'
    ],
    # 统计特征
    ['shop_item_cnt', 'shop_brand_cnt'],
    # 火热程度（搜索次数）
    ['24h_cat_hot', '24h_ppt_hot', '24h_user_seem_times'],
    # user购买力、成交率
    [],
    # 有没有在历史中出现过
    [],
    # 成交率
    [],
    # 历史数据中出现次数
    [],
    # star用户喜爱程度
    [
        'star_category_max', 'star_category_min', 'star_category_mean',
        'star_category_var', 'star_property_max', 'star_property_min',
        'star_property_mean', 'star_property_var'
    ],
    # 平均值编码
    [
        'item_brand_id+item_price_level_pred_1', 'shop_id_pred_1',
        'item_id_pred_1', 'item_sales_level+item_collected_level_pred_1',
        'shop_star_level+shop_item_cnt_pred_1',
        'shop_review_num_level+shop_item_cnt_pred_1',
        'item_price_level+item_pv_level_pred_1', 'item_price_level_pred_1',
        'item_sales_level+item_pv_level_pred_1',
        'shop_item_cnt+shop_brand_cnt_pred_1',
        'user_age_level+user_star_level_pred_1', '24h_user_seem_times_pred_1',
        'item_brand_id+shop_brand_cnt_pred_1',
        'item_collected_level+item_id_pred_1',
        'item_price_level+item_pred_cat_cos_pred_1',
        'item_brand_id+hist_hour_ctr_pred_1',
        'item_brand_id+shop_score_description_pred_1',
        'item_city_id+context_page_id_pred_1',
        'item_pv_level+shop_review_positive_rate_pred_1',
        'item_id+hist_hour_ctr_pred_1',
        'item_price_level+star_property_min_pred_1',
        'user_star_level+shop_id_pred_1', 'shop_item_cnt+24h_ppt_hot_pred_1',
        'item_brand_id+star_property_max_pred_1',
        'user_gender_id+star_category_min_pred_1',
        'shop_item_cnt+star_property_max_pred_1',
        'item_sales_level+context_page_id_pred_1',
        'item_collected_level+shop_score_description_pred_1',
        'shop_review_num_level+hist_hour_ctr_pred_1',
        'item_collected_level+user_gender_id_pred_1',
        'shop_item_cnt+star_property_var_pred_1',
        '24h_ppt_hot+star_property_max_pred_1',
        'user_star_level+star_property_max_pred_1',
        'user_age_level+user_id_pred_1'
    ],
    # 贝叶斯平滑
    [],
    # 分级特征
    [],
    # 最大最小特征/局部最优特征
    []
]

target = ['is_trade']

In [3]:
zero_fea_list = []
for crt_list in zero_feas:
    for i in crt_list:
        zero_fea_list.append(i)
zero_fea_list = list(set(zero_fea_list))
print('zero_feas len =', len(zero_fea_list))

cjf_fea_list = []
for crt_list in cjf_feas:
    for i in crt_list:
        cjf_fea_list.append(i)
cjf_fea_list = list(set(cjf_fea_list))
print('cjf_feas len =', len(cjf_fea_list))

cjy_fea_list = []
for crt_list in cjy_feas:
    for i in crt_list:
        cjy_fea_list.append(i)
cjy_fea_list = list(set(cjy_fea_list))
print('cjy_feas len =', len(cjy_fea_list))

zero_feas len = 125
cjf_feas len = 43
cjy_feas len = 97


In [4]:
train = pd.read_csv('train_preprocessing34_full.csv')

train.shape
train.head()

(278894, 266)

Unnamed: 0,user_context_trade_count,user_city_trade_count,user_day_count,user_shop_count,city_item_count_his,shop_item_count_his,user_shop_next_time_sub,user_age_level,city_item_count,shop_star_level,user_trade_hot,user_item_num,shop_trade_hot,user_count,shop_score_delivery,user_item_last_time_sub,city_hot,user_sell_trade_power_mean,is_last_click,user_is_his,is_last_user_item_click,item_hot,user_item_count_first_time_sub,user_sell_power,user_shop_day_num,item_trade_hot,item_sales_level,item_occupation_trade_count,item_gender_count,minute,hour,item_score2,user_minute_query,user_id_idx,item_pred_property_score,brand_item_count_his,user_minute_count,pred_property_split_count,item_occupation_count,user_shop_trade_count,item_brand_id_idx,next_time_sub,user_day_query,user_sell_power_mean_item_minus,yesterday,user_item_trade_count,user_yesterhour_query,brand_is_his,item_pred_category_score,user_brand_count,user_item_count,city_brand_count_his,item_gender_trade_count,user_shop_count_first_time_sub,brand_hot,item_trade_percent,user_is_trade_his,item_collected_level,item_id_idx,user_sell_power_mean,brand_is_trade_his,city_trade_hot,user_gender_id,city_brand_count,item_minute_query,item_is_his,item_pred_property_score_item%,occupation_hot,context_page_id,shop_id_idx,user_context_count,last_time_sub,item_property_split_count,item_hour_query,user_shop_last_time_sub,user_trade_percent,min_time,shop_item_count,user_count_first_time_sub,user_item_day_query,user_sell_trade_power,shop_hot,user_occupation_id,user_brand_trade_count,brand_trade_hot,user_shop_num,user_item_minute_query,item_city_id_idx,day,item_pv_level,user_hot,user_item_next_time_sub,item_category_split_count,shop_score,brand_item_count,user_hour_query,pred_category_split_count,item_score,user_hour_count,brand_trade_percent,item_price_level,item_age_count,user_yesterday_query,shop_score_service,occupation_trade_hot,user_city_count,item_score3,user_item_day_num,item_pred_property_score_pred%,item_is_trade_his,item_pred_category_score_item%,occupation_trade_percent,shop_review_num_level,shop_review_positive_rate,user_sell_trade_power_mean_item_minus,user_item_hour_query,shop_score_description,yesterhour,max_click_time_sub,is_home,item_pred_category_score_pred%,item_age_trade_count,user_star_level,item_day_query,max_user_item_click_time_sub,user_query_maxsell,category_2,property_7,property_1,query_col_6,item_collected_level_bayes_rate_1,item_price_stair,user_query_maxpv,user_query_cheapest,user_occupation_stair,property_5,user_query_best_service,user_age_stair,query_col_5,query_col_1,query_col_0,user_query_maxqueryitem_c_similarity,user_star_stair,item_pv_stair_shop_id_bayes_rate_2,item_sales_stair,property_8,user_query_maxcollect,property_2,user_query_best_description,user_query_maxreview,property_0,property_3,query_col_2,category_1,hour_stair,user_query_best_delivery,query_col_3,user_query_maxshopstar,time_slice_shop_review_num_level_bayes_rate_1,property_6,item_sales_stair_user_age_stair_bayes_rate_1,item_city_id_shop_id_bayes_rate_2,context_page_stair,user_query_maxqueryitem_p_similarity,item_brand_id_shop_review_num_level_bayes_rate_1,gender_filled,query_col_4,property_4,pred_cat_mean_5,pred_cat_mean_14,pred_cat_mean_17,pred_cat_mean_22,shop_item_cnt,item_cat_vec_22,item_cat_vec_2,item_brand_id+hist_hour_ctr_pred_1,item_brand_id+shop_score_description_pred_1,user_gender_id+star_category_min_pred_1,pred_cat_mean_15,item_id_pred_1,item_cat_vec_21,24h_ppt_hot+star_property_max_pred_1,star_category_mean,item_cat_vec_19,item_price_level+item_pred_cat_cos_pred_1,star_property_min,24h_cat_hot,item_cat_vec_5,item_cat_vec_3,item_cat_vec_16,pred_cat_mean_0,star_property_var,shop_review_num_level+shop_item_cnt_pred_1,pred_cat_mean_6,item_pv_level+shop_review_positive_rate_pred_1,item_cat_vec_9,item_sales_level+item_pv_level_pred_1,pred_cat_mean_2,pred_cat_mean_8,item_pred_cat_cos,item_price_level+star_property_min_pred_1,item_pred_ppt_cos,item_cat_vec_10,pred_cat_mean_19,item_cat_vec_13,shop_brand_cnt,shop_star_level+shop_item_cnt_pred_1,item_city_id+context_page_id_pred_1,item_brand_id+item_price_level_pred_1,user_age_level+user_id_pred_1,pred_cat_mean_16,pred_cat_mean_21,star_category_var,item_collected_level+shop_score_description_pred_1,item_brand_id+shop_brand_cnt_pred_1,item_cat_vec_0,shop_item_cnt+star_property_max_pred_1,star_property_mean,star_category_max,item_cat_vec_23,pred_cat_mean_12,item_cat_vec_14,item_cat_vec_15,pred_cat_mean_23,item_collected_level+user_gender_id_pred_1,shop_item_cnt+24h_ppt_hot_pred_1,item_cat_vec_20,pred_cat_mean_1,pred_cat_mean_13,item_cat_vec_6,item_cat_vec_1,item_id+hist_hour_ctr_pred_1,item_cat_vec_11,item_cat_vec_7,item_sales_level+context_page_id_pred_1,shop_id_pred_1,item_cat_vec_12,item_price_level_pred_1,pred_cat_mean_3,user_age_level+user_star_level_pred_1,24h_user_seem_times_pred_1,item_brand_id+star_property_max_pred_1,pred_cat_mean_11,item_sales_level+item_collected_level_pred_1,pred_cat_mean_18,pred_cat_mean_7,user_star_level+star_property_max_pred_1,pred_cat_mean_10,pred_cat_mean_9,shop_item_cnt+shop_brand_cnt_pred_1,star_property_max,pred_cat_mean_4,24h_user_seem_times,24h_ppt_hot,item_price_level+item_pv_level_pred_1,star_category_min,shop_review_num_level+hist_hour_ctr_pred_1,pred_cat_mean_20,item_cat_vec_4,shop_item_cnt+star_property_var_pred_1,user_star_level+shop_id_pred_1,item_cat_vec_18,item_cat_vec_17,item_cat_vec_8,item_collected_level+item_id_pred_1,is_trade
0,0.0,0.0,0,0.0,968.0,1.0,999999,1000,1014,3,0.0,0,0.0,0,1.0,999999,25252.0,0.0,0,1,1,12.0,0,0.0,0,0.0,4,0.0,6.0,55,1,12.0,1,123470,1,13.0,0,2,11.0,0.0,390,9487,9,3.0,20,0.0,0.0,1,2,0.0,0.0,204.0,0.0,0,50.0,0.0,1,4,2654,0.0,1,429.0,0,224,1,0,0.454545,137467,1,1907,0.0,999999,21,1,999999,0.0,3.0,1,0,1,0.0,12.0,2005,0.0,0.0,0,1,28,21,14,0.0,999999,2,7.0,7,1,5,60,0,0.0,3,0.0,0.0,1.0,2611,0.0,4.0,0,3.333333,1,6.666667,1.899365,4,1.0,3.0,1,1.0,0,9894,0,3.333333,0.0,3003,1,0,1,0,74846,19279,0.0,0.01851,2,1,1,1,2723,1,1,0.0,0.0,1.0,1,1,0.001379,1,89740,1,71440,1,1,46913,27496,1.0,595,0,1,0.0,1,0.018014,110,0.012998,0.001375,2,1,0.001354,1,0.0,26401,-0.09376,-0.037004,0.003739,0.09985,1,0.149699,0.09828,0.007551,8.2e-05,0.021217,0.001786,8.2e-05,-0.065712,2.527635e-12,0.333536,0.173776,0.053698,0.038378,45359,-0.140542,-0.060477,-0.007867,-0.05483,0.0,0.020143,0.04984,0.013442,-0.035831,0.008983,0.065572,-0.076477,1.0,6.814379e-15,-0.31366,-0.040322,0.115942,0.292921,1,0.023433,0.017108,4.235789e-09,0.01964,-0.005248,-0.043818,0.111024,0.018554,3.67224e-11,-0.082182,0.019684,0.038378,0.97868,-0.030753,0.01881,-0.055463,0.002678,-0.020517,0.013617,0.040973,-0.02252,-0.124263,0.195416,0.07471,-0.186273,0.011333,-0.036183,0.008552,0.02365036,8.2e-05,0.028195,0.045222,-0.040345,0.01656,0.019482,0.01964,-0.02414,0.008911,-0.041971,0.00571,0.017348,-0.026915,-0.023887,0.02717,0.038378,-0.021384,0,42892,0.036904,0.090444,1.583973e-20,-0.015049,-0.032052,0.024836,0.015238,-0.062935,0.005618,-0.114663,8.2e-05,0
1,0.0,0.0,0,0.0,79.0,2.0,999999,1003,84,13,0.0,0,2.0,0,0.976604,999999,739.0,0.0,0,1,1,29.0,0,0.0,0,1.0,9,1.0,22.0,35,21,12.1,5,65467,1,4.0,0,1,19.0,0.0,893,85949,9,8.0,20,0.0,0.0,1,2,0.0,0.0,21.0,1.0,0,70.0,3.448276,1,8,5371,0.0,0,11.0,0,24,1,0,0.196078,137467,14,668,0.0,999999,50,3,999999,0.0,43.0,2,0,1,0.0,55.0,2005,0.0,1.0,0,1,56,21,14,0.0,999999,2,25.636987,3,8,4,121,0,1.428571,8,13.0,0.0,0.974774,2611,0.0,0.75625,0,5.0,0,6.666667,1.899365,15,0.985647,8.0,1,0.969257,20,1189,0,4.0,1.0,3003,15,0,1,0,13860,19279,0.0,0.021815,2,0,1,1,61618,0,1,0.0,0.0,1.0,1,1,0.022345,3,35693,1,69795,1,1,46913,2723,1.0,904,0,0,0.0,1,0.018014,42930,0.02477,0.022341,2,1,0.022367,1,0.0,89740,-0.111533,-0.043868,0.006399,0.121642,2,0.242893,0.153354,0.002883,0.045297,0.016404,0.00383,0.032821,-0.111946,2.98506e-06,0.605382,0.27151,0.012745,0.116025,51114,-0.222706,-0.096608,-0.013355,-0.064404,0.0,0.023231,0.058802,0.022679,-0.054892,0.01946,0.076803,-0.091214,1.0,0.01380893,0.007221,-0.065816,0.135977,0.46835,2,0.018124,0.002106,0.01801216,0.01964,-0.006685,-0.056057,0.057779,0.019715,0.0328211,-0.128599,0.011933,0.116025,0.97868,-0.049673,0.023367,-0.087591,0.007678,-0.02487,0.013931,0.015682,-0.02836,-0.147806,0.234541,0.117402,-0.295125,0.007551,-0.05845,0.015756,1.250961e-56,0.035904,0.046685,0.013609,-0.04838,0.019694,0.019482,0.007551,-0.029265,0.022416,-0.053979,0.007874,0.020061,-0.032936,-0.02752,0.021817,0.116025,-0.026687,0,61613,0.008661,0.318582,0.01928269,-0.014199,-0.053317,0.019101,0.000905,-0.107819,0.012795,-0.182084,0.032821,0
2,0.0,0.0,0,0.0,79.0,2.0,999999,1003,84,13,0.0,0,2.0,0,0.976604,999999,739.0,0.0,0,1,1,26.0,0,0.0,0,1.0,9,0.0,20.0,15,22,16.9,2,115727,0,3.0,0,3,9.0,0.0,14,86382,2,8.0,20,0.0,0.0,1,2,0.0,0.0,21.0,1.0,0,60.0,3.846154,1,10,818,0.0,0,11.0,0,24,2,0,0.0,137467,1,668,0.0,999999,46,2,999999,0.0,44.0,2,0,1,0.0,55.0,2005,0.0,1.0,0,1,56,21,16,0.0,999999,2,25.636987,3,2,3,169,0,1.666667,8,10.0,0.0,0.974774,2611,0.0,5.633333,0,0.0,0,6.666667,1.899365,15,0.985647,8.0,1,0.969257,21,86382,0,5.0,1.0,3003,14,0,1,0,31773,72863,0.0,0.018561,2,0,0,2,58070,0,2,0.0,0.0,1.0,0,1,0.002539,2,46722,0,36855,1,0,19279,74846,0.0,595,0,0,1.0,0,0.031936,13847,0.022491,0.002534,2,1,0.002515,1,0.0,89740,-0.149771,-0.058822,0.006545,0.160013,2,0.160013,0.104555,0.002262,6.2e-05,0.013919,0.003173,0.037368,-0.070563,1.136894e-09,0.507929,0.185101,0.012745,0.031857,51524,-0.149771,-0.064571,-0.00845,-0.087509,3.3e-05,0.023231,0.079579,0.05327,-0.037967,0.009873,0.104555,-0.122116,1.0,0.0120944,0.132421,-0.043145,0.185101,0.312531,2,0.018124,0.042244,0.01646485,0.01964,-0.00845,-0.070563,0.1465,0.00555,0.03736803,-0.087509,0.007446,0.041881,0.97868,-0.033052,0.030514,-0.058822,0.003173,-0.033052,0.020374,0.015427,-0.023651,-0.198701,0.312531,0.079579,-0.198701,0.003138,-0.038781,0.009369,0.01578408,0.035904,0.030514,0.013609,-0.064571,0.019694,0.019482,0.015239,-0.038781,0.011793,-0.067826,0.009369,0.004444,-0.043145,-0.037967,0.021817,0.045222,-0.034495,0,43506,0.012603,0.041154,0.01671079,-0.023651,-0.034495,0.009994,0.000905,-0.067826,0.006545,-0.122116,0.037368,0
3,0.0,0.0,0,0.0,79.0,2.0,29099,1003,84,13,0.0,0,2.0,0,0.976604,999999,739.0,0.0,0,1,0,29.0,0,0.0,0,1.0,9,1.0,22.0,55,13,12.1,3,53815,1,4.0,0,2,19.0,0.0,893,29099,9,8.0,20,0.0,0.0,1,2,0.0,0.0,21.0,1.0,0,70.0,3.448276,1,8,5371,0.0,0,11.0,0,24,2,0,0.196078,137467,0,668,0.0,999999,50,2,999999,0.0,27.0,2,0,2,0.0,55.0,2005,0.0,1.0,0,1,56,21,14,0.0,29099,2,25.636987,3,3,4,121,0,1.428571,8,13.0,0.0,0.974774,2611,0.0,6.05,0,3.333333,0,6.666667,1.899365,15,0.985647,8.0,1,0.969257,12,30153,0,4.0,1.0,3005,15,29099,0,0,41515,19279,0.0,0.019231,2,0,0,2,42930,0,2,0.0,1.0,0.0,0,1,0.003143,1,69550,0,110,0,0,46913,89740,0.0,517,0,0,1.0,0,0.018014,13860,0.016288,0.003136,2,0,0.003115,1,0.0,61618,-0.111533,-0.043868,0.006399,0.121642,2,0.242893,0.153354,0.01964,0.045297,0.015404,0.00383,0.032821,-0.111946,0.02117116,0.61181,0.27151,0.012745,0.114927,51073,-0.222706,-0.096608,-0.013355,-0.064404,0.066432,0.023231,0.058802,0.022679,-0.054892,0.01946,0.076803,-0.091214,1.0,0.01380893,0.047626,-0.065816,0.135977,0.46835,2,0.018124,0.015424,0.01801216,0.01964,-0.006685,-0.056057,0.052671,0.019715,0.0328211,-0.128599,0.016012,0.413059,0.981966,-0.049673,0.023367,-0.087591,0.007678,-0.02487,0.013931,0.006647,-0.02836,-0.147806,0.234541,0.117402,-0.295125,0.01964,-0.05845,0.015756,0.01379249,0.035904,0.046685,0.013609,-0.04838,0.018441,0.019482,0.004834,-0.029265,0.022416,-0.053979,0.007874,0.016353,-0.032936,-0.02752,0.021817,0.743746,-0.026687,0,62433,0.008661,0.368182,0.01991791,-0.014199,-0.053317,0.01171,0.004568,-0.107819,0.012795,-0.182084,0.032821,0
4,0.0,0.0,0,0.0,79.0,2.0,999999,1007,84,13,0.0,0,2.0,0,0.976604,999999,739.0,0.0,1,1,1,26.0,0,0.0,0,1.0,9,0.0,20.0,55,9,16.9,1,54570,1,3.0,0,1,9.0,0.0,14,999999,1,8.0,20,0.0,0.0,1,2,0.0,0.0,21.0,1.0,0,60.0,3.846154,1,10,818,0.0,0,11.0,0,24,1,0,0.212766,137467,0,668,0.0,999999,46,1,999999,0.0,19.0,2,0,1,0.0,55.0,2005,0.0,1.0,0,1,56,21,16,0.0,999999,2,25.636987,3,1,5,169,0,1.666667,8,0.0,0.0,0.974774,2611,0.0,8.45,0,5.0,0,6.666667,1.899365,15,0.985647,8.0,1,0.969257,8,0,0,3.333333,0.0,3002,14,0,1,0,2723,6344,0.0,0.019231,2,1,0,2,36855,0,2,0.0,1.0,1.0,1,1,0.015847,2,89740,1,42163,0,0,46913,27496,0.0,589,0,0,0.0,1,0.014121,86967,0.022491,0.015839,2,0,0.006183,1,0.0,69795,-0.099977,-0.039269,0.004359,0.106813,2,0.160013,0.104555,0.004568,6.2e-05,0.014673,0.002105,0.037368,-0.070563,0.05363618,0.364336,0.185101,0.012745,0.000291,50644,-0.149771,-0.064571,-0.00845,-0.058416,0.0,0.023231,0.053127,0.05327,-0.037967,0.009873,0.069796,-0.081539,1.0,0.00808425,0.623223,-0.043145,0.123562,0.312531,2,0.018124,0.015424,0.01646485,0.01964,-0.005637,-0.047097,0.130858,0.00555,0.03736803,-0.087509,0.006724,0.000291,0.978508,-0.033052,0.020357,-0.058822,0.003173,-0.022058,0.020374,0.030968,-0.023651,-0.132642,0.208612,0.079579,-0.198701,0.015239,-0.038781,0.009369,0.01379249,0.035904,0.030514,0.013609,-0.043101,0.027674,0.019482,2.3e-05,-0.025882,0.011793,-0.045258,0.006239,0.007962,-0.028777,-0.025372,0.021817,0.000291,-0.023012,0,41816,0.012603,0.01589,0.02032851,-0.015785,-0.034495,0.019101,0.21979,-0.067826,0.006545,-0.122116,0.037368,0


In [6]:
test = train[train.day == 25]
val = train[train.day == 24]
train = train[(train.day != 24) & (train.day != 25)]

train.shape
val.shape
test.shape

(203112, 266)

(57411, 266)

(18371, 266)

In [7]:
import sys
sys.path.append('/Users/zero/xgboost/python-package')
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss



In [8]:
rubbish_features_zero = [
    'user_city_trade_count', 'user_id_idx', 'brand_item_count',
    'brand_trade_percent', 'item_score', 'shop_item_count_his',
    'item_occupation_count', 'item_age_count', 'shop_id_idx',
    'item_brand_id_idx', 'item_hour_query', 'user_trade_percent',
    'user_city_count', 'min_time', 'user_sell_trade_power_mean', 'item_score3',
    'user_item_day_query', 'user_yesterhour_query', 'brand_trade_hot',
    'item_trade_hot', 'item_gender_trade_count',
    'item_pred_category_score_pred%', 'item_age_trade_count',
    'item_gender_count', 'item_day_query', 'item_id_idx'
]
rubbish_features_cjf = [
    'category_2', 'item_sales_stair_user_age_stair_bayes_rate_1', 'property_5',
    'property_4', 'property_0', 'category_1',
    'item_collected_level_bayes_rate_1', 'query_col_5', 'property_2',
    'item_pv_stair_shop_id_bayes_rate_2',
    'time_slice_shop_review_num_level_bayes_rate_1', 'property_3',
    'query_col_2', 'property_8', 'query_col_0', 'query_col_1', 'query_col_3',
    'property_6', 'item_brand_id_shop_review_num_level_bayes_rate_1',
    'query_col_6', 'query_col_4', 'property_1', 'property_7',
    'item_city_id_shop_id_bayes_rate_2'
]

rubbish_features_cv = [
    'item_pred_cat_cos', 'item_occupation_count', 'property_1',
    'item_pred_ppt_cos', 'property_7',
    'item_price_level+star_property_min_pred_1',
    'user_star_level+star_property_max_pred_1',
    'item_pv_stair_shop_id_bayes_rate_2', 'item_brand_id_idx',
    'star_category_mean', '24h_ppt_hot+star_property_max_pred_1',
    'item_sales_stair_user_age_stair_bayes_rate_1',
    'item_price_level+item_pv_level_pred_1', 'property_5', 'category_1',
    'shop_item_cnt+star_property_max_pred_1',
    'time_slice_shop_review_num_level_bayes_rate_1', 'user_id_idx',
    'shop_item_cnt+24h_ppt_hot_pred_1',
    'item_sales_level+context_page_id_pred_1',
    'item_city_id_shop_id_bayes_rate_2', 'property_4', 'property_8',
    'item_brand_id_shop_review_num_level_bayes_rate_1', 'item_age_count',
    'item_collected_level_bayes_rate_1'
]

rubbish_features = [
    'user_id_idx', 'item_id_idx', 'item_hour_query',
    'item_pred_category_score_pred%', 'item_brand_id_idx', 'brand_trade_hot',
    'brand_trade_hot', 'shop_id_idx', 'min_time', 'item_age_count',
    'brand_trade_percent'
]

In [9]:
predictors = list(
    set(zero_fea_list + cjy_fea_list + cjf_fea_list) - set(basics_cols))
predictors = list(set(basics_cols + predictors))
predictors = list(
    set(predictors) -
    set(rubbish_features_zero + rubbish_features_cjf + rubbish_features_cv))
len(predictors)

205

In [13]:
#train/val
params = {
    'objective': 'binary:logistic',
    'eta': 0.06,
    'colsample_bytree': 0.886,
    'min_child_weight': 1.6,
    'max_depth': 4,
    'subsample': 0.886,
    'gamma': 0.1,
    'lambda': 10,
    'verbose_eval': True,
    'eval_metric': 'auc',
    'seed': 201803,
    'missing': -1
}

# feature_performance = {}

for i in tqdm(predictors):
    temp_features = predictors.copy()
#     temp_features.remove(i)
#     len(temp_features)
    xgbtrain = xgb.DMatrix(train[temp_features], train[target])
    xgbval = xgb.DMatrix(val[temp_features])
    model = xgb.train(params, xgbtrain, num_boost_round=450)
    y_pred = model.predict(xgbval)
    y_pred = pd.Series(y_pred, name='val')

    print(str(i) + ':' + str(log_loss(val[target], y_pred)))
#     feature_performance[i] = log_loss(val[target], y_pred)

  0%|          | 1/205 [02:51<9:43:27, 171.60s/it]

item_pred_category_score:0.0792309261243


  1%|          | 2/205 [05:41<9:38:20, 170.94s/it]

shop_trade_hot:0.0792309261243


  1%|▏         | 3/205 [08:31<9:34:32, 170.66s/it]

user_star_stair:0.0792309261243


  2%|▏         | 4/205 [11:15<9:25:40, 168.86s/it]

item_cat_vec_11:0.0792309261243


  2%|▏         | 5/205 [14:16<9:30:42, 171.21s/it]

user_shop_day_num:0.0792309261243


  3%|▎         | 6/205 [16:14<8:58:56, 162.50s/it]

star_category_min:0.0792309261243


  3%|▎         | 7/205 [17:46<8:22:56, 152.41s/it]

item_cat_vec_22:0.0792309261243


  4%|▍         | 8/205 [19:19<7:56:04, 145.00s/it]

pred_cat_mean_6:0.0792309261243


KeyboardInterrupt: 

In [16]:
baseline = 0.0792309261243
rubbish_features = []

for i in enumerate(feature_performance):
    if (feature_performance[i[1]] > baseline):
        rubbish_features.append(i[1])
        
len(rubbish_features)

162

In [137]:
#cv
params = {
    'objective': 'binary:logistic',
    'eta': 0.06,
    'colsample_bytree': 0.886,
    'min_child_weight': 1.6,
    'max_depth': 4,
    'subsample': 0.886,
    'gamma': 0.1,
    'lambda': 10,
    'verbose_eval': True,
    'eval_metric': 'logloss',
    'seed': 201803,
    'missing': -1
}
# temp_predictors = list(
#     set(zero_fea_list) - set(basics_cols))
# temp_predictors = list(set(basics_cols + temp_predictors))
# temp_predictors = list(
#     set(temp_predictors) -
#     set(rubbish_features_zero + rubbish_features_cv))
# len(temp_predictors)

cv_train = pd.concat([train, val])
xgbtrain = xgb.DMatrix(cv_train[good_features + basics_cols], cv_train[target])
len(good_features)

cv = xgb.cv(
    params,
    dtrain=xgbtrain,
    num_boost_round=500,
    nfold=5,
    early_stopping_rounds=50,
    seed=201803)
print(str(cv.shape[0]) + ':' + str(cv.iloc[-1, :]['test-logloss-mean']))

# 0.0836343333333(zero+cjf+cjy)
# 0.083560(zero+cjf+cjy - rubbish)
# 0.083543(zero - rubbish_zero)
# 0.083598(zero - rubbish_zero - rubbish_cv)
# 0.0835596666667(zero+cjy - rubbish_zero)
# 0.0838413333333(zero+cjf - rubbish_zero)
# 0.0836076666667(zero+cjf+cjy - rubbish_zero)
# 0.0836136666667(zero+cjf+cjy - rubbish_cv)
# 0.083562(zero+cjf+cjy - rubbish_zero - rubbish_cjf)
# 0.083492(zero+cjf+cjy - rubbish_zero - rubbish_cjf - rubbish_cv)

# 0.0836006666667 45+15
# 0.0843633333333 45
# 0.083702 30+15
# 0.0847226666667 30
# 0.083427 100+15
# 0.0839198 100
# 0.0834056 75+15
# 0.0839876 75

66

398:0.0835366


In [133]:
#train/test
params = {
    'objective': 'binary:logistic',
    'eta': 0.04,
    'colsample_bytree': 0.886,
    'min_child_weight': 1.6,
    'max_depth': 4,
    'subsample': 0.886,
    'gamma': 0.1,
    'lambda': 10,
    'verbose_eval': True,
    'eval_metric': 'auc',
    'seed': 201803,
    'missing': -1
}

predictors = list(set(basics_cols + predictors))
predictors = list(set(predictors) - set(rubbish_features))
len(predictors)

train = pd.concat([train, val])
xgbtrain = xgb.DMatrix(train[predictors], train[target])
model = xgb.train(params, xgbtrain, num_boost_round=600)

xgbtest = xgb.DMatrix(test[predictors])
y_pred = model.predict(xgbtest)

idx = pd.read_csv('resultWithHistory.txt', sep=' ')
idx.head()
idx['predicted_score'] = y_pred
idx.head()

idx[['instance_id', 'predicted_score']].to_csv(
    'resultWithHistory.txt',
    sep=' ',
    header=['instance_id', 'predicted_score'],
    index=False)