In [1]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) in inches.
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# NOTE(review): this silences every warning for the whole session.
import warnings
warnings.filterwarnings("ignore")

# Chinese font support.
# The former matplotlib.use('qt4agg') call was removed: the notebook already
# selects the inline backend via %matplotlib inline and pyplot is imported
# before this point, so switching to a Qt4 backend here either has no effect
# or fails on installations that do not ship that backend.
import matplotlib
# Default font (SimHei provides CJK glyphs).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being rendered as an empty box with this font.
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
# Read a single row first: only the header is needed to find the '*_count' columns.
df = pd.read_csv('./feature/df_feature4_ctr_extra.csv', encoding='utf-8', nrows=1)

# Continuous features: a hand-picked list plus every column whose name contains '_count'.
cont_fea = [
    'dict_len',
    'title_rate',
    'max_title_rate',
    'title_rate_max_sub',
    'title_rank_in_query',
    'title_rank_in_query_origin',
    'rank_first_origin_rate',
    'rank_first_origin_rate_sub',
]
cont_fea.extend(col for col in df.columns if '_count' in col)

# Low-cardinality categorical flags.
cate_fea = [
    'is_max_rate',
    'is_first_rate',
    'title_equal_prefix',
    'prefix_in_title',
    'title_startswith_prefix',
    'title_endswith_prefix',
    'rate_big5',
    'title_in_query',
]
# High-cardinality id columns.
long_cate_fea = ['user_id', 'item_id']

# Second pass: load the full file, restricted to the columns selected above.
df = pd.read_csv(
    './feature/df_feature4_ctr_extra.csv',
    encoding='utf-8',
    usecols=long_cate_fea + ['tag', 'label', 'is_val'] + cont_fea + cate_fea,
)
df.shape

(2100000, 51)

In [3]:
# Load the six pre-computed feature tables, one CSV each.
feature1, feature2, feature3, feature4, feature5, feature6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './feature/feature_rank_query.csv',
        './feature/feature_rank_query_re.csv',
        './feature/feature_dict_dis.csv',
        './feature/feature_vector_dis.csv',
        './feature/feature_re_pretit_rate.csv',
        './feature/feature_titpre_dis.csv',
    )
)

# Register every column of every table as a continuous feature.
# NOTE(review): this appends to cont_fea in place, so re-running the cell adds the names again.
cont_fea.extend(
    col
    for frame in (feature1, feature2, feature3, feature4, feature5, feature6)
    for col in frame.columns
)

# One displayed shape per table.
feature1.shape
feature2.shape
feature3.shape
feature4.shape
feature5.shape
feature6.shape

(2100000, 6)

(2100000, 4)

(2100000, 2)

(2100000, 8)

(2100000, 14)

(2100000, 6)

In [4]:
# Attach the six feature tables to the base frame side by side (joined on the index).
# NOTE(review): df is rebound to the widened frame, so re-running the cell adds the columns again.
df = pd.concat(
    [df, feature1, feature2, feature3, feature4, feature5, feature6],
    axis='columns',
)
df.shape

(2100000, 91)

## 稀疏矩阵表示

In [6]:
from scipy.sparse import csr_matrix, csc_matrix, hstack

In [7]:
# Distinct ids in order of first appearance. unique() is computed once per column
# and reused (previously it was recomputed for the keys, the range and the length).
unique_user_ids = df.user_id.unique()
unique_item_ids = df.item_id.unique()

# id -> dense column index within that id's one-hot block.
user_id_dict = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_id_dict = dict(zip(unique_item_ids, range(len(unique_item_ids))))

# Width of each one-hot block.
user_id_len = len(unique_user_ids)
item_id_len = len(unique_item_ids)

# Names of the original (non-derived) features; extended by later cells.
ori_fea = ['user_id', 'item_id']

In [8]:
# One-hot encode `tag`; the dummy columns take the place of the original column in df.
# NOTE(review): once 'tag' has been dropped this cell cannot be re-run without reloading df.
tag = pd.get_dummies(df['tag'], prefix='tag')
ori_fea.extend(tag.columns)
df = pd.concat([df, tag], axis=1).drop(columns=['tag'])

df.shape

(2100000, 112)

In [9]:
# Build the one-hot sparse matrix for the high-cardinality id columns.
#
# Every row stores exactly one 1 per column listed in long_cate_fea. The column
# index of that 1 is the id's position in the matching `<name>_dict`, shifted by
# the total width (`<name>_len`) of the blocks that precede it.
#
# The per-row Python loop (iterrows + dict lookups) is replaced by a vectorised
# Series.map per id column; the resulting CSR arrays hold the same values in the
# same order.
n_rows = df.shape[0]
n_id_cols = len(long_cate_fea)

col_blocks = []
offset = 0
for idx in long_cate_fea:
    # The lookup tables are the module-level `<name>_dict` / `<name>_len` variables.
    id_map = globals()[idx + '_dict']
    codes = df[idx].map(id_map)
    if codes.isnull().any():
        # Same failure mode as a plain dict lookup on an unknown id.
        raise KeyError('values of %r missing from %s_dict' % (idx, idx))
    col_blocks.append(codes.values.astype(np.int64) + offset)
    offset += globals()[idx + '_len']

# Interleave the blocks row by row: [row0_col0, row0_col1, row1_col0, row1_col1, ...].
ord_col = np.column_stack(col_blocks).ravel()
# CSR row pointer: each row owns n_id_cols consecutive entries.
ord_row = np.arange(n_rows + 1) * n_id_cols
ord_data = np.ones(n_rows * n_id_cols, dtype=int)

# Explicit shape, so the width does not depend on which ids happen to occur.
spr_df = csr_matrix((ord_data, ord_col, ord_row), shape=(n_rows, offset))
spr_df = spr_df.tocsc()

A Jupyter Widget




In [10]:
# Append every one-hot tag column with a single hstack call. Stacking inside a
# loop rebuilt (and re-copied) the whole matrix once per column.
tag_blocks = [csc_matrix(df[i]).transpose() for i in list(tag.columns)]
spr_df = hstack([spr_df] + tag_blocks)

A Jupyter Widget




## lgb

In [7]:
import lightgbm as lgb
from sklearn.metrics import f1_score

In [8]:
# Convert the feature matrix to CSC format; the bare expression below displays its summary.
spr_df = spr_df.tocsc()
spr_df

<2100000x435617 sparse matrix of type '<class 'numpy.float64'>'
	with 24560805 stored elements in Compressed Sparse Column format>

In [9]:
def lgb_f1(y_pred, data):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Rounds the predictions with ``np.round`` and scores them against the
    labels stored in ``data`` using ``f1_score``.

    Returns:
        The triple ``('f1', score, True)``.
    """
    labels = data.get_label()
    hard_pred = np.round(y_pred)
    score = f1_score(labels, hard_pred)
    return 'f1', score, True

In [25]:
# Features are sliced by row position while labels are selected through is_val.
# NOTE(review): this assumes rows 0..1,999,999 are exactly the is_val == 0 rows and
# rows 2,000,000..2,049,999 exactly the is_val == 1 rows — confirm against the data.
train_data = lgb.Dataset(spr_df[:2000000, :], label=df.loc[df.is_val == 0, 'label'])
val_data = lgb.Dataset(spr_df[2000000:2050000, :], label=df.loc[df.is_val == 1, 'label'])

# Previously tried and currently disabled: min_data_in_leaf=300, max_depth=7.
# NOTE(review): bagging_fraction is given without bagging_freq — check the LightGBM
# parameter docs on whether bagging is active in that configuration.
cv_params = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=127,
    learning_rate=0.5,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    lambda_l1=0.2,
    lambda_l2=0.2,
    seed=2018,
)

bst = lgb.train(
    cv_params,
    train_data,
    num_boost_round=300,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    feval=lgb_f1,
    early_stopping_rounds=100,
    verbose_eval=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.503161	train's f1: 0.614253	val's binary_logloss: 0.527631	val's f1: 0.593303
[200]	train's binary_logloss: 0.4856	train's f1: 0.636195	val's binary_logloss: 0.519668	val's f1: 0.606231
[300]	train's binary_logloss: 0.476038	train's f1: 0.646478	val's binary_logloss: 0.51653	val's f1: 0.61117
Early stopping, best iteration is:
[294]	train's binary_logloss: 0.476432	train's f1: 0.646406	val's binary_logloss: 0.516676	val's f1: 0.61188


In [29]:
# Run the trained booster with pred_leaf=True on the same row ranges that were used
# for the training and validation datasets.
y_pred_train = bst.predict(spr_df[:2000000, :], pred_leaf=True)
y_pred_val = bst.predict(spr_df[2000000:2050000, :], pred_leaf=True)

In [30]:
# Display the shapes of the two pred_leaf outputs.
y_pred_train.shape
y_pred_val.shape

(2000000, 294)

(50000, 294)

In [32]:
# Persist the pred_leaf outputs as .npy files under ./feature/.
np.save('./feature/lgb_leaf_train.npy', y_pred_train)
np.save('./feature/lgb_leaf_val.npy', y_pred_val)

## Feature selection

In [11]:
import lightgbm as lgb
from sklearn.metrics import f1_score

In [12]:
def lgb_f1(y_pred, data):
    """F1 evaluation callback for ``lgb.train(feval=...)``.

    Predictions are rounded with ``np.round`` and compared with
    ``data.get_label()`` through ``f1_score``.

    Returns:
        ``('f1', value, True)``.
    """
    rounded = np.round(y_pred)
    value = f1_score(data.get_label(), rounded)
    return 'f1', value, True

In [13]:
# Display the summary (shape, dtype, stored elements, format) of the current feature matrix.
spr_df

<2100000x435624 sparse matrix of type '<class 'numpy.int32'>'
	with 6300000 stored elements in COOrdinate format>

In [15]:
# Baseline run: train on the current feature matrix and report the validation F1.
# NOTE(review): bagging_fraction is given without bagging_freq — check the LightGBM
# parameter docs on whether bagging is active in that configuration.
cv_params = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=255,
    learning_rate=0.5,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    lambda_l1=0.2,
    lambda_l2=0.2,
    seed=2018,
)

# Cast to float64 and make sure the matrix is in CSC format before slicing rows.
spr_df = spr_df.astype(np.float64).tocsc()

# NOTE(review): features are sliced by position, labels by is_val — assumes the
# two selections cover the same rows in the same order; confirm.
train_data = lgb.Dataset(spr_df[:2000000, :], label=df.loc[df.is_val == 0, 'label'])
val_data = lgb.Dataset(spr_df[2000000:2050000, :], label=df.loc[df.is_val == 1, 'label'])
bst = lgb.train(
    cv_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    feval=lgb_f1,
    early_stopping_rounds=100,
    verbose_eval=0,
)
print('origin porformance:', bst.best_score['val']['f1'])

In [16]:
# Score each candidate column by the validation F1 obtained when that single
# column is appended to the current sparse matrix. Results go into
# feature_importance as {column name: F1}.
feature_importance = {}
tmp_feas = list(feature6.columns)

for i in tqdm_notebook(tmp_feas):
    # Current matrix plus the one candidate column, as float64 CSC.
    candidate = csc_matrix(df[i]).transpose()
    spr_tmp = hstack((spr_df, candidate)).astype(np.float64).tocsc()

    train_data = lgb.Dataset(spr_tmp[:2000000, :], label=df.loc[df.is_val == 0, 'label'])
    val_data = lgb.Dataset(spr_tmp[2000000:2050000, :], label=df.loc[df.is_val == 1, 'label'])
    bst = lgb.train(
        cv_params,
        train_data,
        num_boost_round=100,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'val'],
        feval=lgb_f1,
        early_stopping_rounds=100,
        verbose_eval=0,
    )

    val_f1 = bst.best_score['val']['f1']
    feature_importance[i] = val_f1
    print(i, val_f1)

A Jupyter Widget

title_prefix_distance 0.6065538527686497
title_prefix_cos_distance 0.6091805298829328
title_query_str_distance 0.5649487336185238
title_query_str_cos_distance 0.5671591672263264
prefix_has_symbol 0.5423157894736842
title_has_symbol 0.552434392855921



In [19]:
# Keep the features whose validation F1 is at least 0.55, best first.
#
# feature_importance is a {feature: f1} dict when it comes from the scoring loop
# and a list of (feature, f1) pairs when it comes from the hard-coded snapshot.
# Iterating the dict directly yields bare strings, so it is normalised to pairs
# first; otherwise `x[1]` would be a character and the comparison would fail.
if hasattr(feature_importance, 'items'):
    scored_pairs = list(feature_importance.items())
else:
    scored_pairs = list(feature_importance)

good_fea = [
    name
    for name, f1 in sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)
    if f1 >= 0.55
]
len(good_fea)
good_fea

58

['title_query_dict_cos_distance',
 'title_max_distance',
 'title_rank_in_query_origin',
 'title_max_cos_distance',
 'title_rank_in_query_origin_count',
 'title_rank_in_query_origin_tag_count',
 'title_rate',
 'title_in_query_count',
 'title_in_query',
 'title_in_query_tag_count',
 'title_rank_in_query',
 'rank_first_origin_rate_sub',
 'title_prefix_cos_distance',
 'title_rate_max_sub',
 'title_rank_in_query_count',
 'title_rank_in_query_tag_count',
 'title_prefix_distance',
 'title_str_cos_distance',
 'title_str_distance',
 'title_origin_str_cos_distance',
 'title_origin_str_distance',
 'is_max_rate_tag_count',
 'is_max_rate_count',
 'is_max_rate',
 'title_equal_prefix_tag_count',
 'title_equal_prefix_count',
 'title_equal_prefix',
 'prefix_inter_title_url_len_rate_title_url',
 'prefix_inter_title_len_rate_title',
 'title_first_distance',
 'prefix_tag_count',
 'title_query_dict_distance',
 'title_startswith_prefix_tag_count',
 'title_startswith_prefix_count',
 'title_startswith_prefix'

In [18]:
# Hard-coded snapshot of (feature name, validation F1) pairs. This rebinds
# feature_importance to a list of tuples; the pairs are not globally sorted by score.
feature_importance = [('title_query_dict_cos_distance', 0.6204140256658445),('title_max_distance', 0.619008525418377),('title_rank_in_query_origin', 0.6173192163194337),('title_max_cos_distance', 0.6163573709335779),('title_rank_in_query_origin_count', 0.6142071723103836),('title_rank_in_query_origin_tag_count', 0.6136491118728576),('title_rate', 0.6129634907064374),('title_in_query_count', 0.6128478186190313),('title_in_query', 0.6128478186190313),('title_in_query_tag_count', 0.6127627580589582),('title_rank_in_query', 0.6123902811347156),('rank_first_origin_rate_sub', 0.6122913505311077),('title_rate_max_sub', 0.608851025812833),('title_rank_in_query_count', 0.6083167809316332),('title_rank_in_query_tag_count', 0.6082040409880826),('title_str_cos_distance', 0.6062754941951678),('title_str_distance', 0.6040697490426912),('title_origin_str_cos_distance', 0.6029793977812996),('title_origin_str_distance', 0.6028838535889717),('is_max_rate_tag_count', 0.6016869117790731),('is_max_rate_count', 0.600767629456154),('is_max_rate', 0.600767629456154),('title_equal_prefix_tag_count', 0.5987489391129411),('title_equal_prefix_count', 0.5971060081786727),('title_equal_prefix', 0.5971060081786727),('title_first_distance', 0.5820831828775892),('prefix_tag_count', 0.5810161818237435),('title_query_dict_distance', 0.5798574445617739),('title_startswith_prefix_tag_count', 0.575589824700145),('title_startswith_prefix_count', 0.5746374657285369),('title_startswith_prefix', 0.5746374657285369),('title_first_cos_distance', 0.5730636145861059),('rate_big5_tag_count', 0.5645436548569869),('rate_big5_count', 0.5642532545187412),('rate_big5', 0.5642532545187412),('title_len', 0.5612982408291286),('title_endswith_prefix_tag_count', 0.5603760186656748),('prefix_in_title_count', 0.5602442333785618),('prefix_in_title', 0.5602442333785618),('prefix_in_title_tag_count', 0.5601519056015192),('is_first_rate_count', 0.5556928144945439),('is_first_rate', 0.5556928144945439),('dict_len_tag_count', 
0.5540346314420642),('is_first_rate_tag_count', 0.5540303436887192),('tag_rank_query_re', 0.5518176885512751),('title_tag_count', 0.5478763681420802),('prefix_count', 0.5477649975893656),('title_count', 0.5476510993752373),('prefix_len', 0.5475966398271115),('max_title_rate', 0.5443090845805778),('rank_first_origin_rate', 0.5442124708096615),('title_rank_query_re', 0.5431192660550459),('title_tag_rank_query_re', 0.5422568620806506),('prefix_len_tag_count', 0.5422209774541381),('tag_count', 0.5419159569226437),('dict_len', 0.5418097707997477),('prefix_rank_query_re', 0.541692573402418),('dict_len_count', 0.5413581330057905),('prefix_len_count', 0.5408330714709682),('title_rank_query', 0.5407931912518749),('prefix_title_count', 0.540550033367286),('tag_rank_query', 0.5401511157968722),('title_endswith_prefix_count', 0.5381520384546911),('title_endswith_prefix', 0.5381520384546911),('title_tag_rank_query', 0.5341052631578947),('prefix_rank_query', 0.5332437402744378),('prefix_inter_title_url_len_rate_title_url', 0.5883230135831685),('prefix_inter_title_len_rate_title', 0.5849013681713302),('prefix_inter_title_len', 0.5577520570862782),('prefix_inter_title_url_len', 0.5577520570862782),('title_has_num', 0.5553931400210762),('title_url_has_num', 0.5528372061282286),('prefix_inter_title_len_rate_prefix', 0.5519949951341582),('prefix_inter_title_url_len_rate_prefix', 0.5519949951341582),('query_values_sum', 0.5480669119178178),('prefix_has_letter', 0.5424572971835432),('title_has_letter', 0.5422515440763617),('title_url_has_letter', 0.5422515440763617),('prefix_has_url', 0.5420468903551874),('prefix_has_num', 0.5417295211366427),('title_prefix_cos_distance', 0.6091805298829328),('title_prefix_distance', 0.6065538527686497),('title_query_str_cos_distance', 0.5671591672263264),('title_query_str_distance', 0.5649487336185238),('title_has_symbol', 0.552434392855921),('prefix_has_symbol', 0.5423157894736842)]

In [34]:
# Show the scores best-first. dict() accepts both the {feature: f1} dict and the
# list of (feature, f1) pairs, so this works whichever form feature_importance has
# (a plain .items() call fails on the list form).
sorted(dict(feature_importance).items(), key=lambda x: x[1], reverse=True)

[('title_query_dict_cos_distance', 0.6204140256658445),
 ('title_max_distance', 0.619008525418377),
 ('title_rank_in_query_origin', 0.6173192163194337),
 ('title_max_cos_distance', 0.6163573709335779),
 ('title_rank_in_query_origin_count', 0.6142071723103836),
 ('title_rank_in_query_origin_tag_count', 0.6136491118728576),
 ('title_rate', 0.6129634907064374),
 ('title_in_query_count', 0.6128478186190313),
 ('title_in_query', 0.6128478186190313),
 ('title_in_query_tag_count', 0.6127627580589582),
 ('title_rank_in_query', 0.6123902811347156),
 ('rank_first_origin_rate_sub', 0.6122913505311077),
 ('title_rate_max_sub', 0.608851025812833),
 ('title_rank_in_query_count', 0.6083167809316332),
 ('title_rank_in_query_tag_count', 0.6082040409880826),
 ('title_str_cos_distance', 0.6062754941951678),
 ('title_str_distance', 0.6040697490426912),
 ('title_origin_str_cos_distance', 0.6029793977812996),
 ('title_origin_str_distance', 0.6028838535889717),
 ('is_max_rate_tag_count', 0.6016869117790731),

In [17]:
# Show the scores best-first. dict() accepts both the {feature: f1} dict and the
# list of (feature, f1) pairs, so this works whichever form feature_importance has
# (a plain .items() call fails on the list form).
sorted(dict(feature_importance).items(), key=lambda x: x[1], reverse=True)

[('prefix_inter_title_url_len_rate_title_url', 0.5883230135831685),
 ('prefix_inter_title_len_rate_title', 0.5849013681713302),
 ('prefix_inter_title_len', 0.5577520570862782),
 ('prefix_inter_title_url_len', 0.5577520570862782),
 ('title_has_num', 0.5553931400210762),
 ('title_url_has_num', 0.5528372061282286),
 ('prefix_inter_title_len_rate_prefix', 0.5519949951341582),
 ('prefix_inter_title_url_len_rate_prefix', 0.5519949951341582),
 ('query_values_sum', 0.5480669119178178),
 ('prefix_has_letter', 0.5424572971835432),
 ('title_has_letter', 0.5422515440763617),
 ('title_url_has_letter', 0.5422515440763617),
 ('prefix_has_url', 0.5420468903551874),
 ('prefix_has_num', 0.5417295211366427)]

In [17]:
# Show the scores best-first. dict() accepts both the {feature: f1} dict and the
# list of (feature, f1) pairs, so this works whichever form feature_importance has
# (a plain .items() call fails on the list form).
sorted(dict(feature_importance).items(), key=lambda x: x[1], reverse=True)

[('title_prefix_cos_distance', 0.6091805298829328),
 ('title_prefix_distance', 0.6065538527686497),
 ('title_query_str_cos_distance', 0.5671591672263264),
 ('title_query_str_distance', 0.5649487336185238),
 ('title_has_symbol', 0.552434392855921),
 ('prefix_has_symbol', 0.5423157894736842)]

## Good features

In [10]:
import lightgbm as lgb
from sklearn.metrics import f1_score

In [11]:
def lgb_f1(y_pred, data):
    """Evaluation function for ``lgb.train``: F1 of the rounded predictions.

    ``y_pred`` is rounded with ``np.round`` and scored with ``f1_score``
    against the labels returned by ``data.get_label()``.

    Returns:
        A tuple ``('f1', f1_value, True)``.
    """
    truth = data.get_label()
    f1_value = f1_score(truth, np.round(y_pred))
    return 'f1', f1_value, True

In [12]:
# Hard-coded list of the 32 feature columns appended to the sparse matrix in the next cell.
# NOTE(review): this overrides any good_fea computed from feature_importance — confirm
# that this hand-maintained list is the intended selection.
good_fea = ['title_query_dict_cos_distance','title_max_distance','title_rank_in_query_origin','title_max_cos_distance','title_rank_in_query_origin_count','title_rank_in_query_origin_tag_count','title_rate','title_in_query_count','title_in_query','title_in_query_tag_count','title_rank_in_query','rank_first_origin_rate_sub','title_rate_max_sub','title_rank_in_query_count','title_rank_in_query_tag_count','title_str_cos_distance','title_str_distance','title_origin_str_cos_distance','title_origin_str_distance','is_max_rate_tag_count','is_max_rate_count','is_max_rate','title_equal_prefix_tag_count','title_equal_prefix_count','title_equal_prefix','title_first_distance','prefix_tag_count','title_query_dict_distance','title_startswith_prefix_tag_count','title_startswith_prefix_count','title_startswith_prefix','title_first_cos_distance']

In [13]:
# Append all selected feature columns with one hstack call. Stacking inside a
# loop rebuilt (and re-copied) the whole matrix once per feature.
# NOTE(review): spr_df is extended in place, so re-running this cell appends the
# columns a second time.
good_blocks = [csc_matrix(df[i]).transpose() for i in good_fea]
spr_df = hstack([spr_df] + good_blocks)

# LightGBM input: float64 values, CSC layout.
spr_df = spr_df.astype(np.float64)
spr_df = spr_df.tocsc()

In [18]:
# Final run on the id/tag matrix extended with the selected features.
# NOTE(review): bagging_fraction is given without bagging_freq — check the LightGBM
# parameter docs on whether bagging is active in that configuration.
cv_params = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=63,
    learning_rate=0.3,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    lambda_l1=0.2,
    lambda_l2=0.2,
    seed=2018,
)

# NOTE(review): features are sliced by position, labels by is_val — assumes the
# two selections cover the same rows in the same order; confirm.
train_data = lgb.Dataset(spr_df[:2000000, :], label=df.loc[df.is_val == 0, 'label'])
val_data = lgb.Dataset(spr_df[2000000:2050000, :], label=df.loc[df.is_val == 1, 'label'])
bst = lgb.train(
    cv_params,
    train_data,
    num_boost_round=10000,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    feval=lgb_f1,
    early_stopping_rounds=100,
    verbose_eval=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.49272	train's f1: 0.650941	val's binary_logloss: 0.512573	val's f1: 0.644021
[200]	train's binary_logloss: 0.478078	train's f1: 0.668619	val's binary_logloss: 0.5009	val's f1: 0.658275
[300]	train's binary_logloss: 0.469041	train's f1: 0.677875	val's binary_logloss: 0.494272	val's f1: 0.666272
[400]	train's binary_logloss: 0.462899	train's f1: 0.683956	val's binary_logloss: 0.489751	val's f1: 0.6711
[500]	train's binary_logloss: 0.458542	train's f1: 0.688352	val's binary_logloss: 0.486099	val's f1: 0.673855
[600]	train's binary_logloss: 0.454394	train's f1: 0.692689	val's binary_logloss: 0.483349	val's f1: 0.677704
[700]	train's binary_logloss: 0.450261	train's f1: 0.697105	val's binary_logloss: 0.480596	val's f1: 0.681139
[800]	train's binary_logloss: 0.447386	train's f1: 0.699779	val's binary_logloss: 0.479196	val's f1: 0.68308
[900]	train's binary_logloss: 0.444656	train's f1: 0.702189	val