In [2]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) for all plots
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Silence all Python warnings for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Font setup so Chinese text renders in figures
import matplotlib
# The former matplotlib.use('qt4agg') call was removed: the notebook already selects
# the inline backend via `%matplotlib inline` and pyplot is imported before this point,
# so switching to Qt4Agg here would either undo inline rendering or fail where Qt4
# is not available.
# Default font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being rendered as a box
matplotlib.rcParams['axes.unicode_minus'] = False

In [3]:
# Training labels: only the 'click' column is read from the tab-separated train file.
label = pd.read_csv('./data/round1_iflyad_train.txt', sep='\t', usecols=['click'])
# Feature names loaded from a saved NumPy array
# (presumably one name per column of the sparse feature matrix -- TODO confirm).
name_col = list(np.load('./feature/sparse_fea_name.np.npy'))
# Number of training rows. Later cells use spr_df[:train_len] as the training slice
# together with label.click, so this must equal the number of labels; derive it from
# the data instead of hard-coding 1001650.
train_len = len(label)

In [4]:
from scipy import sparse
import lightgbm as lgb

In [46]:
# Load the saved sparse feature matrix; later cells slice its first `train_len`
# rows for training and the remaining rows for prediction.
sparse_matrix_path = './data/sparse_df_content_noinstanceid.npz'
spr_df = sparse.load_npz(sparse_matrix_path)
# Left disabled by the author:
# name_col.remove('instance_id')

In [49]:
# Replace the stored non-zero values with a float64 copy.
spr_df.data = spr_df.data.astype(np.float64)

In [50]:
# LightGBM parameters used for both the cross-validation run and the final fit.
cv_params = dict(
    objective='binary',
    metric='binary_logloss',
    min_data_in_leaf=300,
    num_leaves=15,
    max_depth=5,
    learning_rate=0.05,
    feature_fraction=0.8,
    # NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs,
    # bagging_fraction only takes effect when bagging_freq > 0 -- confirm intent.
    bagging_fraction=0.8,
    lambda_l1=0.2,
    lambda_l2=0.2,
    seed=2018,
)

# Author's log of earlier runs (presumably "<variant>:<rounds> <cv log-loss>"):
# content:2253 0.4173767716758179
# no_content:3247 0.41655446978701055
# content no_instance_id:2124 0.4174006524561557

# Training slice = first `train_len` rows of the sparse matrix, labelled with `click`.
train_data = lgb.Dataset(spr_df[:train_len, ], label=label.click)
train_data.set_feature_name(name_col)

# 3-fold CV with early stopping (patience 100), logging every 100 rounds.
bst = lgb.cv(
    cv_params,
    train_data,
    num_boost_round=10000,
    nfold=3,
    early_stopping_rounds=100,
    verbose_eval=100,
    seed=2018,
)

# Report the length of the CV history and its final mean log-loss.
cv_logloss = bst['binary_logloss-mean']
print('full features:', len(cv_logloss), cv_logloss[-1])

[100]	cv_agg's binary_logloss: 0.426093 + 6.08436e-05
[200]	cv_agg's binary_logloss: 0.422111 + 9.09832e-05
[300]	cv_agg's binary_logloss: 0.420628 + 0.000123998
[400]	cv_agg's binary_logloss: 0.419784 + 0.000115317
[500]	cv_agg's binary_logloss: 0.419189 + 0.000136964
[600]	cv_agg's binary_logloss: 0.418797 + 0.000146487
[700]	cv_agg's binary_logloss: 0.418498 + 0.00014391
[800]	cv_agg's binary_logloss: 0.418267 + 0.000129932
[900]	cv_agg's binary_logloss: 0.41809 + 0.000136038
[1000]	cv_agg's binary_logloss: 0.417965 + 0.000135858
[1100]	cv_agg's binary_logloss: 0.417847 + 0.000128018
[1200]	cv_agg's binary_logloss: 0.417754 + 0.000123504
[1300]	cv_agg's binary_logloss: 0.417674 + 0.000133176
[1400]	cv_agg's binary_logloss: 0.417605 + 0.000136355
[1500]	cv_agg's binary_logloss: 0.41755 + 0.000130366
[1600]	cv_agg's binary_logloss: 0.417519 + 0.000136987
[1700]	cv_agg's binary_logloss: 0.417491 + 0.000148554
[1800]	cv_agg's binary_logloss: 0.417464 + 0.000158124
[1900]	cv_agg's binary

In [51]:
# Refit on the whole training Dataset for as many rounds as the CV history contains.
# Note: `bst` is rebound here from the CV result dict to the trained Booster, so this
# cell cannot be re-run without re-running the CV cell first.
best_rounds = len(bst['binary_logloss-mean'])
bst = lgb.train(cv_params, train_data, num_boost_round=best_rounds, verbose_eval=100)

## feature selection

In [30]:
# Map every feature name of the trained booster to its importance score.
feature_names = bst.feature_name()
importances = bst.feature_importance()
fea_imp = dict(zip(feature_names, importances))

# Peek at the entry ranked 2001st by importance (index 2000 of the descending sort).
ranked_fea = sorted(fea_imp.items(), key=lambda pair: pair[1], reverse=True)
ranked_fea[2000]

('Column_482', 0)

In [40]:
# Column positions of the features with importance >= 1; used below to slice spr_df.
# Positions are taken directly from the importance array instead of being parsed out
# of the feature names: parsing only works for LightGBM's default 'Column_<i>' names
# and breaks once custom names have been set with set_feature_name.
good_fea = [idx for idx, importance in enumerate(bst.feature_importance()) if importance >= 1]
len(good_fea)

1595

In [34]:
# Display the training rows restricted to the selected feature columns.
spr_df[:train_len, good_fea]

<1001650x876 sparse matrix of type '<class 'numpy.float64'>'
	with 75286733 stored elements in Compressed Sparse Column format>

In [12]:
# np.save('./feature/fea_imp_10.np', np.array(good_fea))

In [41]:
# LightGBM parameters for the reduced feature set (same as the full-feature run
# except max_depth is 7 instead of 5).
cv_params =  {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'min_data_in_leaf': 300,
    'num_leaves': 15,
    'max_depth': 7,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    # NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs,
    # bagging_fraction only takes effect when bagging_freq > 0 -- confirm intent.
    'bagging_fraction': 0.8,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
    'seed': 2018
    }

# Author's log of earlier runs (presumably "<variant>:<rounds> <cv log-loss>"):
# content:2253 0.4173767716758179
# no_content:3247 0.41655446978701055
# good>=10:3972 0.41587608675203386

# Keep the selected columns sparse: the full-feature run already feeds the sparse
# matrix to LightGBM directly, and densifying ~1e6 rows x len(good_fea) float64
# columns would need several GB of RAM for no benefit.
# Feature names are left at LightGBM's defaults here; name_col was applied to the
# full column set and is not re-applied to this subset.
train_data = lgb.Dataset(spr_df[:train_len, good_fea], label=label.click)

# 3-fold CV with early stopping (patience 100), logging every 100 rounds.
bst = lgb.cv(cv_params, train_data, nfold=3, early_stopping_rounds=100, verbose_eval=100, num_boost_round=10000, seed=2018)
print('good features:', len(bst['binary_logloss-mean']), bst['binary_logloss-mean'][-1])

[100]	cv_agg's binary_logloss: 0.424268 + 6.04845e-05
[200]	cv_agg's binary_logloss: 0.420938 + 0.000101993
[300]	cv_agg's binary_logloss: 0.419631 + 0.000129766
[400]	cv_agg's binary_logloss: 0.418889 + 0.000120081
[500]	cv_agg's binary_logloss: 0.418419 + 0.00012162
[600]	cv_agg's binary_logloss: 0.418109 + 0.000157707
[700]	cv_agg's binary_logloss: 0.417901 + 0.000166423
[800]	cv_agg's binary_logloss: 0.417768 + 0.000172144
[900]	cv_agg's binary_logloss: 0.417671 + 0.000163151
[1000]	cv_agg's binary_logloss: 0.417584 + 0.000172872
[1100]	cv_agg's binary_logloss: 0.417523 + 0.000177609
[1200]	cv_agg's binary_logloss: 0.417473 + 0.000176013
[1300]	cv_agg's binary_logloss: 0.417439 + 0.000177537
[1400]	cv_agg's binary_logloss: 0.417406 + 0.000187188
[1500]	cv_agg's binary_logloss: 0.417378 + 0.000192172
[1600]	cv_agg's binary_logloss: 0.417358 + 0.000192455
[1700]	cv_agg's binary_logloss: 0.417348 + 0.000193428
[1800]	cv_agg's binary_logloss: 0.417349 + 0.000195676
good features: 1726 

In [42]:
# Refit on the selected-feature Dataset for as many rounds as the CV history contains.
# Note: `bst` is rebound here from the CV result dict to the trained Booster, so this
# cell cannot be re-run without re-running the CV cell first.
best_rounds = len(bst['binary_logloss-mean'])
bst = lgb.train(cv_params, train_data, num_boost_round=best_rounds, verbose_eval=100)

## result

In [43]:
# Read the test instance ids, then score the rows of spr_df that follow the
# training slice, restricted to the selected feature columns.
test = pd.read_csv('./data/round1_iflyad_test_feature.txt', sep='\t', usecols=['instance_id'])
test_features = spr_df[train_len:, good_fea].toarray()
test['predicted_score'] = bst.predict(test_features)

In [44]:
# Sanity-check the prediction frame: its dimensions and its first rows.
test.shape
test.head()

(40024, 2)

Unnamed: 0,instance_id,predicted_score
0,6930856710792380886,0.12581
1,5460409694420131920,0.272476
2,982813438159141507,0.092677
3,529991959116679673,0.262487
4,5357053206615171780,0.004271


In [45]:
# Write the predictions as CSV without the row index. Create the output directory
# first so the write cannot fail on a missing './result' after training has finished.
os.makedirs('./result', exist_ok=True)
test.to_csv('./result/lgb_feature_full_sparse_noinstanceid_good1.csv', index=False)