In [1]:
import pandas as pd
import lightgbm as lgb
import xgboost as xg
import numpy as np
import warnings
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:
warnings.filterwarnings("ignore") 

In [3]:
target = ["is_trade"]
categorical_features = ["item_brand_id", "item_city_id"]

In [4]:
data = pd.read_csv("data3.csv")

In [5]:
label = pd.read_csv("label3.csv", names=target)

In [6]:
all_data = pd.concat([data, label], axis=1)

In [7]:
all_data_copy = all_data.replace(-1, np.nan)

In [8]:
nan = all_data_copy.apply(lambda x:x.isnull().sum()/len(x))

In [9]:
drop_col = nan[nan>0.6].index.tolist()

In [10]:
all_data.drop(drop_col, axis=1, inplace=True)

In [11]:
continue_feature_data = all_data.select_dtypes(include=["float64"])
continue_feature_data.replace(-1.0, np.nan, inplace=True)

In [12]:
continue_feature_data.fillna(continue_feature_data.mean(), inplace=True)

In [13]:
all_data.loc[:, continue_feature_data.columns] = continue_feature_data

In [14]:
all_features = all_data.columns.tolist()

In [15]:
no_use = ["day", "is_trade"]
for feat in no_use:
    all_features.remove(feat)

In [16]:
use_features = all_features

In [17]:
train_data, test_data = all_data.query("day<24"), all_data.query("day==24")

In [18]:
train_data2, val_data = train_data.query("day<23"), train_data.query("day==23")

In [19]:
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=2000,
                               max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                               min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
                               colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)

In [20]:
model = lgb_clf.fit(train_data2[use_features], train_data2[target], \
            eval_set=[(train_data2[use_features], train_data2[target]), \
                      (val_data[use_features], val_data[target])],\
            early_stopping_rounds=200,\
            categorical_feature=categorical_features)

[1]	valid_0's binary_logloss: 0.0949118	valid_1's binary_logloss: 0.0865609
Training until validation scores don't improve for 200 rounds.
[2]	valid_0's binary_logloss: 0.0940822	valid_1's binary_logloss: 0.0859907
[3]	valid_0's binary_logloss: 0.0933339	valid_1's binary_logloss: 0.0854829
[4]	valid_0's binary_logloss: 0.092665	valid_1's binary_logloss: 0.0850785
[5]	valid_0's binary_logloss: 0.0920815	valid_1's binary_logloss: 0.084703
[6]	valid_0's binary_logloss: 0.0915485	valid_1's binary_logloss: 0.0843944
[7]	valid_0's binary_logloss: 0.091048	valid_1's binary_logloss: 0.0841048
[8]	valid_0's binary_logloss: 0.0906081	valid_1's binary_logloss: 0.0838566
[9]	valid_0's binary_logloss: 0.0901846	valid_1's binary_logloss: 0.0836266
[10]	valid_0's binary_logloss: 0.0897927	valid_1's binary_logloss: 0.083399
[11]	valid_0's binary_logloss: 0.0894356	valid_1's binary_logloss: 0.08322
[12]	valid_0's binary_logloss: 0.0890761	valid_1's binary_logloss: 0.0830421
[13]	valid_0's binary_loglos

[108]	valid_0's binary_logloss: 0.0756566	valid_1's binary_logloss: 0.0814292
[109]	valid_0's binary_logloss: 0.075588	valid_1's binary_logloss: 0.0814361
[110]	valid_0's binary_logloss: 0.0755068	valid_1's binary_logloss: 0.081447
[111]	valid_0's binary_logloss: 0.0754302	valid_1's binary_logloss: 0.0814559
[112]	valid_0's binary_logloss: 0.0753483	valid_1's binary_logloss: 0.0814699
[113]	valid_0's binary_logloss: 0.0752728	valid_1's binary_logloss: 0.0814829
[114]	valid_0's binary_logloss: 0.075189	valid_1's binary_logloss: 0.0814872
[115]	valid_0's binary_logloss: 0.0751222	valid_1's binary_logloss: 0.0814697
[116]	valid_0's binary_logloss: 0.0750635	valid_1's binary_logloss: 0.0814686
[117]	valid_0's binary_logloss: 0.0749924	valid_1's binary_logloss: 0.081471
[118]	valid_0's binary_logloss: 0.0749289	valid_1's binary_logloss: 0.0814766
[119]	valid_0's binary_logloss: 0.0748584	valid_1's binary_logloss: 0.0814907
[120]	valid_0's binary_logloss: 0.0747945	valid_1's binary_logloss: 

[215]	valid_0's binary_logloss: 0.0689992	valid_1's binary_logloss: 0.0820931
[216]	valid_0's binary_logloss: 0.0689519	valid_1's binary_logloss: 0.0820972
[217]	valid_0's binary_logloss: 0.0689022	valid_1's binary_logloss: 0.0821098
[218]	valid_0's binary_logloss: 0.0688525	valid_1's binary_logloss: 0.0821166
[219]	valid_0's binary_logloss: 0.0688026	valid_1's binary_logloss: 0.0821159
[220]	valid_0's binary_logloss: 0.0687524	valid_1's binary_logloss: 0.0821227
[221]	valid_0's binary_logloss: 0.0686956	valid_1's binary_logloss: 0.0821288
[222]	valid_0's binary_logloss: 0.0686526	valid_1's binary_logloss: 0.0821371
[223]	valid_0's binary_logloss: 0.0686039	valid_1's binary_logloss: 0.0821581
[224]	valid_0's binary_logloss: 0.0685616	valid_1's binary_logloss: 0.082158
[225]	valid_0's binary_logloss: 0.0685249	valid_1's binary_logloss: 0.0821601
[226]	valid_0's binary_logloss: 0.0684892	valid_1's binary_logloss: 0.082165
[227]	valid_0's binary_logloss: 0.06845	valid_1's binary_logloss: 

In [26]:
best_iter = model.best_iteration_



In [27]:
best_iter

62

In [28]:
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=219,
                               max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                               min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
                               colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)

In [29]:
lgb_clf.fit(train_data[use_features], train_data[target], categorical_feature=categorical_features)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
        learning_rate=0.05, max_bin=425, max_depth=-1,
        min_child_samples=10, min_child_weight=5, min_split_gain=0,
        n_estimators=219, n_jobs=-1, nthread=-1, num_leaves=48,
        objective='binary', random_state=None, reg_alpha=3, reg_lambda=5,
        seed=1000, silent=True, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

In [30]:
pre = lgb_clf.predict_proba(test_data[use_features])

In [31]:
log_loss(test_data[target], pre)

0.08178898551309843

In [32]:
roc_auc_score(test_data[target], pre[:,1])

0.7085618924474264

In [30]:
new_features = set(['item_brand_id', 'item_sales_level', 'item_collected_level', 'user_age_level',
'user_star_level', 'user_count_10_bf', 'click_item_id_user_id_tab', 'u_day_diffTime_last',
'diffWithLastView_item_brand_id', 'item_category_list_day__active_item_id__num',
'i_day_diffTime_last', 'item_price_level_mean_by_item_brand_id',
'user_id_day__active_item_city_id__num', 'user_id_lasttime_diff', 'i_day_diffTime_first',
'item_collected_level_mean_by_item_city_id'])

In [31]:
all_features = set(all_features)

In [32]:
selsect_features = list(all_features.intersection(new_features))

In [33]:
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=2000,
                               max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                               min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
                               colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)

In [34]:
model = lgb_clf.fit(train_data2[selsect_features], train_data2[target], \
            eval_set=[(train_data2[selsect_features], train_data2[target]), \
                      (val_data[selsect_features], val_data[target])],\
            early_stopping_rounds=200,\
            categorical_feature=categorical_features)

TypeError: Wrong type(str) or unknown name(item_city_id) in categorical_feature