In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import warnings
import time
import sys
import datetime
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import BayesianRidge
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import gc
# Notebook-wide pandas display settings: lift the width/row/column limits and
# allow DataFrame.info() to list up to 500 columns.
for _option_name, _option_value in (
    ('display.width', None),
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_info_columns', 500),
):
    pd.set_option(_option_name, _option_value)

# Seed NumPy's global random generator for this session.
np.random.seed(4950)

In [19]:
# Load the train and test tables; the first CSV column becomes the DataFrame index.
train = pd.read_csv(
    "./data/pre_train_clip.csv",
    index_col=0,
)
test = pd.read_csv(
    "./data/pre_test_clip.csv",
    index_col=0,
)

# Columns that are label-encoded before training.
# The nine `hist_*` columns follow a regular <period>_<date part> naming scheme,
# so they are generated in the order: first, re, now x year, quarter, month.
cats = ['first_active_month', 'feature_1', 'feature_2', 'feature_3']
cats += [
    'hist_{}_{}'.format(period, date_part)
    for period in ('first', 're', 'now')
    for date_part in ('year', 'quarter', 'month')
]

In [20]:
# Stack train on top of test (train rows first) with a fresh 0..n-1 index, so
# each categorical column is encoded with one mapping shared by both sets.
data = pd.concat([train, test]).reset_index(drop=True)

# Replace every categorical column with integer codes fitted on the combined data.
for feat in cats:
    lbe = LabelEncoder()
    encoded_column = lbe.fit_transform(data[feat])
    data[feat] = encoded_column

In [21]:
# Copy the label-encoded categorical columns back onto the training rows.
# `data` is concat([train, test]) with a reset index, so its first len(train)
# rows are the training rows. Slicing by len(train) avoids a hard-coded row
# count, and `.values` assigns by position instead of relying on index labels.
train[cats] = data.iloc[:len(train)][cats].values

In [22]:
# Copy the label-encoded categorical columns back onto the test rows.
# `data` is concat([train, test]), so everything after the first len(train)
# rows belongs to the test set (no hard-coded row count needed).
tmp = data.iloc[len(train):][cats].reset_index(drop=True)
# Positional assignment: correct whatever index `test` carries.
test[cats] = tmp.values

In [23]:
# Sanity check: for each categorical column print the distinct encoded values
# seen in train, then test, then the combined frame.
for col in cats:
    for frame in (train, test, data):
        print(frame[col].unique())

[67 62 57 70 72 58 61 69 59 52 65 63 56 55 71 47 53 68 50 60 48 18 51 54
 27 66 73 64 45 42 38 33 39 31 32 28 43 30 49 44 40 46 26 29 35 36 34 25
 37 19 41 23 21 20  6  9 11  7 10  5 13 24 22 17 12 14  0 15  4  3 16 74
  8 75  1]
[65 62 69 73 49 68 71 63 52 64 53 61 57 72 70 59 67 54 16 37 56 66 58 60
 40 51 55 22 39 43 47 35 34 46 50 38 19 42 45 48 29 28 31 44  8 41 32 27
 36 20 30 33 11 25 74  3 21  7 24 23 26 15 17 12 10  5  6  4 13  9 18 14
  0  1  2]
[67 62 57 70 72 58 61 69 59 52 65 63 56 55 71 47 53 68 50 60 48 18 51 54
 27 66 73 64 45 42 38 33 39 31 32 28 43 30 49 44 40 46 26 29 35 36 34 25
 37 19 41 23 21 20  6  9 11  7 10  5 13 24 22 17 12 14  0 15  4  3 16 74
  8 75  1  2]
[4 3 1 0 2]
[2 1 4 0 3]
[4 3 1 0 2]
[1 0 2]
[2 0 1]
[1 0 2]
[1 0]
[1 0]
[1 0]
[6 5 4 2 3 1 0 7]
[6 4 5 2 3 1 7 0]
[6 5 4 2 3 1 0 7]
[1 0 2 3]
[1 0 2 3]
[1 0 2 3]
[ 5  0  7  8 10 11  9  2  3  1  6  4]
[ 3  0  7 11  6  9  1  2 10  8  5  4]
[ 5  0  7  8 10 11  9  2  3  1  6  4]
[1 0]
[0 1]
[1 0]
[0 2 3 1]
[3 

In [24]:
# Binary flag (1/0) marking rows whose target is below -20; it is the
# stratification label for the cross-validation split further down.
fold_target = (train['target'] < -20).astype(int)

In [25]:
# Model inputs: every training column except the identifier and the label.
non_feature_cols = ('card_id', 'target')
features = [col for col in train.columns if col not in non_feature_cols]
print(len(features), features)

302 ['first_active_month', 'feature_1', 'feature_2', 'feature_3', 'authorized_flag_mean', 'hist_transactions_count', 'hist_is_month_start_mean', 'hist_weekend_mean', 'hist_category_1_mean', 'hist_category_2_nunique', 'hist_category_3_nunique', 'hist_state_id_nunique', 'hist_city_id_nunique', 'hist_subsector_id_nunique', 'hist_merchant_category_id_nunique', 'hist_merchant_id_nunique', 'hist_quarter_nunique', 'hist_month_nunique', 'hist_weekofyear_nunique', 'hist_dayofweek_nunique', 'hist_day_nunique', 'hist_hour_nunique', 'hist_a2p_mean', 'hist_a2p_median', 'hist_a2p_max', 'hist_a2p_min', 'hist_a2p_std', 'hist_p2r_mean', 'hist_p2r_median', 'hist_p2r_max', 'hist_p2r_min', 'hist_p2r_std', 'hist_p2now_mean', 'hist_p2now_median', 'hist_p2now_max', 'hist_p2now_min', 'hist_p2now_std', 'hist_month_lag_mean', 'hist_month_lag_median', 'hist_month_lag_max', 'hist_month_lag_min', 'hist_month_lag_std', 'hist_purchase_amount_sum', 'hist_purchase_amount_mean', 'hist_purchase_amount_median', 'hist_pur

In [26]:
# Collect (and print, one per line) the feature columns whose dtype is still
# `object`.
obj_fea = [name for name in features if str(train[name].dtype) == 'object']
for name in obj_fea:
    print(name)

In [27]:
# LightGBM parameters handed to lgb.train for the regression model.
param = dict(
    objective='regression',
    metric='rmse',
    boosting='gbdt',
    num_leaves=31,
    min_data_in_leaf=30,
    max_depth=-1,
    learning_rate=0.01,
    lambda_l1=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    random_state=1024,
    verbosity=-1,
)

# Number of cross-validation folds.
n_fold = 5

# param ={
#         'task': 'train',
#         'boosting': 'goss',
#         'objective': 'regression',
#         'metric': 'rmse',
#         'learning_rate': 0.01,
#         'subsample': 0.9855232997390695,
#         'max_depth': 7,
#         'top_rate': 0.9064148448434349,
#         'num_leaves': 63,
#         'min_child_weight': 41.9612869171337,
#         'other_rate': 0.0721768246018207,
#         'reg_alpha': 9.677537745007898,
#         'colsample_bytree': 0.5665320670155495,
#         'min_split_gain': 9.820197773625843,
#         'reg_lambda': 8.2532317400459,
#         'min_data_in_leaf': 21,
#         'verbose': -1,
#         'seed':int(2**n_fold),
#         'bagging_seed':int(2**n_fold),
#         'drop_seed':int(2**n_fold)
#         }

In [None]:
# Stratified K-fold training of the LightGBM regressor.
# Folds are stratified on `fold_target` (the binary flag), while the model
# itself is trained on the continuous `target`.
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=4590)

oof_train = np.zeros((len(train), 1))              # out-of-fold predictions for train rows
oof_test = np.zeros(len(test))                     # fold-averaged test predictions
oof_test_skf = np.zeros((n_fold, len(test), 1))    # per-fold test predictions
start = time.time()
fold_importances = []                              # one importance frame per fold

target = train['target']
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, fold_target)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds = 200)
    oof_train[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration).reshape(-1,1)

    fold_importance_df = pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    })
    fold_importances.append(fold_importance_df)

    # Predict the test set once per fold and reuse the result for both the
    # per-fold store and the running average.
    test_pred = clf.predict(test[features], num_iteration=clf.best_iteration)
    oof_test_skf[fold_, :] = test_pred.reshape(-1, 1)
    oof_test += test_pred / folds.n_splits

# Concatenate once at the end instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

fold n°1
Training until validation scores don't improve for 200 rounds.


In [None]:
# RMSE between the out-of-fold predictions and the true target.
cv_rmse = mean_squared_error(oof_train, target) ** 0.5
print("CV score: {:<8.5f}".format(cv_rmse))

In [None]:

# Out-of-fold RMSE; also embedded in every output filename below.
mean_loss = np.sqrt(mean_squared_error(oof_train.reshape(-1), target))
print(mean_loss)

# Submission file: one row per test card with the fold-averaged prediction.
sub_df = pd.DataFrame({
    "card_id": test["card_id"].values,
    "target": oof_test,
})
sub_df.to_csv("sub/submission_%s.csv" % mean_loss, index=False)

# Out-of-fold train predictions, stored as a single 'class1' column.
train_prob = pd.DataFrame(oof_train, columns=['class1'])
train_prob.to_csv("oof/train_prob_%s.csv" % mean_loss, index=False)

# Fold-averaged test predictions, stored in the same single-column layout.
test_prob = pd.DataFrame(oof_test, columns=['class1'])
test_prob.to_csv("oof/test_prob_%s.csv" % mean_loss, index=False)



In [None]:
#feature select

def get_feature_importances(data, shuffle, seed=None):
    """Fit a LightGBM model in RF mode and return its per-feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'target' column and every column listed in the
        module-level `features` list.
    shuffle : bool
        When true, the target values are randomly permuted before fitting.
    seed : int or None, optional
        Random state for the target shuffle. With the default ``None`` the
        shuffle draws from NumPy's global random state, as it always did.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'feature', 'importance_gain' and
        'importance_split'.
    """
    # Gather real features
    train_features = features

    # Shuffle target if required
    y = data['target'].copy()
    if shuffle:
        # `seed` was previously accepted but ignored; it now drives the shuffle.
        y = y.sample(frac=1.0, random_state=seed)

    # Fit LightGBM in RF mode
    dtrain = lgb.Dataset(data[train_features], y, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'regression',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': 4590,
        'bagging_freq': 1,
        'n_jobs': 4
    }

    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200)

    # Collect both importance flavours side by side.
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')

    return imp_df


# Seed NumPy's global generator before the importance runs.
np.random.seed(123)
# Get the actual importance, i.e. without shuffling
actual_imp_df = get_feature_importances(data=train, shuffle=False)

# Null importances: repeat the fit nb_runs times on a shuffled target.
nb_runs = 80
start = time.time()
dsp = ''
null_imp_runs = []
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=train, shuffle=True)
    imp_df['run'] = i + 1
    null_imp_runs.append(imp_df)
    # Erase previous message (one backspace per character already shown)
    print('\b' * len(dsp), end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)

# Concatenate once instead of re-copying the accumulated frame on every run.
null_imp_df = pd.concat(null_imp_runs, axis=0)



In [None]:
def _null_importance_score(feature_name, importance_col):
    """Log-ratio of a feature's actual importance to the 75th percentile of its
    null (shuffled-target) importances, for the given importance column."""
    null_values = null_imp_df.loc[null_imp_df['feature'] == feature_name, importance_col].values
    actual_value = actual_imp_df.loc[actual_imp_df['feature'] == feature_name, importance_col].mean()
    # The +1 keeps the denominator away from zero; 1e-10 keeps log() finite at zero.
    return np.log(1e-10 + actual_value / (1 + np.percentile(null_values, 75)))


# One (feature, split_score, gain_score) tuple per distinct feature.
feature_scores = [
    (
        _f,
        _null_importance_score(_f, 'importance_split'),
        _null_importance_score(_f, 'importance_gain'),
    )
    for _f in actual_imp_df['feature'].unique()
]

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])



In [None]:
# Candidate thresholds: the distinct gain scores in ascending order.
thr_list = sorted(set(scores_df['gain_score'].tolist()))
thr_list

In [None]:
def test_feature(test_fea, save_threshold=3.660):
    """Cross-validate the LightGBM model on a feature subset and return its OOF RMSE.

    Parameters
    ----------
    test_fea : list of str
        Column names (present in both `train` and `test`) used as model inputs.
    save_threshold : float, optional
        When the out-of-fold RMSE is below this value, the OOF train
        predictions and averaged test predictions are written under ``oof/``.

    Returns
    -------
    float
        RMSE of the out-of-fold predictions against ``train['target']``.

    Notes
    -----
    Reads the module-level `param`, `train` and `test`; none of them is
    modified (a copy of `param` with ``n_jobs=3`` is used for training).
    """
    params = dict(param, n_jobs=3)  # copy, so the shared `param` stays untouched
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)

    y_all = train['target']
    # Stratify on an existing 'outliers' column when there is one; otherwise
    # derive the same binary flag the main CV uses (target below -20).
    if 'outliers' in train.columns:
        strat_labels = train['outliers']
    else:
        strat_labels = (y_all < -20).astype(int)

    oof_train = np.zeros((len(train), 1))
    oof_test = np.zeros((len(test), 1))

    for idx, (idx_trn, idx_val) in enumerate(kf.split(train, strat_labels)):
        # Runtime message kept as-is; it prints the zero-based fold number.
        print('第 %d fold'%idx)
        # kf.split yields positional indices, so select everything with .iloc.
        tr_x, tr_y = train.iloc[idx_trn][test_fea], y_all.iloc[idx_trn]
        val_x, val_y = train.iloc[idx_val][test_fea], y_all.iloc[idx_val]

        trn_data = lgb.Dataset(tr_x, tr_y)
        val_data = lgb.Dataset(val_x, val_y)

        num_round = 10000
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 500)
        oof_train[idx_val] = clf.predict(val_x, num_iteration=clf.best_iteration).reshape(-1,1)
        oof_test += clf.predict(test[test_fea], num_iteration=clf.best_iteration).reshape(-1,1) / kf.n_splits

    loss = np.sqrt(mean_squared_error(oof_train.reshape(-1), y_all))

    print('mean loss %f'%loss)
    if loss < save_threshold:
        # Name the files after THIS run's loss (the global `mean_loss` was used
        # before, so every saved run overwrote the same file).
        train_prob = pd.DataFrame(oof_train)
        train_prob.columns = ['class1']
        train_prob.to_csv("oof/train_prob_%s.csv"%loss, index=False)

        test_prob = pd.DataFrame(oof_test)
        test_prob.columns = ['class1']
        test_prob.to_csv("oof/test_prob_%s.csv"%loss, index=False)

    return loss


# Sweep the 51 lowest gain-score thresholds: for each one, drop every feature
# scoring at or below it, cross-validate on the rest, and record the loss.
test_dict = {}
for idx, i in enumerate(thr_list[:51]):
    print(i)
    # Sorted so the printed list and the dictionary key are the same on every
    # run (iteration order of a set of strings varies between processes).
    bed_fea = sorted(set(scores_df.loc[scores_df['gain_score'] <= i, 'feature'].tolist()))
    print(bed_fea)
    del_col = ['card_id', 'first_active_month', 'target', 'outliers'] + bed_fea
    excluded = set(del_col)  # constant-time membership checks
    df_test_fea = [c for c in features if c not in excluded]
    test_dict[''.join(bed_fea)] = test_feature(df_test_fea)
