In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [3]:
campaign  = pd.read_csv('input/campaign_data.csv')
campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)
gc.collect()

0

In [4]:
np.random.seed(123)

In [5]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))
def target_encode(trn_series=None,val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    #assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series

In [6]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
gc.collect()

0

In [7]:
all_data = pd.concat([train,test])

In [8]:
user_clust = pd.read_csv('./input/user_cluster1.csv')
all_data = all_data.merge(user_clust,on='user_id',how='left')

In [10]:
all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))

In [11]:
all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek

In [12]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek'],
      dtype='object')

In [13]:
#count features
all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')

In [14]:
all_data = all_data.merge(campaign1,on='campaign_id',how='left')

In [15]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',
       'total_links', 'no_of_internal_links', 'no_of_images',
       'no_of_sections'],
      dtype='object')

In [16]:
le1 = LabelEncoder()
all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type)   
all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')
all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))
#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')

In [17]:
test = all_data[len(train):]
train = all_data[:len(train)]

In [18]:
#del all_data
gc.collect()

35

In [73]:
xgb_params = {}
xgb_params['eta'] = 0.07
xgb_params['max_depth'] = 5
xgb_params['max_leaves'] = 31
xgb_params['max_bin'] = 10
xgb_params['min_child_weight '] = 100
xgb_params['subsample'] = 0.6
xgb_params['colsample_bytree'] = 0.77
xgb_params['objective'] = 'binary:logistic'
xgb_params['eval_metric'] = 'auc'
xgb_params['verbose'] = 1
xgb_params['scale_pos_weight'] = 1.

xgb_params['max_bin']=10
xgb_params['max_delta_step']=1
xgb_params['nthread']=7
xgb_params['booster']='gbtree'

In [91]:
nfold =2
kf = KFold(n_splits=nfold,random_state=123,shuffle=False)
unq_campaign_id = np.sort(train.campaign_id.unique())

test_subm = test[['id']]
test_subm['is_click'] = 0
train_score = train[['is_click']]
train_score['pred'] = 0
nbag = 3
cf =0
for train_index, test_index in kf.split(unq_campaign_id):
    cf+=1
    print('Fold:',cf)
    
    test1 = test.copy()
    tr_cid = unq_campaign_id[train_index]
    val_cid = unq_campaign_id[test_index]
    print('val_cid',val_cid)

    val = train[train.campaign_id.isin(tr_cid)]
    train1 = train[train.campaign_id.isin(val_cid)]
    print(val.shape,train1.shape)

    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_click,noise_level=.9,smoothing=5)
    train1.loc[:,'mean_is_click'] = a1
    val.loc[:,'mean_is_click'] = a2
    test1.loc[:,'mean_is_click'] = a3


    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_open,noise_level=.9,smoothing=1.)
    train1.loc[:,'mean_is_open'] = a1
    val.loc[:,'mean_is_open'] = a2
    test1.loc[:,'mean_is_open'] = a3


    a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
                             test1['communication_type'],train1.is_open,noise_level=0)
    train1.loc[:,'mean_ct'] = a1
    val.loc[:,'mean_ct'] = a2
    test1.loc[:,'mean_ct'] = a3

    #a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
    #                         test1['communication_type'],train1.is_click,noise_level=0)
    #train1.loc[:,'mean_clk_ct'] = a1
    #val.loc[:,'mean_clk_ct'] = a2
    #test1.loc[:,'mean_clk_ct'] = a3


    a1,a2,a3 = target_encode(train1['clust_id'],val['clust_id'],
                             test1['clust_id'],train1.is_click,noise_level=0)
    train1.loc[:,'mean_clk_clust_id'] = a1
    val.loc[:,'mean_clk_clust_id'] = a2
    test1.loc[:,'mean_clk_clust_id'] = a3



    gc.collect()
    val.drop(['id','campaign_id','is_open','send_date',
              'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    train1.drop(['id','campaign_id','is_open','send_date',
                 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    test1.drop(['id','campaign_id','is_open','send_date',
               'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    gc.collect()
    train_y = train1.is_click.values
    val_y = val.is_click.values
    val.drop(['is_click'],axis=1,inplace=True)
    train1.drop(['is_click'],axis=1,inplace=True)
    test1.drop(['is_click'],axis=1,inplace=True)
    
    dtrain = xgb.DMatrix(train1,label=train_y)
    dval = xgb.DMatrix(val[train1.columns],label=val_y)
    dtest =  xgb.DMatrix(test1[train1.columns])
    gc.collect()
    
    evals_results = {}
    np.random.seed(0)
    
    for bg in range(nbag):
        xgb_params['seed'] = 100*cf + bg
        watchlist = [(dval, 'eval'), (dtrain, 'train')]

        bst = xgb.train(xgb_params, dtrain, 70, watchlist,early_stopping_rounds=150,
                        verbose_eval=10,maximize=True)
    
        train_score.loc[val.index,'pred'] += bst.predict(dval)
        test_subm['is_click'] += bst.predict(dtest)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Fold: 1
val_cid [29 30 31 32 33 34 35 36 37 38 39 40 41]
(588141, 16) (435050, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame


[0]	eval-auc:0.507239	train-auc:0.630868
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 150 rounds.
[10]	eval-auc:0.549008	train-auc:0.941565
[20]	eval-auc:0.54935	train-auc:0.970986
[30]	eval-auc:0.621005	train-auc:0.985933
[40]	eval-auc:0.623506	train-auc:0.986568
[50]	eval-auc:0.63699	train-auc:0.987577
[60]	eval-auc:0.643214	train-auc:0.988096
[69]	eval-auc:0.640067	train-auc:0.98841


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[0]	eval-auc:0.513118	train-auc:0.699374
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 150 rounds.
[10]	eval-auc:0.549338	train-auc:0.963648
[20]	eval-auc:0.549279	train-auc:0.972213
[30]	eval-auc:0.585382	train-auc:0.981735
[40]	eval-auc:0.628757	train-auc:0.986893
[50]	eval-auc:0.648687	train-auc:0.987708
[60]	eval-auc:0.644673	train-auc:0.988209
[69]	eval-auc:0.643383	train-auc:0.988605


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[0]	eval-auc:0.508098	train-auc:0.654833
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 150 rounds.
[10]	eval-auc:0.549292	train-auc:0.96816
[20]	eval-auc:0.549248	train-auc:0.971653
[30]	eval-auc:0.556878	train-auc:0.977378
[40]	eval-auc:0.630023	train-auc:0.986505
[50]	eval-auc:0.637828	train-auc:0.986918
[60]	eval-auc:0.647163	train-auc:0.988098
[69]	eval-auc:0.644905	train-auc:0.98856
Fold: 2
val_cid [42 43 44 45 46 47 48 49 50 51 52 53 54]
(435050, 16) (588141, 16)
[0]	eval-auc:0.521143	train-auc:0.660784
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 150 rounds.
[10]	eval-auc:0.574698	train-auc:0.959211
[20]	eval-auc:0.582151	train-auc:0.966154
[30]	eval-auc:0.64668	train-auc:0.981158
[40]	eval-auc:0.646689	train-auc:0.982213
[50]	eval-auc:0.646649	train-auc:0.982668
[60]	eval-auc:0.669827	train-auc:0.98389
[69]	

In [93]:
from sklearn.metrics import roc_auc_score

In [94]:
train_score['pred']/=3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [95]:
train_score.head(5),train_score.tail(5)

(   is_click      pred
 0       0.0  0.007969
 1       0.0  0.010713
 2       0.0  0.008167
 3       0.0  0.008279
 4       0.0  0.007871,          is_click      pred
 1023186       0.0  0.008056
 1023187       0.0  0.009954
 1023188       1.0  0.007931
 1023189       0.0  0.009988
 1023190       0.0  0.007871)

In [96]:
roc_auc_score(train_score.is_click,train_score.pred)

0.6603355706439776

In [97]:
test_subm['is_click'] /= nfold*nbag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [98]:
test_subm.to_csv('./xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv',index=False)