In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [8]:
campaign  = pd.read_csv('input/campaign_data.csv')
'''
vectorizer = CountVectorizer(ngram_range=(1,3))
n_grams = vectorizer.fit_transform(campaign.subject)
tsvd = TruncatedSVD(2,n_iter=250)
tsvd_subject_feats = tsvd.fit_transform(n_grams)
campaign['email_body'] = campaign.email_body.apply(lambda x: x.replace("\r\n",""))
vectorizer = CountVectorizer(ngram_range=(1,4))
n_grams = vectorizer.fit_transform(campaign.email_body)
tsvd = TruncatedSVD(4,n_iter=250)
tsvd_email_body_feats = tsvd.fit_transform(n_grams)
for i in range(tsvd_subject_feats.shape[1]):
    campaign.loc[:,'sub_'+str(i)] = tsvd_subject_feats[:,i]
for i in range(tsvd_email_body_feats.shape[1]):
    campaign.loc[:,'eb_'+str(i)] = tsvd_email_body_feats[:,i]
'''
campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)
gc.collect()

1961

In [9]:
np.random.seed(123)

In [10]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))
def target_encode(trn_series=None,val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    #assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series

In [11]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
gc.collect()

0

In [12]:
all_data = pd.concat([train,test])

In [13]:
user_clust = pd.read_csv('./input/user_cluster1.csv')
all_data = all_data.merge(user_clust,on='user_id',how='left')

In [14]:
all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))

In [15]:
all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek

In [16]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek'],
      dtype='object')

In [17]:
#count features
all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')

In [18]:
all_data = all_data.merge(campaign1,on='campaign_id',how='left')

In [19]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',
       'total_links', 'no_of_internal_links', 'no_of_images',
       'no_of_sections'],
      dtype='object')

In [20]:
le1 = LabelEncoder()
all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type)   
all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')
all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))
#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')

In [21]:
test = all_data[len(train):]
train = all_data[:len(train)]

In [22]:
#del all_data
gc.collect()

35

In [23]:
lgb_params = {}
lgb_params['learning_rate'] = 0.01
lgb_params['num_leaves'] = 31
lgb_params['max_depth'] = 5
lgb_params['max_bin'] = 10
lgb_params['min_data_in_leaf'] = 50
lgb_params['subsample'] = 0.6
lgb_params['colsample_bytree'] = 0.7
lgb_params['feature_fraction'] = 0.7,
lgb_params['bagging_fraction'] = 0.77,
lgb_params['objective'] = 'binary'
lgb_params['metric'] = {'auc'}
lgb_params['verbose'] = 1
lgb_params['scale_pos_weight'] = 1.
lgb_params['boosting_type'] = 'gbdt'
lgb_params['min_split_gain'] = 0.0001
#lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 100000


In [26]:
nfold =5
kf = KFold(n_splits=nfold,random_state=123,shuffle=False)
unq_campaign_id = np.sort(train.campaign_id.unique())

test_subm = test[['id']]
test_subm['is_click'] = 0
train_score = train[['is_click']]
train_score['pred'] = 0
nbag = 5
cf =0
for train_index, test_index in kf.split(unq_campaign_id):
    cf+=1
    print('Fold:',cf)
    
    test1 = test.copy()
    tr_cid = unq_campaign_id[train_index]
    val_cid = unq_campaign_id[test_index]
    print('val_cid',val_cid)

    val = train[train.campaign_id.isin(val_cid)]
    train1 = train[train.campaign_id.isin(tr_cid)]
    print(val.shape,train1.shape)

    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_click,noise_level=.9,smoothing=5)
    train1.loc[:,'mean_is_click'] = a1
    val.loc[:,'mean_is_click'] = a2
    test1.loc[:,'mean_is_click'] = a3


    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_open,noise_level=.9,smoothing=1.)
    train1.loc[:,'mean_is_open'] = a1
    val.loc[:,'mean_is_open'] = a2
    test1.loc[:,'mean_is_open'] = a3


    a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
                             test1['communication_type'],train1.is_open,noise_level=0)
    train1.loc[:,'mean_ct'] = a1
    val.loc[:,'mean_ct'] = a2
    test1.loc[:,'mean_ct'] = a3

    a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
                             test1['communication_type'],train1.is_click,noise_level=0)
    train1.loc[:,'mean_clk_ct'] = a1
    val.loc[:,'mean_clk_ct'] = a2
    test1.loc[:,'mean_clk_ct'] = a3


    a1,a2,a3 = target_encode(train1['clust_id'],val['clust_id'],
                             test1['clust_id'],train1.is_click,noise_level=0)
    train1.loc[:,'mean_clk_clust_id'] = a1
    val.loc[:,'mean_clk_clust_id'] = a2
    test1.loc[:,'mean_clk_clust_id'] = a3



    gc.collect()
    val.drop(['id','campaign_id','is_open','send_date',
              'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    train1.drop(['id','campaign_id','is_open','send_date',
                 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    test1.drop(['id','campaign_id','is_open','send_date',
               'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    gc.collect()
    train_y = train1.is_click.values
    val_y = val.is_click.values
    val.drop(['is_click'],axis=1,inplace=True)
    train1.drop(['is_click'],axis=1,inplace=True)
    test1.drop(['is_click'],axis=1,inplace=True)
    
    lgtrain = lgb.Dataset(train1, label=train_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],
                      free_raw_data=False)
    lgvalid = lgb.Dataset(val, label=val_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],
                     free_raw_data=False)
    gc.collect()
    
    evals_results = {}
    np.random.seed(0)
    
    test_subm['is_click'+str(cf)]=0
    
    for bg in range(nbag):
        lgb_params['feature_fraction_seed'] = 100*cf + bg
        bst1 = lgb.train(lgb_params, 
                     lgtrain, 
                     valid_sets=[lgtrain, lgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=55,
                     early_stopping_rounds=1000,
                     verbose_eval=10)
        train_score.loc[val.index,'pred'] += bst1.predict(val[train1.columns],num_iteration=51)
        test_subm['is_click'+str(cf)] += bst1.predict(test1[train1.columns],num_iteration=51)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Fold: 1
val_cid [29 30 31 32 33 34]
(331628, 16) (691563, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.959499	valid's auc: 0.64165
[20]	train's auc: 0.960904	valid's auc: 0.682885
[30]	train's auc: 0.961472	valid's auc: 0.685754
[40]	train's auc: 0.961442	valid's auc: 0.685165
[50]	train's auc: 0.962194	valid's auc: 0.683764
Did not meet early stopping. Best iteration is:
[55]	train's auc: 0.96278	valid's auc: 0.686568


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.959771	valid's auc: 0.681249
[20]	train's auc: 0.958168	valid's auc: 0.688639
[30]	train's auc: 0.958047	valid's auc: 0.688835
[40]	train's auc: 0.960492	valid's auc: 0.688832
[50]	train's auc: 0.961439	valid's auc: 0.688582
Did not meet early stopping. Best iteration is:
[46]	train's auc: 0.961925	valid's auc: 0.688432


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.962265	valid's auc: 0.679662
[20]	train's auc: 0.963529	valid's auc: 0.682266
[30]	train's auc: 0.963445	valid's auc: 0.685344
[40]	train's auc: 0.963438	valid's auc: 0.690044
[50]	train's auc: 0.964154	valid's auc: 0.689997
Did not meet early stopping. Best iteration is:
[55]	train's auc: 0.964456	valid's auc: 0.687732
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.962724	valid's auc: 0.684908
[20]	train's auc: 0.961615	valid's auc: 0.685348
[30]	train's auc: 0.962596	valid's auc: 0.68143
[40]	train's auc: 0.962821	valid's auc: 0.682635
[50]	train's auc: 0.963031	valid's auc: 0.683734
Did not meet early stopping. Best iteration is:
[51]	train's auc: 0.963166	valid's auc: 0.683732
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.956924	valid's auc: 0.684525
[20]	train's auc: 0.958931	valid's auc: 0.687749
[30]	train's auc: 0.961068	

Did not meet early stopping. Best iteration is:
[17]	train's auc: 0.961457	valid's auc: 0.654111
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.95697	valid's auc: 0.65449
[20]	train's auc: 0.960161	valid's auc: 0.655808
[30]	train's auc: 0.960082	valid's auc: 0.655498
[40]	train's auc: 0.960535	valid's auc: 0.655341
[50]	train's auc: 0.960987	valid's auc: 0.654711
Did not meet early stopping. Best iteration is:
[55]	train's auc: 0.961063	valid's auc: 0.654381
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.958435	valid's auc: 0.650125
[20]	train's auc: 0.960372	valid's auc: 0.650232
[30]	train's auc: 0.961111	valid's auc: 0.652803
[40]	train's auc: 0.960978	valid's auc: 0.654544
[50]	train's auc: 0.961009	valid's auc: 0.65462
Did not meet early stopping. Best iteration is:
[45]	train's auc: 0.961195	valid's auc: 0.655012
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.960162	val

In [27]:
from sklearn.metrics import roc_auc_score

In [28]:
train_score['pred']/=nbag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
roc_auc_score(train_score.is_click,train_score.pred)

0.6198733764612789

In [30]:
test_subm.loc[:,'is_click'] = (test_subm['is_click1'].rank()/test_subm.shape[0] +\
test_subm['is_click2'].rank()/test_subm.shape[0] + test_subm['is_click3'].rank()/test_subm.shape[0])/nfold

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [32]:
test_subm[['id','is_click']].to_csv('./lgb_5fold-5_bag_nt55_rank_average.csv',index=False)