In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [8]:
campaign  = pd.read_csv('input/campaign_data.csv')
campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)
gc.collect()

1961

In [9]:
np.random.seed(123)

In [10]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))
def target_encode(trn_series=None,val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    #assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series

In [11]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
gc.collect()

0

In [12]:
all_data = pd.concat([train,test])

In [13]:
user_clust = pd.read_csv('./input/user_cluster1.csv')
all_data = all_data.merge(user_clust,on='user_id',how='left')

In [14]:
all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))

In [15]:
all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek

In [16]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek'],
      dtype='object')

In [17]:
#count features
all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')

In [18]:
all_data = all_data.merge(campaign1,on='campaign_id',how='left')

In [19]:
all_data.columns

Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',
       'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',
       'total_links', 'no_of_internal_links', 'no_of_images',
       'no_of_sections'],
      dtype='object')

In [20]:
le1 = LabelEncoder()
all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type)   
all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')
all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))
#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')

In [21]:
test = all_data[len(train):]
train = all_data[:len(train)]

In [22]:
#del all_data
gc.collect()

35

In [24]:
lgb_params = {}
lgb_params['learning_rate'] = 0.01
lgb_params['num_leaves'] = 31
lgb_params['max_depth'] = 5
lgb_params['max_bin'] = 10
lgb_params['min_data_in_leaf'] = 50
lgb_params['subsample'] = 0.6
lgb_params['colsample_bytree'] = 0.7
lgb_params['feature_fraction'] = 0.77,
lgb_params['bagging_fraction'] = 0.77,
lgb_params['objective'] = 'binary'
lgb_params['metric'] = {'auc'}
lgb_params['verbose'] = 1
lgb_params['scale_pos_weight'] = 1.
lgb_params['boosting_type'] = 'gbdt'
lgb_params['min_split_gain'] = 0.0001
#lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 100000


In [26]:
nfold =5
kf = KFold(n_splits=nfold,random_state=123,shuffle=False)
unq_campaign_id = np.sort(train.campaign_id.unique())

test_subm = test[['id']]
test_subm['is_click'] = 0
train_score = train[['is_click']]
train_score['pred'] = 0
nbag = 7
cf =0
for train_index, test_index in kf.split(unq_campaign_id):
    cf+=1
    print('Fold:',cf)
    
    test1 = test.copy()
    tr_cid = unq_campaign_id[train_index]
    val_cid = unq_campaign_id[test_index]
    print('val_cid',val_cid)

    val = train[train.campaign_id.isin(val_cid)]
    train1 = train[train.campaign_id.isin(tr_cid)]
    print(val.shape,train1.shape)

    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_click,noise_level=.9,smoothing=5)
    train1.loc[:,'mean_is_click'] = a1
    val.loc[:,'mean_is_click'] = a2
    test1.loc[:,'mean_is_click'] = a3


    a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],
                             test1['user_id'],train1.is_open,noise_level=.9,smoothing=1.2)
    train1.loc[:,'mean_is_open'] = a1
    val.loc[:,'mean_is_open'] = a2
    test1.loc[:,'mean_is_open'] = a3


    a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
                             test1['communication_type'],train1.is_open,noise_level=0)
    train1.loc[:,'mean_ct'] = a1
    val.loc[:,'mean_ct'] = a2
    test1.loc[:,'mean_ct'] = a3

    a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],
                             test1['communication_type'],train1.is_click,noise_level=0)
    train1.loc[:,'mean_clk_ct'] = a1
    val.loc[:,'mean_clk_ct'] = a2
    test1.loc[:,'mean_clk_ct'] = a3


    a1,a2,a3 = target_encode(train1['clust_id'],val['clust_id'],
                             test1['clust_id'],train1.is_click,noise_level=0)
    train1.loc[:,'mean_clk_clust_id'] = a1
    val.loc[:,'mean_clk_clust_id'] = a2
    test1.loc[:,'mean_clk_clust_id'] = a3



    gc.collect()
    val.drop(['id','campaign_id','is_open','send_date',
              'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    train1.drop(['id','campaign_id','is_open','send_date',
                 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    test1.drop(['id','campaign_id','is_open','send_date',
               'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)
    gc.collect()
    train_y = train1.is_click.values
    val_y = val.is_click.values
    val.drop(['is_click'],axis=1,inplace=True)
    train1.drop(['is_click'],axis=1,inplace=True)
    test1.drop(['is_click'],axis=1,inplace=True)
    
    lgtrain = lgb.Dataset(train1, label=train_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],
                      free_raw_data=False)
    lgvalid = lgb.Dataset(val, label=val_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],
                     free_raw_data=False)
    gc.collect()
    
    evals_results = {}
    np.random.seed(0)
    
    test_subm['is_click'+str(cf)]=0
    
    for bg in range(nbag):
        lgb_params['feature_fraction_seed'] = 100*cf + bg
        bst1 = lgb.train(lgb_params, 
                     lgtrain, 
                     valid_sets=[lgtrain, lgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=55,
                     early_stopping_rounds=1000,
                     verbose_eval=10)
        train_score.loc[val.index,'pred'] += bst1.predict(val[train1.columns],num_iteration=45)
        test_subm['is_click'+str(cf)] += bst1.predict(test1[train1.columns],num_iteration=45)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Fold: 1
val_cid [29 30 31 32 33 34]
(331628, 16) (691563, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.962805	valid's auc: 0.674618
[20]	train's auc: 0.963193	valid's auc: 0.68505
[30]	train's auc: 0.963489	valid's auc: 0.68306
[40]	train's auc: 0.96343	valid's auc: 0.685816
[50]	train's auc: 0.963585	valid's auc: 0.68531
Did not meet early stopping. Best iteration is:
[15]	train's auc: 0.96401	valid's auc: 0.687131


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.958735	valid's auc: 0.67823
[20]	train's auc: 0.956856	valid's auc: 0.678421
[30]	train's auc: 0.958005	valid's auc: 0.674503
[40]	train's auc: 0.959854	valid's auc: 0.681749
[50]	train's auc: 0.960658	valid's auc: 0.681605
Did not meet early stopping. Best iteration is:
[46]	train's auc: 0.961148	valid's auc: 0.681517


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.962397	valid's auc: 0.681139
[20]	train's auc: 0.963666	valid's auc: 0.681113
[30]	train's auc: 0.96332	valid's auc: 0.689459
[40]	train's auc: 0.963309	valid's auc: 0.689208
[50]	train's auc: 0.964175	valid's auc: 0.689181
Did not meet early stopping. Best iteration is:
[55]	train's auc: 0.964218	valid's auc: 0.689003
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.961895	valid's auc: 0.679073
[20]	train's auc: 0.962721	valid's auc: 0.680552
[30]	train's auc: 0.963843	valid's auc: 0.684549
[40]	train's auc: 0.963979	valid's auc: 0.684159
[50]	train's auc: 0.964223	valid's auc: 0.685265
Did not meet early stopping. Best iteration is:
[48]	train's auc: 0.964333	valid's auc: 0.684591
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.961093	valid's auc: 0.688025
[20]	train's auc: 0.960536	valid's auc: 0.688727
[30]	train's auc: 0.961975	

Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.957479	valid's auc: 0.689646
[20]	train's auc: 0.959953	valid's auc: 0.71164
[30]	train's auc: 0.960348	valid's auc: 0.71069
[40]	train's auc: 0.960473	valid's auc: 0.710728
[50]	train's auc: 0.960472	valid's auc: 0.71061
Did not meet early stopping. Best iteration is:
[55]	train's auc: 0.960592	valid's auc: 0.710661
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.958059	valid's auc: 0.713996
[20]	train's auc: 0.959824	valid's auc: 0.710924
[30]	train's auc: 0.959983	valid's auc: 0.710757
[40]	train's auc: 0.960269	valid's auc: 0.710889
[50]	train's auc: 0.960259	valid's auc: 0.711001
Did not meet early stopping. Best iteration is:
[49]	train's auc: 0.960388	valid's auc: 0.711046
Training until validation scores don't improve for 1000 rounds.
[10]	train's auc: 0.957215	valid's auc: 0.693773
[20]	train's auc: 0.959706	valid's auc: 0.699633
[30]	train's auc: 0.959715	va

In [44]:
from sklearn.metrics import roc_auc_score

In [45]:
train_score['pred']/=nbag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [46]:
roc_auc_score(train_score.is_click,train_score.pred)

0.6315854523976097

In [2]:
test_subm.loc[:,'is_click'] = (test_subm['is_click1'].rank()/test_subm.shape[0] +\
test_subm['is_click2'].rank()/test_subm.shape[0] + test_subm['is_click3'].rank()/test_subm.shape[0]+\
test_subm['is_click4'].rank()/test_subm.shape[0] + test_subm['is_click5'].rank()/test_subm.shape[0])/nfold

In [None]:
test_subm[['id','is_click']].to_csv('./lgb_5fold-5_bag_nt45_rank_average.csv',index=False)

In [1]:
#Removing prediction from last fold
test_subm.loc[:,'is_click'] = (test_subm['is_click2'].rank()/test_subm.shape[0] + test_subm['is_click3'].rank()/test_subm.shape[0]+\
test_subm['is_click4'].rank()/test_subm.shape[0])/(nfold-1


In [37]:
test_subm[['id','is_click']].to_csv('./lgb_5fold-5_bag_nt45_rank_average_4f.csv',index=False)