In [1]:
import os
import numpy as np
import pandas as p
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [2]:
#directory path and load functions file
path = os.getcwd() + '/'
path_t = path + 'source_tables/'
path_b = path_t + 'built/'
functions = path + 'functions.py'
%run $functions

In [3]:
#set size of fraction to take from train and test, for full tables, fraction = 0
fraction = 0.2

#delete tables after merge with train/test, 1 to delete 0 to keep
delete = 1

In [4]:
#load tables
train = p.read_csv(path_b + 'train.csv', dtype={'display_id':int, 'ad_id':int, 'clicked':bool})
test = p.read_csv(path_b + 'test.csv', dtype={'display_id':int, 'ad_id':int, 'clicked':bool})
events = p.read_csv(path_b + 'events_prep.csv', dtype={'display_id':int, 'document_id':int, 'plat_1':int, 'plat_2':int, 'plat_3':int})
promoted = p.read_csv(path_b + 'promoted_content_prep.csv', dtype = {'ad_id':int, 'document_id':int, 'campaign_id':int, 'advertiser_id':int})
topics_categories = p.read_csv(path_b + 'topics_categories.csv', dtype={'document_id':int, 'topic_id':int, 'confi_top':float, 'category_id':int, 'confi_cat':float})

In [5]:
if fraction:
    train, test = fractioned(train,test,fraction)
    print train.shape
    print test.shape

(2621118, 3)
(865401, 3)


In [6]:
print train.head()

   display_id   ad_id clicked
0     1976931  110242   False
1     1976931  130952    True
2     1976931  144088   False
3     1976931  290431   False
4      149301   93787   False


In [7]:
#add document_id and platform of display_id (as one hot)
train = train.merge(events, how='left', on='display_id')
test = test.merge(events, how='left', on='display_id')
if delete:
    del events

In [8]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3
0     1976931  110242   False       380201       0       1       0
1     1976931  130952    True       380201       0       1       0
2     1976931  144088   False       380201       0       1       0
3     1976931  290431   False       380201       0       1       0
4      149301   93787   False      1697051       1       0       0


In [9]:
#add categories and topics and document of ads
train = train.merge(promoted,how = 'left',on = 'ad_id')
test = test.merge(promoted,how = 'left',on = 'ad_id')
if delete:
    del promoted

In [10]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1976931  110242   False       380201       0       1       0   
1     1976931  130952    True       380201       0       1       0   
2     1976931  144088   False       380201       0       1       0   
3     1976931  290431   False       380201       0       1       0   
4      149301   93787   False      1697051       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0         1127582        14185           1355       102   0.135783   
1         1286844        16563            131       143   0.108567   
2         1249632        11367           2681       142   0.088109   
3         1332778        15891            571       260   0.189616   
4         1130406         4080             16       129   0.024838   

   category_id  confi_cat  
0         1211   0.920000  
1         1505   0.920000  
2         1211   0.380363  
3         1608   0.808691  
4         1708   0

In [11]:
#add categories and topics of document
train = train.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))
test = test.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))
if delete:
    del topics_categories

In [12]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1976931  110242   False       380201       0       1       0   
1     1976931  130952    True       380201       0       1       0   
2     1976931  144088   False       380201       0       1       0   
3     1976931  290431   False       380201       0       1       0   
4      149301   93787   False      1697051       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1127582        14185           1355          102      0.135783   
1         1286844        16563            131          143      0.108567   
2         1249632        11367           2681          142      0.088109   
3         1332778        15891            571          260      0.189616   
4         1130406         4080             16          129      0.024838   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1211      0.920000         2

In [13]:
#fill missing topics and categories confidences with 0 and ids with -1
train.confi_top_doc = train.confi_top_doc.fillna(0)
train.confi_cat_doc = train.confi_cat_doc.fillna(0)
test.confi_top_doc = test.confi_top_doc.fillna(0)
test.confi_cat_doc = test.confi_cat_doc.fillna(0)
train = train.fillna(-1)
test = test.fillna(-1)

In [14]:
#load dictionaries for correlation of topics and categories
with open(path + 'dicts/dict_topic_0.2_3', 'rb') as handle:
    top_dict = pickle.load(handle)
with open(path + 'dicts/dict_category_0.4_3', 'rb') as handle:
    cat_dict = pickle.load(handle)
train, test = correlations(train, test, top_dict, cat_dict)
if delete:
    del top_dict, cat_dict

In [15]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1976931  110242   False       380201       0       1       0   
1     1976931  130952    True       380201       0       1       0   
2     1976931  144088   False       380201       0       1       0   
3     1976931  290431   False       380201       0       1       0   
4      149301   93787   False      1697051       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1127582        14185           1355          102      0.135783   
1         1286844        16563            131          143      0.108567   
2         1249632        11367           2681          142      0.088109   
3         1332778        15891            571          260      0.189616   
4         1130406         4080             16          129      0.024838   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1211      0.920000         2

In [16]:
#merge train and test with CTR and the time tables
train, test = merge_ctrs_and_time(train, test)

In [18]:
print train.head(10)

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1976931  110242   False       380201       0       1       0   
1     1976931  130952    True       380201       0       1       0   
2     1976931  144088   False       380201       0       1       0   
3     1976931  290431   False       380201       0       1       0   
4      149301   93787   False      1697051       1       0       0   
5      149301  153086   False      1697051       1       0       0   
6      149301  180693   False      1697051       1       0       0   
7      149301  198560   False      1697051       1       0       0   
8      149301  199280   False      1697051       1       0       0   
9      149301  289908    True      1697051       1       0       0   

   ad_document_id  campaign_id  advertiser_id  ...    score_camp  \
0         1127582        14185           1355  ...      1.027231   
1         1286844        16563            131  ...      1.231652   
2         1249632        

In [None]:
#checkpoint of saving to pick the method of filling the ctr NAs 
#and for rerunning faster from ready-made feature tables

#save
train.to_csv(path_b + 'train_current.csv', index=False)
test.to_csv(path_b + 'test_current.csv', index=False)

In [None]:
#load
train = p.read_csv(path_b + 'train_current.csv')
test = p.read_csv(path_b + 'test_current.csv')

In [19]:
#fill NAs with medians
test.score_ad = test.score_ad.fillna(test.score_ad.median())
test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.median())
test.score_adv = test.score_adv.fillna(test.score_adv.median())
test.score_camp = test.score_camp.fillna(test.score_camp.median())
test.score_docXad = test.score_docXad.fillna(test.score_docXad.median())
test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.median())
test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.median())
test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.median())

In [29]:
#fill NAs with mean
test.score_ad = test.score_ad.fillna(test.score_ad.mean())
test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.mean())
test.score_adv = test.score_adv.fillna(test.score_adv.mean())
test.score_camp = test.score_camp.fillna(test.score_camp.mean())
test.score_docXad = test.score_docXad.fillna(test.score_docXad.mean())
test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.mean())
test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.mean())
test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.mean())

In [20]:
predictors=[x for x in train.columns if x not in ['display_id','ad_id','clicked','document_id','ad_document_id','campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc','category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]

In [21]:
train[predictors].head()

Unnamed: 0,plat_1,plat_2,plat_3,cor_top,cor_cat,score_ad,score_ad_doc,score_adv,score_camp,score_docXad,score_docXad_doc,score_docXadv,score_docXcamp,weekend,morning,noon,evening,night
0,0,1,0,0.003222,0.411283,1.019082,1.017267,0.909733,1.027231,0.677593,0.693252,0.742388,0.6992,0,1,0,0,0
1,0,1,0,0.002684,0.416033,1.343629,1.287471,1.215249,1.231652,0.985286,1.000944,1.05008,1.006893,0,1,0,0,0
2,0,1,0,0.002023,0.17004,0.492982,0.516392,0.524011,0.510836,0.677593,0.693252,0.742388,0.6992,0,1,0,0,0
3,0,1,0,0.004633,0.368866,0.328051,0.328201,0.277804,0.291471,0.629194,0.643734,0.68936,0.649258,0,1,0,0,0
4,1,0,0,0.003728,0.394798,1.892163,1.89296,1.56671,2.320623,1.01639,1.039878,1.113582,1.048801,0,0,0,1,0


In [None]:
#multi_class, fit_intercept, intercept_scaling, dual, random_state, solver, verbose, pentaly, class weights
alg = LogisticRegression(verbose=0, n_jobs=4)
log_params = {'C' : [10 ** i for i in range(-12,2)], 'solver' : ['lbfgs', 'sag'], 'class_weight' : ['none', 'balanced']}
grid_log = GridSearchCV(alg, log_params, n_jobs=4, verbose=0)
grid_log.fit(train[predictors], train['clicked'])

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [188]:
#[i -> score_ad, j -> score_ad_doc, k->score_adv, l->score_camp, t->score_docXad,n->score_docXad_doc,m->score_docXadv,p->score_docXcamp]
i_l = ['','score_ad']
j_l = ['','score_ad_doc']
k_l = ['','score_adv']
l_l = ['','score_camp']
t_l = ['','score_docXad']
n_l = ['','score_docXad_doc']
m_l = ['','score_docXadv']
p_l = ['','score_docXcamp']
results = [0] * 256
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                for t in range(2):
                    for n in range(2):
                        for m in range(2):
                            for p in range(2):
                                predictors=[x for x in train.columns if x not in ['display_id','ad_id','clicked','document_id','ad_document_id','campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc','category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]
                                to_reduce = i_l[i] + j_l[j] + k_l[k] + l_l[l] + t_l[t] + n_l[n] + m_l[m] + p_l[p]
                                predictors = [x for x in predictors if x not in to_reduce]
                                alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
                                alg.fit(train[predictors], train['clicked'])
                                predY = list(alg.predict_proba(test[predictors]).astype(float)[:,1])
                                predict = np.asarray(predY)
                                test_copy = test.copy()
                                test_copy['predict'] = predict
                                results[i + j * 2 + k * 4 + l * 8 + t * 16 + n * 32 + m * 64 + p * 128] = score_map(test_copy)

MAP: 0.649737819877
MAP: 0.650101485384
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.649984732469
MAP: 0.649957822527
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.648364339188
MAP: 0.649261721798
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.649194778412
MAP: 0.649657418206
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.646508970320
MAP: 0.648178032597
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.648075269005
MAP: 0.649584920950
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.639095776162
MAP: 0.642051811385


In [22]:
alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs', class_weight='balanced')
alg.fit(train[predictors], train['clicked'])

LogisticRegression(C=1e-10, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [23]:
predY = list(alg.predict_proba(test[predictors]).astype(float)[:,1])
predict = np.asarray(predY)
test_copy = test.copy()
test_copy['predict'] = predict
map_score = score_map(test_copy)
portion_score = score_portion(test_copy)

MAP: 0.650310049357
PORTION: 0.445561804078


In [None]:
print portion_score
print map_score
print values
0.649734852141 -> no docXad 0.649983074581

0.650101485384 -> no score_docXcamp
0.650121732777 -> no score_docXcamp, no platform
0.650124145533 -> no score_docXcamp, no platform, no cor_top
0.650107933508 -> no score_docXcamp, no platform, no cor_top, no cor_cat
0.650126784043 -> no score_docXcamp, no platform, no cor_top, no cor_cat, no times


0.659890146771 c = 0.0000000001 solver = lbfgs, no corellations