In [1]:
import os
import numpy as np
import pandas as p
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [2]:
#directory path and load functions file
path = os.getcwd() + "/"
path_t = path + "source_tables/"
path_b = path_t + "built/"
functions = path + "functions.py"
%run $functions

In [3]:
#set size of fraction to take from train and test, for full tables, fraction = 0
fraction = 0.2

In [9]:
#load tables
train = p.read_csv(path_b + "train.csv", dtype={"display_id":int, "ad_id":int, "clicked":bool})
test = p.read_csv(path_b + "test.csv", dtype={"display_id":int, "ad_id":int, "clicked":bool})
events = p.read_csv(path_b + "events_prep.csv", dtype={"display_id":int, "document_id":int, "plat_1":int, "plat_2":int, "plat_3":int})
promoted = p.read_csv(path_b + "promoted_content_prep.csv", dtype = {"ad_id":int, "document_id":int, "campaign_id":int, "advertiser_id":int})
topics_categories = p.read_csv(path_b + "topics_categories.csv", dtype={"document_id":int, "topic_id":int, "confi_top":float, "category_id":int, "confi_cat":float})

In [None]:
if fraction:
    train, test = fractioned(train,test,fraction)
    print train.shape
    print test.shape

In [None]:
print train.head()

In [None]:
#add document_id and platform of display_id (as one hot)
train = train.merge(events, how='left', on='display_id')
test = test.merge(events, how='left', on='display_id')
del events

In [None]:
print train.head()

In [None]:
#add categories and topics and document of ads
train = train.merge(promoted,how = 'left',on = 'ad_id')
test = test.merge(promoted,how = 'left',on = 'ad_id')

In [None]:
print train.head()

In [91]:
#add categories and topics of document
train = train.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))
test = test.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))

In [92]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      844445  153658   False       140264       1       0       0   
1      844445  219729    True       140264       1       0       0   
2     1831819   29281   False      1095542       0       1       0   
3     1831819   59605   False      1095542       0       1       0   
4     1831819   64255   False      1095542       0       1       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0          931586        19569           1650          227      0.079687   
1         1553984        24654           2623           16      0.058925   
2          690073         4488             19          296      0.280217   
3          927646         7978           1521          226      0.346708   
4          747047         8568           1077            1      0.349048   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1403      0.444356          

In [98]:
#fill missing topics and categories confidences with 0 and ids with -1
train.confi_top_doc = train.confi_top_doc.fillna(0)
train.confi_cat_doc = train.confi_cat_doc.fillna(0)
test.confi_top_doc = test.confi_top_doc.fillna(0)
test.confi_cat_doc = test.confi_cat_doc.fillna(0)
train = train.fillna(-1)
test = test.fillna(-1)

In [146]:
#load dictionaries for correlation of topics and categories
with open(path + 'dicts/dict_topic_0.2_3', 'rb') as handle:
    top_dict = pickle.load(handle)
with open(path + 'dicts/dict_category_0.4_3', 'rb') as handle:
    cat_dict = pickle.load(handle)
train, test = correlations(train, test, top_dict, cat_dict)

In [144]:
print train.head()

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      844445  153658   False       140264       1       0       0   
1      844445  219729    True       140264       1       0       0   
2     1831819   29281   False      1095542       0       1       0   
3     1831819   59605   False      1095542       0       1       0   
4     1831819   64255   False      1095542       0       1       0   

   ad_document_id  campaign_id  advertiser_id    ...     score_docXad_doc  \
0          931586        19569           1650    ...             0.422627   
1         1553984        24654           2623    ...             1.089025   
2          690073         4488             19    ...             1.339943   
3          927646         7978           1521    ...             1.215162   
4          747047         8568           1077    ...             1.275921   

   score_docXadv  score_docXcamp  weekend  morning  noon  evening  night  \
0       0.268833        0.423664        

In [99]:
#load all CTR tables and the time table
ad_ctr = p.read_csv(path_b + "ad_ctr.csv", dtype={"ad_id":int, "score":float})
ad_document_ctr = p.read_csv(path_b + "ad_document_ctr.csv", dtype={"ad_document_id":int, "score":float})
advertiser_ctr = p.read_csv(path_b + "advertiser_ctr.csv", dtype={"advertiser_id":int, "score":float})
campaign_ctr = p.read_csv(path_b + "campaign_ctr.csv", dtype={"campaign_id":int, "score":float})
document_on_ad_ctr = p.read_csv(path_b + "document_on_ad_ctr.csv", dtype={"document_id":int, "ad_id":int, "score":float})
document_on_ad_document_ctr = p.read_csv(path_b + "document_on_ad_document_ctr.csv", dtype={"document_id":int, "ad_document_id":int, "score":float})
document_on_advertiser_ctr = p.read_csv(path_b + "document_on_advertiser_ctr.csv", dtype={"document_id":int, "advertiser_id":int, "score":float})
document_on_campaign_ctr = p.read_csv(path_b + "document_on_campaign_ctr.csv", dtype={"document_id":int, "campaign_id":int, "score":float})
time_table = p.read_csv(path_b + "time_table.csv", dtype={"display_id":int, "weekend":int, "morning":int, "noon":int, "evening":int, "night":int})

In [100]:
#merge with train and test
train = train.merge(ad_ctr, how = 'left', on = 'ad_id')
test = test.merge(ad_ctr, how = 'left', on = 'ad_id')

train = train.merge(ad_document_ctr, how = 'left', on = 'ad_document_id')
test = test.merge(ad_document_ctr, how = 'left', on = 'ad_document_id')

train = train.merge(advertiser_ctr, how = 'left', on = 'advertiser_id')
test = test.merge(advertiser_ctr, how = 'left', on = 'advertiser_id')

train = train.merge(campaign_ctr, how = 'left', on = 'campaign_id')
test = test.merge(campaign_ctr, how = 'left', on = 'campaign_id')

train = train.merge(document_on_ad_ctr, how = 'left', on = ["document_id", "ad_id"])
test = test.merge(document_on_ad_ctr, how = 'left', on = ["document_id", "ad_id"])

train = train.merge(document_on_ad_document_ctr, how = 'left', on = ["document_id", "ad_document_id"])
test = test.merge(document_on_ad_document_ctr, how = 'left', on = ["document_id", "ad_document_id"])

train = train.merge(document_on_advertiser_ctr, how = 'left', on = ["document_id", "advertiser_id"])
test = test.merge(document_on_advertiser_ctr, how = 'left', on = ["document_id", "advertiser_id"])

train = train.merge(document_on_campaign_ctr, how = 'left', on = ["document_id", "campaign_id"])
test = test.merge(document_on_campaign_ctr, how = 'left', on = ["document_id", "campaign_id"])

train = train.merge(time_table, how = 'left', on = 'display_id')
test = test.merge(time_table, how = 'left', on = 'display_id')

In [29]:
#consider adding bit signaling the NAed ad_on_doc
#check if median or mean are better (or min?)

In [147]:
#save
train.to_csv(path_b + 'train_current.csv', index=False)
test.to_csv(path_b + 'test_current.csv', index=False)

In [145]:
#load
train = p.read_csv(path_b + 'train_current.csv')
test = p.read_csv(path_b + 'test_current.csv')

In [148]:
#fill NAs, pick with which method
test.score_ad = test.score_ad.fillna(test.score_ad.median())
test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.median())
test.score_adv = test.score_adv.fillna(test.score_adv.median())
test.score_camp = test.score_camp.fillna(test.score_camp.median())
test.score_docXad = test.score_docXad.fillna(test.score_docXad.median())
test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.median())
test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.median())
test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.median())

In [149]:
#fill NAs, pick with which method
test.score_ad = test.score_ad.fillna(test.score_ad.mean())
test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.mean())
test.score_adv = test.score_adv.fillna(test.score_adv.mean())
test.score_camp = test.score_camp.fillna(test.score_camp.mean())
test.score_docXad = test.score_docXad.fillna(test.score_docXad.mean())
test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.mean())
test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.mean())
test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.mean())

In [209]:
predictors=[x for x in train.columns if x not in ['weekend','morning','noon','evening','night','cor_top','plat_1','plat_2','plat_3','score_docXcamp','display_id','ad_id','clicked','document_id','platform','ad_document_id','campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc','category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]

In [210]:
train[predictors].head()

Unnamed: 0,score_ad,score_ad_doc,score_adv,score_camp,score_docXad,score_docXad_doc,score_docXadv,cor_cat
0,0.207916,0.190167,0.178165,0.208096,0.419899,0.422627,0.268833,0.339842
1,0.803446,0.737809,0.748747,0.738674,1.091258,1.089025,1.061947,0.461964
2,0.970204,0.773144,0.596972,0.954316,1.61796,1.339943,1.01494,1.156469
3,0.813793,0.804405,0.829231,0.913688,1.200622,1.215162,1.260789,1.198036
4,0.795866,0.795943,0.77109,0.798416,1.260653,1.275921,1.323828,1.253104


In [None]:
#multi_class, fit_intercept, intercept_scaling, dual, random_state, solver, verbose, pentaly, class weights
alg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='sag', max_iter=75, multi_class='ovr', verbose=1, n_jobs=4)
log_params = {'C' : [10 ** i for i in range(-12,2)], 'solver' : ['lbfgs', 'sag']}
grid_log = GridSearchCV(alg, log_params, fit_params=None, n_jobs=-1, iid=True, refit=True, verbose=1, pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True)
grid_log.fit(train[predictors], train["clicked"])

Fitting 3 folds for each of 28 candidates, totalling 84 fits


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.1s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...


  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.5s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    5.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...


[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    5.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

In [188]:
#[i -> score_ad, j -> score_ad_doc, k->score_adv, l->score_camp, t->score_docXad,n->score_docXad_doc,m->score_docXadv,p->score_docXcamp]
i_l = ['','score_ad']
j_l = ['','score_ad_doc']
k_l = ['','score_adv']
l_l = ['','score_camp']
t_l = ['','score_docXad']
n_l = ['','score_docXad_doc']
m_l = ['','score_docXadv']
p_l = ['','score_docXcamp']
results = [0] * 256
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                for t in range(2):
                    for n in range(2):
                        for m in range(2):
                            for p in range(2):
                                predictors=[x for x in train.columns if x not in ['display_id','ad_id','clicked','document_id','platform','ad_document_id','campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc','category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]
                                to_reduce = i_l[i] + j_l[j] + k_l[k] + l_l[l] + t_l[t] + n_l[n] + m_l[m] + p_l[p]
                                predictors = [x for x in predictors if x not in to_reduce]
                                alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
                                alg.fit(train[predictors], train["clicked"])
                                predY = list(alg.predict_proba(test[predictors]).astype(float)[:,1])
                                predict = np.asarray(predY)
                                test_copy = test.copy()
                                test_copy['predict'] = predict
                                results[i + j * 2 + k * 4 + l * 8 + t * 16 + n * 32 + m * 64 + p * 128] = score_map(test_copy)

MAP: 0.649737819877
MAP: 0.650101485384
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.649984732469
MAP: 0.649957822527
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.648364339188
MAP: 0.649261721798
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.649194778412
MAP: 0.649657418206
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.646508970320
MAP: 0.648178032597
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.648075269005
MAP: 0.649584920950
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.639095776162
MAP: 0.642051811385


In [211]:
alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
alg.fit(train[predictors], train["clicked"])

LogisticRegression(C=1e-10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [212]:
predY = list(alg.predict_proba(test[predictors]).astype(float)[:,1])
predict = np.asarray(predY)
test_copy = test.copy()
test_copy['predict'] = predict
map_score = score_map(test_copy)
portion_score = score_portion(test_copy)

MAP: 0.650110219277
PORTION: 0.444433910277


In [None]:
print portion_score
print map_score
print values
0.649734852141 -> docXad 0.649983074581
0.649734852141

0.650101485384 -> no score_docXcamp
0.650121732777 -> no score_docXcamp, no platform
0.650124145533 -> no score_docXcamp, no platform, no cor_top
0.650107933508 -> no score_docXcamp, no platform, no cor_top, no cor_cat
0.650126784043 -> no score_docXcamp, no platform, no cor_top, no cor_cat, no times

0.659890146771 c = 0.0000000001 solver = lbfgs, no corellations