In [7]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from collections import Counter
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

import xgboost as xgb

import lightgbm as lgb

from catboost import CatBoostClassifier

from sklearn import svm

from fastFM import als



In [8]:
train=pd.read_csv('./train.csv')

In [9]:
validation=pd.read_csv('./validation.csv')

In [34]:
test=pd.read_csv('./test.csv')

# Preprocess Data

In [27]:
def preprocessData(data):
    click=data['click']
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([click,weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [28]:
def preprocessTestData(data):
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [29]:
train_processed=preprocessData(train)


In [30]:
validation_processed=preprocessData(validation)

In [35]:
test_processed=preprocessTestData(test)
#test_processed

In [36]:
train_features=list(train_processed)
validation_features=list(validation_processed)
test_features=list(test_processed)

In [37]:
print(list(set(train_features)-set(validation_features)))
print(list(set(validation_features)-set(train_features)))
print(list(set(train_features)-set(test_features)))
print(list(set(test_features)-set(train_features)))

['useragent__other_firefox', 'useragent__android_maxthon', 'creative__7324', 'useragent__android_ie', 'creative__7332']
[]
['useragent__linux_ie', 'useragent__other_firefox', 'creative__7324', 'creative__7327', 'useragent__android_ie', 'click', 'useragent__mac_sogou', 'useragent__mac_maxthon']
[]


In [38]:
remove_features=['click', 'useragent__android_maxthon', 'useragent__linux_ie', 'creative__7324', 'useragent__other_firefox',\
                 'creative__7332', 'creative__7327', 'useragent__android_ie', 'useragent__mac_sogou', 'useragent__mac_maxthon']
feature=[]
for i in train_features:
    if(i not in remove_features):
        feature.append(i)

In [39]:
print(list(set(feature)-set(validation_features)))
print(list(set(validation_features)-set(feature)))
print(list(set(feature)-set(test_features)))
print(list(set(test_features)-set(feature)))

[]
['useragent__linux_ie', 'creative__7327', 'click', 'useragent__mac_sogou', 'useragent__mac_maxthon']
[]
['creative__7332', 'useragent__android_maxthon']


In [40]:
train_processed.click.value_counts()

0    2429188
1       1793
Name: click, dtype: int64

In [41]:
validation.click.value_counts()

0    303723
1       202
Name: click, dtype: int64

# Train Model

In [42]:
train_not_clicked = train_processed[train_processed.click == 0].sample(n= 10000, random_state = 2)
train_clicked = train_processed[train_processed.click == 1]
train_sample = pd.concat([train_clicked,train_not_clicked],0)

In [43]:
downsampling_rate=10000/2429188

In [44]:
train_set=train_sample[feature]
train_label=train_sample['click']
validation_set=validation_processed[feature]
validation_label=validation_processed['click']

### LightGBM

In [72]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(train_set, train_label)
lgb_eval = lgb.Dataset(validation_set, validation_label, reference=lgb_train)

In [None]:
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
    'sigmoid' : [0.85,0.90,0.95],
    'num_leaves': [31, 127],
    'feature_fraction': [0.5, 1.0],
    'bagging_fraction': [0.75, 0.95], 
    'reg_alpha': [0.1, 0.5]}


In [33]:
lgb_estimator = lgb.LGBMRegressor(boosting_type='gbdt',
                                  objective='regression',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.01,
                                  eval_metric='l1')

In [34]:
gsearch = GridSearchCV(estimator=lgb_estimator, 
                       param_grid=param_grid)

In [35]:
lgb_model = gsearch.fit(train_set,train_label)



KeyboardInterrupt: 

In [62]:
print(lgb_model.best_params_, lgb_model.best_score_)

{'bagging_fraction': 0.75, 'feature_fraction': 0.5, 'learning_rate': 0.01, 'n_estimators': 20, 'num_leaves': 31, 'reg_alpha': 0.1, 'sigmoid': 0.85} -0.2795447458684128


In [88]:
params = {'bagging_fraction': 0.75, 'feature_fraction': 0.5, 'learning_rate': 0.01,'num_leaves': 31, 'reg_alpha': 0.1, 'sigmoid': 0.85}

In [89]:
# gbm = lgb.train(lgb_model.best_params_,
#                 lgb_train,
#                 num_boost_round=20,
#                 valid_sets=lgb_eval
#                 )


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200, # 200
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=10,)

In [90]:
LGBM_pred = gbm.predict(validation_set, num_iteration=gbm.best_iteration)

In [91]:
LGBM_pred

array([0.07637173, 0.05624167, 0.05761432, ..., 0.19910678, 0.08987092,
       0.09018203])

In [25]:
#array([0.08603051, 0.07863552, 0.0842081 , ..., 0.16756944, 0.11447344,
#      0.11759466])

### CatBoost

In [24]:
# Basic Code

# specify the training parameters 
model = CatBoostClassifier(iterations=20, depth=16, learning_rate=.05)

# Fit model
model.fit(train_set, train_label)#cat_features)

# Get predictions
CB_pred = model.predict_proba(validation_set)


0:	learn: 0.6544279	total: 5.38s	remaining: 1m 42s
1:	learn: 0.6145633	total: 10.6s	remaining: 1m 35s
2:	learn: 0.5783315	total: 15.9s	remaining: 1m 29s
3:	learn: 0.5467679	total: 21.3s	remaining: 1m 25s
4:	learn: 0.5197138	total: 26.5s	remaining: 1m 19s
5:	learn: 0.4981117	total: 32.1s	remaining: 1m 14s
6:	learn: 0.4794233	total: 37.6s	remaining: 1m 9s
7:	learn: 0.4632035	total: 42.9s	remaining: 1m 4s
8:	learn: 0.4485073	total: 48.3s	remaining: 59s
9:	learn: 0.4348724	total: 48.4s	remaining: 48.4s
10:	learn: 0.4207107	total: 53.7s	remaining: 44s
11:	learn: 0.4099484	total: 59.5s	remaining: 39.6s
12:	learn: 0.3981516	total: 1m 5s	remaining: 35.1s
13:	learn: 0.3844398	total: 1m 10s	remaining: 30.3s
14:	learn: 0.3763814	total: 1m 16s	remaining: 25.5s
15:	learn: 0.3703980	total: 1m 16s	remaining: 19.2s
16:	learn: 0.3660248	total: 1m 22s	remaining: 14.5s
17:	learn: 0.3563662	total: 1m 28s	remaining: 9.83s
18:	learn: 0.3496879	total: 1m 34s	remaining: 4.96s
19:	learn: 0.3441257	total: 1m 40

In [25]:
# Grid Search
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'thread_count':4}

In [26]:
# this function does 3-fold crossvalidation with catboostclassifier          
def crossvaltest(params,train_set,train_label,n_splits=3):
    kf = KFold(n_splits=n_splits,shuffle=True) 
    res = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index,:]
        test = train_set.iloc[test_index,:]

        labels = train_label.ix[train_index]
        test_labels = train_label.ix[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels))

        res.append(np.mean(clf.predict(test)==np.ravel(test_labels)))
    return np.mean(res)

In [27]:
# this function runs grid search on several parameters
def catboost_param_tune(params,train_set,train_label,n_splits=3):
    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually 
    #   but 'iterations','learning_rate' together
    for prms in chain(ps.grid_search(['border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth'])):
        res = crossvaltest(prms,train_set,train_label,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        #print(res,prms,s'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

#bestparams = catboost_param_tune(params,train_set,train_label)

In [54]:
bestparams = {'border_count': 20,'depth': 16,'iterations': 250,'l2_leaf_reg': 3,'learning_rate': 0.03,'thread_count': 4}

In [111]:
from sortedcontainers import SortedList
import copy
import collections
import numpy as np
from itertools import product,chain
import pandas
from sklearn.model_selection import KFold
import catboost as cb

''' a class for doing grid search on a set of parameters provided in a dict. 'pdict' should be a dictionary like the following:
pdict = {'depth':[1,2], 'iterations':[250,100,500], 'thread_count':4}

when grid_search is called it will return an iterator that provides samples from the dictionary e.g.
{'depth':1, 'iterations':250, 'thread_count':4}
{'depth':2, 'iterations':250, 'thread_count':4}
{'depth':1, 'iterations':100, 'thread_count':4}
etc.
after calling an iteration of grid_search, you need to test the classifier and run 'register_result'
This will update the internal list of results, so that the next call to grid_search will use the best
parameters for all the parameters not currently being updated.

grid_search can be provided a list e.g. grid_search(['depth']) this will use the current best parameters for all
the other arguments and only search over 'depth'. You can then call e.g. grid_search(['iterations']) and it will use
the best depth found previously and cycle through all the 'iterations'. Searching incrementally can be much faster
than doing a full grid search, but may miss the global optimum. '''
class paramsearch:
    def __init__(self,pdict):    
        self.pdict = {}
        # if something is not passed in as a sequence, make it a sequence with 1 element
        #   don't treat strings as sequences
        for a,b in pdict.items():
            if isinstance(b, collections.Sequence) and not isinstance(b, str): self.pdict[a] = b
            else: self.pdict[a] = [b]
        # our results are a sorted list, so the best score is always the final element
        self.results = SortedList()       
                    
    def grid_search(self,keys=None):
        # do grid search on only the keys listed. If none provided, do all
        if keys==None: keylist = self.pdict.keys()
        else: keylist = keys
 
        listoflists = [] # this will be list of lists of key,value pairs
        for key in keylist: listoflists.append([(key,i) for i in self.pdict[key]])
        for p in product(*listoflists):
            # do any changes to the current best parameter set
            if len(self.results)>0: template = self.results[-1][1]
            else: template = {a:b[0] for a,b in self.pdict.items()}
            # if our updates are the same as current best, don't bother
            if self.equaldict(dict(p),template): continue
            # take the current best and update just the ones to change
            yield self.overwritedict(dict(p),template)
                              
    def equaldict(self,a,b):
        for key in a.keys(): 
            if a[key] != b[key]: return False
        return True            
                              
    def overwritedict(self,new,old):
        old = copy.deepcopy(old)
        for key in new.keys(): old[key] = new[key]
        return old            
    
    # save a (score,params) pair to results. Since 'results' is a sorted list,
    #   the best score is always the final element. A small amount of noise is added
    #   because sorted lists don't like it when two scores are exactly the same    
    def register_result(self,result,params):
        self.results.add((result+np.random.randn()*1e-10,params))    
        
    def bestscore(self):
        return self.results[-1][0]
        
    def bestparam(self):
        return self.results[-1][1]
        


In [48]:
# train classifier with tuned parameters    


clf = CatBoostClassifier(border_count=20,depth= 16,iterations=250,l2_leaf_reg=3,learning_rate= 0.03,thread_count=4)
clf.fit(train_set, train_label)
CB_pred = clf.predict_proba(validation_set)
#print('error:',1-np.mean(res==np.ravel(validation_label)))



0:	learn: 0.6690839	total: 6.17s	remaining: 25m 36s
1:	learn: 0.6462031	total: 12.5s	remaining: 25m 52s
2:	learn: 0.6251182	total: 12.8s	remaining: 17m 34s
3:	learn: 0.6011717	total: 19s	remaining: 19m 30s
4:	learn: 0.5801353	total: 25.4s	remaining: 20m 45s
5:	learn: 0.5639964	total: 31.7s	remaining: 21m 29s
6:	learn: 0.5463799	total: 37.8s	remaining: 21m 53s
7:	learn: 0.5302108	total: 43.7s	remaining: 22m 2s
8:	learn: 0.5170292	total: 49.8s	remaining: 22m 13s
9:	learn: 0.5012663	total: 55.9s	remaining: 22m 22s
10:	learn: 0.4881635	total: 1m 2s	remaining: 22m 28s
11:	learn: 0.4757177	total: 1m 8s	remaining: 22m 46s
12:	learn: 0.4645900	total: 1m 15s	remaining: 22m 48s
13:	learn: 0.4548794	total: 1m 18s	remaining: 22m 8s
14:	learn: 0.4465294	total: 1m 25s	remaining: 22m 16s
15:	learn: 0.4383843	total: 1m 31s	remaining: 22m 17s
16:	learn: 0.4280672	total: 1m 37s	remaining: 22m 18s
17:	learn: 0.4203224	total: 1m 44s	remaining: 22m 22s
18:	learn: 0.4109553	total: 1m 50s	remaining: 22m 24s


KeyboardInterrupt: 

In [48]:
CB_pred = CB_pred[:,1]
CB_predCTR=pd.DataFrame(CB_pred)
CB_predCTR.to_csv('cb_pred.csv')

In [55]:
CB_pred = pd.read_csv('./cb_pred.csv')

In [56]:
#CB_pred = CB_pred[0]
CB_pred = CB_pred['0']
CB_pred 

0         0.039744
1         0.040818
2         0.024539
3         0.026173
4         0.026456
5         0.081049
6         0.078802
7         0.118493
8         0.045317
9         0.019441
10        0.131289
11        0.061049
12        0.022659
13        0.072006
14        0.213817
15        0.030137
16        0.072789
17        0.063649
18        0.050934
19        0.074363
20        0.109744
21        0.081739
22        0.051446
23        0.039839
24        0.116298
25        0.112174
26        0.017095
27        0.086274
28        0.060940
29        0.152633
            ...   
303895    0.571795
303896    0.020277
303897    0.134538
303898    0.184158
303899    0.068399
303900    0.151672
303901    0.026061
303902    0.090400
303903    0.019429
303904    0.095197
303905    0.064984
303906    0.135669
303907    0.052434
303908    0.060051
303909    0.040044
303910    0.195576
303911    0.118841
303912    0.045416
303913    0.160437
303914    0.036264
303915    0.085645
303916    0.

### Factorisation Machine

In [57]:
import scipy.sparse as sp

sparse_x_train = sp.csc_matrix(train_set)
sparse_y_train = sp.csc_matrix(train_label)
sparse_x_val = sp.csc_matrix(validation_set)
sparse_y_val = sp.csc_matrix(validation_label)

In [58]:
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=6, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(sparse_x_train, train_label)

FMRegression(init_stdev=0.1, l2_reg=0, l2_reg_V=0.5, l2_reg_w=0.1,
       n_iter=1000, random_state=123, rank=6)

In [59]:
fm_pred = fm.predict(sparse_x_val)

In [60]:
fm_pred

array([0.06651411, 0.11849002, 0.0470538 , ..., 0.16037129, 0.35451194,
       0.15506717])

In [81]:
from fastFM import mcmc
fm = mcmc.FMClassification(n_iter=1000, rank=2, init_stdev=0.1)
y_pred_proba = fm.fit_predict_proba(sparse_x_train, train_label, sparse_x_val)

# Evaluation

In [61]:
def bid_linear(base_bid, p_ctr, avg_ctr):
    return (base_bid*p_ctr/avg_ctr)

In [62]:
def evaluate(bidprice):
    budget=6250
    impression=0
    click=0
    for row in range(0,validation.shape[0]):
        if budget<validation.payprice[row]:
            continue
        if bidprice.iloc[row,0]>validation.payprice[row]:
            budget=budget-validation.payprice[row]/1000
            impression=impression+1
            if validation.click[row]==1:
                click=click+1
    return click,(click/impression),budget, (6250-budget)/impression,((6250-budget)/click)/1000

In [63]:
def pred_ctr_calibration(pred_ctr):
    return pred_ctr/(pred_ctr+(1-pred_ctr)/downsampling_rate)

In [64]:
number_of_click=len(train[train.click==1])
number_of_impression=len(train[train.bidprice>train.payprice])

## LightGBM Results

In [92]:
avg_ctr=number_of_click/number_of_impression
LGBM_predCTR=pd.DataFrame(LGBM_pred)
LGBM_predCTR_calibrated=pred_ctr_calibration(LGBM_predCTR)

In [93]:
for counter in range(95,100,1):
    base_bid=counter
    bidprice=bid_linear(base_bid, LGBM_predCTR_calibrated, avg_ctr)
    print('basebid ', base_bid, ' click, CTR ,budget remaining ', evaluate(bidprice))

basebid  95  click, CTR ,budget remaining  (159, 0.0011100251326445127, 370.4329999971867, 0.041046963138807686, 0.03697840880504914)
basebid  96  click, CTR ,budget remaining  (161, 0.0011124469687547503, 271.78199999720965, 0.04130714591713162, 0.03713178881989311)
basebid  97  click, CTR ,budget remaining  (161, 0.0011020301997344175, 181.11999999724023, 0.041540925705386665, 0.03769490683231527)
basebid  98  click, CTR ,budget remaining  (162, 0.0010993410739612245, 98.83299999724913, 0.041742163801838685, 0.037970166666683645)
basebid  99  click, CTR ,budget remaining  (160, 0.0010786904697696997, 45.0129999972773, 0.04183287713717385, 0.03878116875001702)


In [95]:
#basebid  98  click, CTR ,budget remaining  (162, 0.0010993410739612245, 98.83299999724913, 0.041742163801838685, 0.037970166666683645)

#162, 0.0010993410739612245, 98.83299999724913, 0.041742163801838685, 0.037970166666683645

## CatBoost Results

In [67]:
avg_ctr=number_of_click/number_of_impression
CB_predCTR=pd.DataFrame(CB_pred)
CB_predCTR_calibrated=pred_ctr_calibration(CB_predCTR)

In [68]:
for counter in range(125,135,1):
    base_bid=counter
    bidprice=bid_linear(base_bid, CB_predCTR_calibrated, avg_ctr)
    print('basebid ', base_bid, ' click, CTR ,budget remaining ', evaluate(bidprice))

basebid  125  click, CTR ,budget remaining  (159, 0.001177542269323913, 425.60599999744375, 0.043135032252827624, 0.03663140880504752)
basebid  126  click, CTR ,budget remaining  (159, 0.0011704430016342034, 371.45199999745853, 0.04327361865643848, 0.036972000000015985)
basebid  127  click, CTR ,budget remaining  (159, 0.0011633011413520631, 312.74999999746444, 0.04343905472638671, 0.037341194968569406)
basebid  128  click, CTR ,budget remaining  (159, 0.0011567085458209358, 260.50799999749313, 0.04357293447502533, 0.03766976100630507)
basebid  129  click, CTR ,budget remaining  (159, 0.0011500904159132007, 206.64199999751014, 0.04371325858952976, 0.038008540880518804)
basebid  130  click, CTR ,budget remaining  (159, 0.001143835517891314, 156.8619999975122, 0.043833633080604346, 0.038321622641525076)
basebid  131  click, CTR ,budget remaining  (159, 0.001137729692598317, 110.33599999752867, 0.0439325662602501, 0.03861423899372624)
basebid  132  click, CTR ,budget remaining  (159, 0.00

In [69]:
#basebid  132  click, impression,budget remaining  (159, 0.0011320593512374334, 68.93599999755462)

## FM Results

In [70]:
avg_ctr=number_of_click/number_of_impression
fm_predCTR=pd.DataFrame(fm_pred)
fm_predCTR_calibrated=pred_ctr_calibration(fm_predCTR)

In [71]:
for counter in range(85,100,1):
    base_bid=counter
    bidprice=bid_linear(base_bid, fm_predCTR_calibrated, avg_ctr)
    print('basebid ', base_bid, ' click, CTR ,budget remaining ', evaluate(bidprice))

basebid  85  click, CTR ,budget remaining  (125, 0.0011579005872871778, 15.448999998775381, 0.05775192211498624, 0.049876408000009795)
basebid  86  click, CTR ,budget remaining  (125, 0.0011592536261453427, 14.427999998793277, 0.057828875616734124, 0.04988457600000966)
basebid  87  click, CTR ,budget remaining  (124, 0.0011512608162810563, 13.662999998788594, 0.05790040665504151, 0.05029304032259042)
basebid  88  click, CTR ,budget remaining  (123, 0.0011440157743963688, 12.830999998785607, 0.058011542468109065, 0.050708691056920434)
basebid  89  click, CTR ,budget remaining  (122, 0.0011360673445822624, 12.132999998791064, 0.05808718851269424, 0.05113005737705909)
basebid  90  click, CTR ,budget remaining  (121, 0.0011286786996875147, 11.5659999988005, 0.05819163285295648, 0.051557305785133886)
basebid  91  click, CTR ,budget remaining  (120, 0.001120961037262613, 11.033999998805523, 0.05828031499006263, 0.051991383333343286)


KeyboardInterrupt: 

# Test Pred

In [17]:
test_set=test_processed[feature]

In [38]:
LGBM_pred_test = gbm.predict(test_set, num_iteration=gbm.best_iteration)

In [39]:
csv_df
for row in test_set.iterrow():
    

array([0.15202686, 0.20262382, 0.14515533, ..., 0.14072025, 0.1450248 ,
       0.14367853])

In [44]:
avg_ctr=number_of_click/number_of_impression
LGBM_predCTR_test = pd.DataFrame(LGBM_pred_test)
LGBM_predCTR_calibrated=pred_ctr_calibration(LGBM_predCTR_test)

In [48]:
for row in test.iterrow():


0.0007383985358870749

In [57]:
bidprice = 98 * (LGBM_predCTR_calibrated/avg_ctr)

In [59]:
test['bidprice'] = bidprice

In [62]:
test.loc[:, ['bidid', 'bidprice']].to_csv('submission.csv', index=False)

In [63]:
temp_df = pd.read_csv('submission.csv')

In [66]:
!head submission.csv

bidid,bidprice
366c563de7d90feb9d4dab53e795a93fb3157387,97.87954711250069
29167d4caa719788b5a342dbaa25151d53121f80,138.6906865708191
ff8bc3f4d44a3ea60c5f3a3a8fbe7cd98fb2966e,92.70783288397554
844c2da00d45315f20b748ec131c26ee99a7cbc7,90.32843623581051
c6017f0ad0c44d7d0c9b62583ea863f28941c0ca,93.28794140207727
7493c42f6d8f29d9f414b75c125ce3db40461ef9,91.40766300766559
819fb6958ea48b10430710d009c57d55a31debcb,87.65710511553648
8660774f6c94e32dfbd46add8a9bc94df3c74130,94.62909657843589
ccc4c75ae4fd33526e5ecdae482c45c1dabc94db,93.4910502726202
