In [3]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from collections import Counter
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [4]:
train = pd.read_csv("/Users/dingding/Desktop/we_data/train.csv")
validation = pd.read_csv("/Users/dingding/Desktop/we_data/validation.csv")
test=pd.read_csv('/Users/dingding/Desktop/we_data/test.csv')

In [5]:
train['domain'].value_counts()

5F1RQS9rg5scFsf                                 192662
31xSTvprdN1RFt                                  138086
ersbQv1RdoTy1m58uG                               80892
trqRTuMvjTN7X9KbuKz                              75043
DFpETuxoGQdcFNKbuKz                              71060
3FKElpuEMusyJqKbuKz                              63618
3FF-e59aG5syJqKbuKz                              44466
5F97t5E0BTK7XhNrUMpENpn                          28219
DD1SqS9rg5scFsf                                  28137
DDTSQuf0MTTNaqKIvMpENpn                          26922
31drTvprdN1RFt                                   26237
dd4270481b753dde29898e27c7c03920                 25227
trqRTummPvas1m58uG                               20594
3SCYZrn0Qo18XMB4JKTI                             20536
trqRTuN-XIuc1mKYUV                               18947
trqRTvFoMNmIFY5SaMpENpn                          18188
trqRTvFRLpscFU                                   17954
tK1jlK9rg5scFsf                                  17881
20fc675468

In [6]:
type(train.usertag[0])

float

In [7]:
train.columns

Index(['click', 'weekday', 'hour', 'bidid', 'userid', 'useragent', 'IP',
       'region', 'city', 'adexchange', 'domain', 'url', 'urlid', 'slotid',
       'slotwidth', 'slotheight', 'slotvisibility', 'slotformat', 'slotprice',
       'creative', 'bidprice', 'payprice', 'keypage', 'advertiser', 'usertag'],
      dtype='object')

In [8]:
train.creative.value_counts()

44966cc8da1ed40c95d59e863c8c75f0    147811
832b91d59d0cb5731431653204a76c0e    133077
86c2543527c86a893d4d4f68810a0416    126157
48f2e9ba15708c0146bda5e1dd653caa    107449
77819d3e0b3467fe5c7b16d68ad923a1    106436
e1af08818a6cd6bbba118bb54a651961     97033
c46090c887c257b61ab1fa11baee91d8     85682
10722                                78307
0cd33fcb336655841d3e1441b915748d     75359
b90c12ed2bd7950c6027bf9c6937c48a     74424
00fccc64a1ee2809348509b7ac2a97a5     57629
cb7c76e7784031272e37af8e7e9b062c     53706
10717                                50878
911b2d84826786018761e8c0b0a3a60c     50726
449a22cd91d9042eda3d3a1b89a22ea8     50549
7323                                 47191
59f065a795a663140e36eec106464524     41741
a10c31a8ff5f42930b4c34035e523886     40054
d01411218cc79bc49d2a4078c4093b76     37611
d881a6c788e76c2c27ed1ef04f119544     37118
a499988a822facd86dd0e8e4ffef8532     33868
f65c8bdb41e9015970bac52baa813239     31811
7184c9560e68e977187e67e45a4f3198     30125
cc9b344e950

# Preprocess Data

In [9]:
def preprocessData(data):
    click=data['click']
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([click,weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [10]:
def preprocessTestData(data):
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [11]:
train_processed=preprocessData(train)

In [12]:
validation_processed=preprocessData(validation)

In [13]:
test_processed=preprocessTestData(test)

In [14]:
print('Number of features in processed training set',len(train_processed.columns))
print('Number of features in processed validation set',len(validation_processed.columns))
print('Number of features in processed test set',len(test_processed.columns))

Number of features in processed training set 756
Number of features in processed validation set 751
Number of features in processed test set 748


In [15]:
train_features=list(train_processed)
validation_features=list(validation_processed)
test_features=list(test_processed)

In [16]:
print(list(set(train_features)-set(validation_features)))
print(list(set(validation_features)-set(train_features)))
print(list(set(train_features)-set(test_features)))
print(list(set(test_features)-set(train_features)))

['useragent__android_maxthon', 'useragent__android_ie', 'useragent__other_firefox', 'creative__7332', 'creative__7324']
[]
['click', 'useragent__other_firefox', 'creative__7327', 'useragent__mac_sogou', 'useragent__linux_ie', 'useragent__android_ie', 'creative__7324', 'useragent__mac_maxthon']
[]


In [17]:
remove_features=['click', 'useragent__android_maxthon', 'useragent__linux_ie', 'creative__7324', 'useragent__other_firefox',\
                 'creative__7332', 'creative__7327', 'useragent__android_ie', 'useragent__mac_sogou', 'useragent__mac_maxthon']
feature=[]
for i in train_features:
    if(i not in remove_features):
        feature.append(i)

In [18]:
len(feature)

746

In [19]:
print(list(set(feature)-set(validation_features)))
print(list(set(validation_features)-set(feature)))
print(list(set(feature)-set(test_features)))
print(list(set(test_features)-set(feature)))

[]
['click', 'useragent__mac_sogou', 'useragent__linux_ie', 'useragent__mac_maxthon', 'creative__7327']
[]
['useragent__android_maxthon', 'creative__7332']


In [20]:
train_processed.click.value_counts()

0    2429188
1       1793
Name: click, dtype: int64

In [21]:
validation.click.value_counts()

0    303723
1       202
Name: click, dtype: int64

# Train Model for downsampling

In [22]:
downsampling_rate=0.025

In [23]:
train_not_clicked = train_processed[train_processed.click == 0].sample(n= round(downsampling_rate*2429188), random_state = 2)
train_clicked = train_processed[train_processed.click == 1]
train_sample = pd.concat([train_clicked,train_not_clicked],0)

In [24]:
train_set=train_sample[feature]
train_label=train_sample['click']
validation_set=validation_processed[feature]
validation_label=validation_processed['click']
test_set=test_processed[feature]

# Evaluation

In [25]:
def bid_linear(base_bid, p_ctr, avg_ctr):
    return (base_bid*p_ctr/avg_ctr)

In [26]:
def ortb(c,p_ctr,lamada):
    return(np.sqrt((c*p_ctr/lamada)+np.square(c))-c)

In [27]:
def evaluate(bidprice):
    budget=6250
    impression=0
    click=0
    for row in range(0,validation.shape[0]):
        if budget<validation.payprice[row]:
            continue
        if bidprice.iloc[row,0]>validation.payprice[row]:
            budget=budget-validation.payprice[row]/1000
            impression=impression+1
            if validation.click[row]==1:
                click=click+1
                cost=6250-budget
    return click,(click/impression),budget,(cost/impression),(cost/click)

In [28]:
def pred_ctr_calibration(pred_ctr):
    return pred_ctr/(pred_ctr+(1-pred_ctr)/downsampling_rate)

In [29]:
number_of_click=len(train[train.click==1])
number_of_impression=len(train[train.bidprice>train.payprice])

## GB+linear on testset 179 clicks

In [30]:
import numpy as np  
import pandas as pd  
from sklearn import linear_model  
from sklearn.preprocessing import OneHotEncoder  
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import GradientBoostingClassifier

In [31]:


gbr=GradientBoostingRegressor()
gbr.fit(train_set,train_label)  
enc = OneHotEncoder()  
enc.fit(gbr.apply(train_set))
new_feature_train=enc.transform(gbr.apply(train_set))  
new_feature_train=new_feature_train.toarray()
new_train=np.concatenate([train_set,new_feature_train],axis=1) 

new_feature_test=enc.transform(gbr.apply(validation_set))  
new_feature_test=new_feature_test.toarray()  
new_test=np.concatenate([validation_set,new_feature_test],axis=1)  
   
lr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, \
                              intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', \
                              max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1) 
lr.fit(new_train, train_label)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
pred_ctr = []
for a, b in lr.predict_proba(new_test):
    pred_ctr.append(b)

auc_etcf = roc_auc_score(validation_label, pred_ctr)
print('AUC:', auc_etcf)

AUC: 0.8624586293340568


In [33]:
avg_ctr=number_of_click/number_of_impression
predCTR=pd.DataFrame(pred_ctr)
predCTR_calibrated=pred_ctr_calibration(predCTR)
for counter in range(152,154,1):
    base_bid=counter
    bidprice=bid_linear(base_bid, predCTR_calibrated, avg_ctr)
    print('basebid ', base_bid, ' click ', evaluate(bidprice))

basebid  152  click  (163, 0.0013737768750368728, 78.4699999980113, 0.05195088958375394, 37.81610429449073)
basebid  153  click  (162, 0.0013622030691612362, 66.43899999800907, 0.051791254992659165, 38.02021604939501)


In [158]:
new_feature_test=enc.transform(gbr.apply(test_set))
new_feature_test=new_feature_test.toarray()  
new_test=np.concatenate([test_set,new_feature_test],axis=1)
pred_ctr_test=[]
for a, b in lr.predict_proba(new_test):
    pred_ctr_test.append(b)
predCTR_test=pd.DataFrame(pred_ctr_test)
predCTR_calibrated_test=pred_ctr_calibration(predCTR_test)
bidprice=bid_linear(152, predCTR_calibrated_test, avg_ctr)

In [159]:
group8=pd.read_csv("/Users/dingding/Desktop/we_data/Group_8.csv")
group8['bidprice']=bidprice
group8.to_csv("/Users/dingding/Desktop/Group_8.csv")

## xgboost +linear 

In [42]:
xgboost=xgb.XGBClassifier(booster='gbtree',learning_rate = 0.1,
 n_estimators=50,
 max_depth=9,
 min_child_weight=9,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.85,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1, seed=27,n_jobs=4)

In [43]:
#gbr=GradientBoostingRegressor()  
#gbr=GradientBoostingRegressor()
xgboost.fit(train_set,train_label)
enc = OneHotEncoder()  
enc.fit(xgboost.apply(train_set))
new_feature_train=enc.transform(xgboost.apply(train_set))  
new_feature_train=new_feature_train.toarray()
new_train=np.concatenate([train_set,new_feature_train],axis=1)

new_feature_test=enc.transform(xgboost.apply(validation_set)) 
new_feature_test=new_feature_test.toarray()  
new_test=np.concatenate([validation_set,new_feature_test],axis=1)  
   
lr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, \
                              intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', \
                              max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1) 
lr.fit(new_train, train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
ctr = []
for a, b in lr.predict_proba(new_test):
    ctr.append(b)

auc_etcf = roc_auc_score(validation_label, ctr)
print('AUC:', auc_etcf)

AUC: 0.8598725868082704


In [46]:
avg_ctr=number_of_click/number_of_impression
CTR=pd.DataFrame(ctr)
predCTR=pred_ctr_calibration(CTR)
for counter in range(221,227,5):
    base_bid=counter
    bidprice1=bid_linear(base_bid, predCTR, avg_ctr)
    print('basebid ', base_bid, ' click ', evaluate(bidprice1))

basebid  221  click  (158, 0.0015914745313711863, 117.05299999870564, 0.061690629438262826, 38.763189873425915)
basebid  226  click  (159, 0.0015890942162967109, 76.94799999871158, 0.061635557732105585, 38.78659748428483)


In [None]:
new_feature_test_xg=enc.transform(xgboost.apply(test_set))
new_feature_test_xg=new_feature_test_xg.toarray()  
new_test_xg=np.concatenate([test_set,new_feature_test_xg],axis=1)
ctr_test=[]
for a, b in lr.predict_proba(new_test_xg):
    ctr_test.append(b)
CTR_test=pd.DataFrame(ctr_test)
CTR_calibrated_test=pred_ctr_calibration(dCTR_test)
bidprice_xg=bid_linear(226, CTR_calibrated_test, avg_ctr)

In [79]:
group=pd.read_csv("/Users/dingding/Desktop/we_data/Group_8.csv")
group['bidprice']=bidprice
group.to_csv("/Users/dingding/Desktop/Group_8_new.csv")

## ORTB Evaluation 

In [34]:
def ortb(c,p_ctr,lamada):
    return(np.sqrt((c*p_ctr/lamada)+np.square(c))-c)

In [35]:
def ortb1(c,p_ctr,lamada):
    a=np.sqrt(np.square(c)*np.square(lamada)+np.square(p_ctr))
    b=np.cbrt((p_ctr+a)/(c*lamada))
    d=np.cbrt((c*lamada)/(p_ctr+a))
    return(c*(b-d))

In [39]:
for lamda in np.arange(0.00000057,0.00000059,0.00000001):
    for c in range(10,20,1):
        bidprice=ortb(c, predCTR_calibrated, lamda)
        print('c ', c,'lamada',lamda, ' click ', evaluate(bidprice))

c  10 lamada 5.7e-07  click  (155, 0.0012357293194719052, 996.2949999972076, 0.041796024937837184, 33.8229612903406)
c  11 lamada 5.7e-07  click  (156, 0.0012042705285666864, 707.9719999973206, 0.04269012420971815, 35.44894871796589)
c  12 lamada 5.7e-07  click  (157, 0.0011784751882182507, 443.2949999974181, 0.043489044684495784, 36.902808917213896)
c  13 lamada 5.7e-07  click  (157, 0.0011487608748143325, 189.62799999746554, 0.04424597384924551, 38.51626114651296)
c  14 lamada 5.7e-07  click  (152, 0.0010946355655737114, 37.07399999751532, 0.04432203890278977, 40.49022368422687)
c  15 lamada 5.7e-07  click  (147, 0.0010622231535743448, 16.10699999757576, 0.04479969506248635, 42.175408163281794)
c  16 lamada 5.7e-07  click  (145, 0.0010567434809859051, 11.159999997637104, 0.0454062121941082, 42.96805517243009)
c  17 lamada 5.7e-07  click  (143, 0.0010518495634456533, 8.888999997705204, 0.04587348382874929, 43.61220979022584)
c  18 lamada 5.7e-07  click  (140, 0.0010386296024274258, 7.

In [41]:
for lamda in np.arange(0.00000052,0.00000053,0.00000001):
    for c in range(1,19,1):
        bidprice=ortb1(c, predCTR_calibrated, lamda)
        print('c ', c,'lamada',lamda, ' click ', evaluate(bidprice))

c  1 lamada 5.2e-07  click  (52, 0.0036376355369010143, 6134.467999998998, 0.00789548793291179, 2.170500000018732)
c  2 lamada 5.2e-07  click  (72, 0.0027705094659073418, 5937.721999998114, 0.011697899030392105, 4.222291666692083)
c  3 lamada 5.2e-07  click  (80, 0.0021108736378268557, 5678.591999996936, 0.014656956648011026, 6.943550000037123)
c  4 lamada 5.2e-07  click  (92, 0.001896398903386721, 5400.209999996033, 0.017047224455379956, 8.98926086960704)
c  5 lamada 5.2e-07  click  (99, 0.0017105831533477322, 5115.748999995519, 0.019083732181500626, 11.156272727316654)
c  6 lamada 5.2e-07  click  (103, 0.0015558677361369165, 4820.644999995198, 0.021016298847519765, 13.507766990336465)
c  7 lamada 5.2e-07  click  (106, 0.0014313492492168088, 4502.788999995078, 0.022947377660213524, 16.03199056608276)
c  8 lamada 5.2e-07  click  (114, 0.0013990476657994206, 4167.485999995039, 0.025506565706211556, 18.23137719302581)
c  9 lamada 5.2e-07  click  (114, 0.001290848563081731, 3833.245999995