In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import xgboost as xgb

from scipy.sparse import hstack
from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from collections import Counter
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
train=pd.read_csv('D:\\web_economics_data\\train.csv')

In [3]:
validation=pd.read_csv('D:\\web_economics_data\\validation.csv')

In [4]:
test=pd.read_csv('D:\\web_economics_data\\test.csv')

In [5]:
def preprocessData(data):
    click=data['click']
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([click,weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [6]:
def preprocessTestData(data):
    weekday=pd.get_dummies(data['weekday'],prefix='weekday_')
    hour=pd.get_dummies(data['hour'],prefix='hour_')
    useragent=pd.get_dummies(data['useragent'],prefix='useragent_')
    region=pd.get_dummies(data['region'],prefix='region_')
    city=pd.get_dummies(data['city'],prefix='city_')
    adexchange=pd.get_dummies(data['adexchange'],prefix='adexchange_')
    slotwidth=pd.get_dummies(data['slotwidth'],prefix='slotwidth_')
    slotheight=pd.get_dummies(data['slotheight'],prefix='slotheight_')
    slotvisibility=pd.get_dummies(data['slotvisibility'],prefix='slotvisibility_')
    slotformat=pd.get_dummies(data['slotformat'],prefix='slotformat_')
    slotprice=data['slotprice']
    creative=pd.get_dummies(data['creative'],prefix='creative_')
    keypage=pd.get_dummies(data['keypage'],prefix='keypage_')
    advertiser=pd.get_dummies(data['advertiser'],prefix='advertiser_')
    
    data_usertag=data.usertag.fillna('0')
    data_usertag=data_usertag.str.replace(',',' ')
    vect=CountVectorizer()
    data_usertag_vect=vect.fit_transform(data_usertag)
    usertag=pd.DataFrame(data_usertag_vect.toarray(),columns=vect.get_feature_names())
    
    newdata=pd.concat([weekday,hour,useragent,region,city,adexchange,\
                       slotwidth,slotheight,slotvisibility,slotformat,\
                       slotprice,creative,keypage,advertiser,usertag],axis=1)
    
    return newdata

In [None]:
train_processed=preprocessData(train)

In [None]:
validation_processed=preprocessData(validation)

In [None]:
test_processed=preprocessTestData(test)

In [None]:
train_features=list(train_processed)
validation_features=list(validation_processed)
test_features=list(test_processed)

In [None]:
city=pd.get_dummies(train['city'],prefix='city_')

In [None]:
creative=pd.get_dummies(train['creative'],prefix='creative_')

In [None]:
remove_features=['click', 'useragent__android_maxthon', 'useragent__linux_ie', 'creative__7324', 'useragent__other_firefox',\
                 'creative__7332', 'creative__7327', 'useragent__android_ie', 'useragent__mac_sogou', 'useragent__mac_maxthon']\
                +list(city)+list(creative)
feature=[]
for i in train_features:
    if(i not in remove_features):
        feature.append(i)

In [373]:
len(feature)

248

# Train Model

In [16]:
downsampling_rate=0.025

In [17]:
train_not_clicked = train_processed[train_processed.click == 0].sample(n= round(downsampling_rate*2429188), random_state = 2)
train_clicked = train_processed[train_processed.click == 1]
train_sample = pd.concat([train_clicked,train_not_clicked],0)

In [374]:
train_set=train_sample[feature]
train_label=train_sample['click']
validation_set=validation_processed[feature]
validation_label=validation_processed['click']
test_set=test_processed[feature]

### Logistic Regression

In [367]:
lr_model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, \
                              intercept_scaling=1, class_weight='balanced', random_state=None, solver='liblinear', \
                              max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)

In [375]:
lr_model.fit(train_set,train_label)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [376]:
lr_pred_ctr=[]
for a, b in lr_model.predict_proba(validation_set):
    lr_pred_ctr.append(b)
auc=roc_auc_score(validation_label, lr_pred_ctr)
print('AUC:',auc)

AUC: 0.845218796778


# Evaluation

In [25]:
def bid_linear(base_bid, p_ctr, avg_ctr):
    return (base_bid*p_ctr/avg_ctr)

In [382]:
def evaluate(bidprice):
    budget=6250
    impression=0
    click=0
    for row in range(0,validation.shape[0]):
        if budget<validation.payprice[row]:
            continue
        if bidprice.iloc[row,0]>validation.payprice[row]:
            budget=budget-validation.payprice[row]/1000
            impression=impression+1
            if validation.click[row]==1:
                click=click+1
    return click,(6250-budget),(click/impression),((6250-budget)/impression),((6250-budget)/click)

In [27]:
def pred_ctr_calibration(pred_ctr):
    return pred_ctr/(pred_ctr+(1-pred_ctr)/downsampling_rate)

In [28]:
number_of_click=len(train[train.click==1])
number_of_impression=len(train[train.bidprice>train.payprice])

### Logistic Regression Evaluation

In [377]:
avg_ctr=number_of_click/number_of_impression
predCTR=pd.DataFrame(lr_pred_ctr)
predCTR_calibrated=pred_ctr_calibration(predCTR)

In [383]:
for counter in np.arange(3,4,0.1):
    base_bid=counter
    bidprice=bid_linear(base_bid, predCTR_calibrated, avg_ctr)
    click,cost,CTR,CPM,CPC=evaluate(bidprice)
    print('basebid ', base_bid, ' click ', click, ' cost ', cost, ' CTR ', CTR, ' CPM ', CPM, ' CPC ', CPC)

basebid  3.0  click  155  cost  5331.392  CTR  0.0014015987268058017  CPM  0.0482094983181  CPC  34.3960774194
basebid  3.1  click  156  cost  5504.535  CTR  0.0013776415835813382  CPM  0.0486107456044  CPC  35.2854807692
basebid  3.2  click  158  cost  5678.198  CTR  0.0013635618306249084  CPM  0.0490036332882  CPC  35.9379620253
basebid  3.3  click  158  cost  5849.315  CTR  0.0013339412052749777  CPM  0.0493838120325  CPC  37.0209810127
basebid  3.4  click  160  cost  6014.338  CTR  0.0013233749369329132  CPM  0.0497451510715  CPC  37.5896125
basebid  3.5  click  161  cost  6156.859  CTR  0.0013058854065278048  CPM  0.0499388342742  CPC  38.2413602485
basebid  3.6  click  157  cost  6213.409  CTR  0.0012600927813538373  CPM  0.0498692473153  CPC  39.5758535032
basebid  3.7  click  153  cost  6231.104  CTR  0.001225146737346156  CPM  0.04989553422  CPC  40.7261699347
basebid  3.8  click  152  cost  6236.273  CTR  0.0012196883375326989  CPM  0.0500415095248  CPC  41.0281118421
basebid