In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [None]:


# Read the labelled splits. data/test.csv and data/Group_xx.csv are
# intentionally not loaded in this cell.
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/validation.csv")

print("train total clicks", train["click"].sum())
print("val total clicks", val["click"].sum())

# Keep the auction prices aside; they are needed for evaluation later on.
train_prices = train.loc[:, ["bidprice", "payprice"]]
val_prices = val.loc[:, ["bidprice", "payprice"]]

# Initialise the feature-matrix names; x_train / x_val are assigned once the
# feature columns have been selected.
x_train = x_val = None
x_train_features = x_val_features = None

In [None]:
# Feature columns used for both the training and the validation matrices.
# Left out for now: usertag, ip, urlid, bidprice and slotprice -- the integer
# bidprice and slotprice columns cause errors when converting to csr later on.
train_cols = [
    'weekday', 'hour', 'useragent', 'region', 'city',
    'adexchange', 'domain', 'url', 'slotid', 'slotwidth', 'slotheight',
    'slotvisibility', 'slotformat', 'creative', 'keypage',
    'advertiser',
]

# The one-hot-encoding column list currently holds exactly the same columns;
# it is kept as an independent list object.
hot_enc_cols = list(train_cols)

x_train, y_train = train[train_cols], train['click']
x_val, y_val = val[train_cols], val['click']

# Drop the references to the full frames; only the selected slices are kept.
train = val = None

x_val.head()

In [None]:
# Preview the first rows of the validation price frame (bidprice / payprice).
val_prices.head()

#### Save/Load Dataset

###### Saving train/val Data

In [None]:
import os
import pickle

save_folder = 'pickled_data/'

# Create the output folder on first use: open(..., 'wb') does not create
# missing directories and would otherwise raise FileNotFoundError.
os.makedirs(save_folder, exist_ok=True)

# (file name, object) pairs, written in this order.
# NOTE: the x_* files carry a "_csr" suffix, but x_train / x_val are the
# column-selected pandas DataFrames at this point in the notebook.
for file_name, payload in (
    ('x_train_csr.dat', x_train),
    ('x_val_csr.dat', x_val),
    ('y_train.dat', y_train),
    ('y_val.dat', y_val),
    ('train_prices.dat', train_prices),
    ('val_prices.dat', val_prices),
):
    with open(save_folder + file_name, 'wb') as outfile:
        pickle.dump(payload, outfile, pickle.HIGHEST_PROTOCOL)

###### Loading train/val Data

In [None]:
import pickle

save_folder = 'pickled_data/'


def _read_pickle(file_name):
    """Return the object stored in ``save_folder + file_name``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    only use this on files written by this project.
    """
    with open(save_folder + file_name, 'rb') as infile:
        return pickle.load(infile)


x_train = _read_pickle('x_train_csr.dat')
x_val = _read_pickle('x_val_csr.dat')
y_train = _read_pickle('y_train.dat')
y_val = _read_pickle('y_val.dat')
train_prices = _read_pickle('train_prices.dat')
val_prices = _read_pickle('val_prices.dat')

# NOTE(review): x_test_csr.dat is not written by the save cell in this
# notebook; it presumably comes from another notebook -- confirm it exists.
x_test = _read_pickle('x_test_csr.dat')

In [None]:
# faster iteration: work on a NumPy record array instead of a DataFrame.
# Convert only while val_prices is still a DataFrame so that this cell can be
# re-run safely -- the converted record array has no .to_records() method and
# an unguarded second call raises AttributeError.
if isinstance(val_prices, pd.DataFrame):
    val_prices = val_prices.to_records()

## Constant Bidding 

In [None]:
from tqdm import tqdm
from simulation import Simulation

In [None]:
# Instantiate the project-local Simulation helper; its run() and evaluate()
# methods are used by the bidding cells below.
sim = Simulation()

In [None]:
# Make sure val_prices is a record array before the simulations run.
# The conversion is skipped when it has already happened: a record array has
# no .to_records() method, so converting twice would raise AttributeError.
if isinstance(val_prices, pd.DataFrame):
    val_prices = val_prices.to_records()

In [None]:
# Candidate constant bids for the grid search: every integer from 227 to 300.
const_bid_range = [*range(227, 301)]
# Filled with the grid-search results further down.
res_ctr = list()

In [None]:
# Sanity check: validation features and labels, one shape per line.
print(x_val.shape, y_val.shape, sep="\n")

In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm

# Budget handed to sim.evaluate for every candidate bid.
budget = 6250
# highest 164 clicks and ctr 0.05396068108908448


def simulate(b):
    """Run the 'constant' bidding strategy with bid ``b`` on the validation set.

    Reads the notebook globals ``sim``, ``x_val``, ``val_prices``, ``budget``
    and ``y_val``.

    Returns a ``(b, score)`` tuple, where ``score`` is element 2 of the value
    returned by ``sim.evaluate`` (presumably the CTR -- confirm in simulation.py).
    """
    print("const bid:", b)
    bids = sim.run(x_val, 'constant', b, None, None, None, None)
    evaluation = sim.evaluate(val_prices, bids, budget, y_val)
    return b, evaluation[2]


# Evaluate every candidate bid using 10 parallel joblib jobs.
res_ctr = Parallel(n_jobs=10)(
    delayed(simulate)(bid) for bid in const_bid_range
)

In [None]:
# Inspect the (bid, score) tuples returned by the constant-bid grid search.
res_ctr

In [None]:
# Split the (bid, score) tuples into the two series used by the plot.
bids_range = [entry[0] for entry in res_ctr]
ctr = [entry[1] for entry in res_ctr]

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# A single trace: the score obtained for each constant bid value.
data = [
    go.Scatter(x=bids_range, y=ctr, mode='lines+markers'),
]

layout = go.Layout(
    xaxis={'title': 'Bid Value', 'showticklabels': True},
    yaxis={'title': 'CTR', 'showticklabels': True},
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Pair every candidate bid with the score measured for it.
# Each res_ctr entry is a (bid, score) tuple, so the score lives at index 1;
# index 0 is the bid itself and would only produce (bid, bid) pairs.
bid_ctr_res = [(bid, entry[1]) for bid, entry in zip(const_bid_range, res_ctr)]
print(bid_ctr_res)

In [None]:
# Summary statistics of the bid prices observed in the training data.
bid_col = train_prices['bidprice']
print("max", bid_col.max())
print("min", bid_col.min())
print("mean", bid_col.mean())

## Random Bidding

In [None]:
from simulation import Simulation

In [None]:
# Fresh Simulation instance for the random-bidding experiments
# (rebinds the `sim` created in the constant-bidding section).
sim = Simulation()

In [None]:
# Accumulator for the random-bidding runs: each appended element is the list
# of (low, high, score) tuples produced for one lower bound.
res = [] # ctr/clicks/cost results list over random bidding

In [None]:
# Show only the first few price records instead of dumping the whole array.
val_prices[:5]

In [None]:
budget = 6250
lower_bound, upper_bound = 227, 300

# Candidate bounds in steps of 5, starting at lower_bound; the lower and the
# upper end of the random range are drawn from the same grid.
lower_intervals = list(range(lower_bound, upper_bound + 1, 5))
upper_intervals = list(lower_intervals)


def simulate(low, high):
    """Run the 'random' bidding strategy with bounds ``(low, high)``.

    NOTE: this shadows the one-argument ``simulate`` defined in the
    constant-bidding section. Reads the notebook globals ``sim``, ``x_val``,
    ``val_prices``, ``budget`` and ``y_val``.

    Returns a ``(low, high, score)`` tuple, where ``score`` is element 2 of
    the value returned by ``sim.evaluate`` (presumably the CTR -- confirm in
    simulation.py).
    """
    print("low", low, "high", high)
    bids = sim.run(x_val, 'random', (low, high), None, None, None, None)
    evaluation = sim.evaluate(val_prices, bids, budget, y_val)
    return low, high, evaluation[2]

In [None]:
# Full grid: every (low, high) pair with low < high, iterating the upper
# bounds in the outer loop, evaluated with 10 parallel joblib jobs.
res_ctr = Parallel(n_jobs=10)(
    delayed(simulate)(low, high)
    for high in upper_intervals
    for low in lower_intervals
    if low < high
)

In [None]:
# Fix the lower bound and sweep only the upper bounds above it.
l = 275
res_ctr = Parallel(n_jobs=10)(
    delayed(simulate)(l, upper) for upper in upper_intervals if l < upper
)
# Keep this sweep as one series for the plot (appends on every run of the cell).
res.append(res_ctr)

In [None]:
res

In [None]:
# Each res_ctr entry here is a (low, high, score) tuple returned by the
# random-bidding simulate(): the score sits at index 2, while index 1 is the
# upper bound (taking r[1] as the score was a copy-paste slip from the
# two-element tuples of the constant-bidding section).
bids_range = [r[0] for r in res_ctr]  # lower bound of each run
ctr = [r[2] for r in res_ctr]

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# One trace per sweep stored in `res`. Every sweep is a list of
# (low, high, score) tuples: x is the upper bound, y the score, and the trace
# is labelled with the lower bound of its first entry.
data = [
    go.Scatter(
        x=[run[1] for run in sweep],
        y=[run[2] for run in sweep],
        mode='lines+markers',
        name=str(sweep[0][0]),
    )
    for sweep in res
]

layout = go.Layout(
    xaxis={'title': 'Upper Bound Bid Value', 'showticklabels': True},
    yaxis={'title': 'CTR', 'showticklabels': True},
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Inspect all random-bidding sweeps collected so far.
res

In [None]:
# Display the test features loaded from x_test_csr.dat.
x_test

In [None]:
# `test` is never defined in this notebook (its read_csv call is commented
# out); the test data that is actually loaded is `x_test`.
x_test.shape