In [1]:
import numpy as np
import pandas as pd
#import xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time
import collections
import itertools as it
from sklearn import ensemble, linear_model
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)


In [2]:
# Load validation data
val = pd.read_csv('validation.csv')

In [3]:
# Load CTR estimators
CTR_models_val = pd.DataFrame(np.vstack([np.loadtxt('CTR_models/logr/r1_sams_logr_6.csv'),
                                        np.loadtxt('CTR_models/gbm_dam/pCTR_gbm_dam_val.csv'), 
                                        pd.read_csv('CTR_models/gbm2/pCTR_gbm2_val.csv', index_col=0).values.flatten(), 
                                        np.loadtxt('ensemble/ensemble_base_learners_set2/val/r7_mlp_nn_14.csv')]).T,
                            columns=['log_reg', 'Damian', 'Stefan2', 'NN'])

# Build ensemble
weighted_CTR_val = CTR_models_val[['Damian', 'Stefan2', 'NN']].copy()

w = np.atleast_2d([1, 6, 5]) #np.array([x1, x2, x3])
weighted_CTR_val = np.log(weighted_CTR_val/(1 + weighted_CTR_val))
weighted_CTR_val =  np.dot(weighted_CTR_val, w.T)
weighted_CTR_val /= np.sum(w) # normalize weights
weighted_CTR_val = 1 / (1 + np.exp(-weighted_CTR_val)) # take logistic / sigmoid
weighted_CTR_val = weighted_CTR_val.flatten()

CTR_models_val['ensemble1'] = weighted_CTR_val
CTR_models_val.head()



Unnamed: 0,log_reg,Damian,Stefan2,NN,ensemble1
0,0.000533,7.8e-05,0.000291,0.000155,0.0002
1,0.000205,9.3e-05,0.000211,1.7e-05,6.9e-05
2,0.000139,6.3e-05,0.000351,1.1e-05,7.2e-05
3,0.00021,7.5e-05,0.000274,2.5e-05,9.1e-05
4,0.000306,9.2e-05,0.000335,7e-06,6.1e-05


### Load bids

In [4]:
bid_look_up = pd.read_csv('proper_datasets/bid_look_up.csv', index_col=0)

In [7]:
# Load the auction function
# FUNCTION:

def simulate_auction_ORTB(t_bids, t_payprice, t_clicks, agents, t_budget, spending_info, verbose=0):
    
    nrand = t_payprice.shape[0]
    rand_helper_vals = np.random.random(size=(nrand,agents)) # to efficiently split ties randomly

    def calc_things(x, t_payprice):
        exc_price = np.subtract(x, t_payprice)
        return exc_price

    def element_compare(first_price, second_price, payprice):
        if (second_price == 0) & (first_price != 0):
            second_price = payprice        
        return second_price

    while True:

        # define valid bids (bids times a boolean of whether a bid is higher than the payprice (otherwise instant no-win))
        valid_bids = t_bids * np.any([(t_bids > t_payprice[:, None])], axis=0)
        # Find second highest prices

        sorted_prices = np.atleast_1d([np.sort(valid_bids[i,:])[::-1] for i in range(valid_bids.shape[0])]) 
        second_prices = sorted_prices[:,1]
        first_prices = sorted_prices[:,0]
        second_prices = np.vectorize(element_compare)(first_prices, second_prices, t_payprice)

        # Find highest bids (win prices) -> win goes to the first column among winners in case of ties (np.where returns duplicates!)
        win_cols = np.argmax(valid_bids+rand_helper_vals, axis=1)
        
        # Compute costs given win (second price is paid)
        valid_costs = np.zeros(shape=t_bids.shape) #tt_bids.copy()
        valid_costs[range(t_bids.shape[0]),win_cols] = second_prices
        
        costs_sum = np.cumsum(valid_costs, axis=0)
        bool_entry = (costs_sum >= t_budget)
        bool_sum = np.sum(bool_entry)

        if bool_sum > 0:

            t_bids[bool_entry] = 0 
            if verbose >= 1:
                print('t_bids was updated, %i bid removals' % bool_sum)

        else:
            break

    clicks = (valid_costs>0) * t_clicks[:, None]
            
    if verbose >= 1:
        print('DONE')
        
        print('clicks:')
        print(np.sum(clicks, axis=0)) # clicks
        print(np.mean(np.sum(clicks, axis=0))) # avg clicks per agent
        
        print('OUR MODEL')
        print(np.sum(clicks, axis=0)[-1])

        print('wins:')
        #print(valid_costs[:2,:])
        print(np.sum((valid_costs>0), axis=0)) # wins
        print(np.mean(np.sum((valid_costs>0), axis=0))) # avg wins per agent
        print('OUR MODEL')
        print(np.sum((valid_costs>0), axis=0)[-1]) # wins
        
        print('cost for OUR MODEL')
        print(np.sum((valid_costs), axis=0)[-1])
        
        print(time.time()-start)

        
        
    if spending_info:
        spendings = np.sum(valid_costs, axis=0)
        res = [agents, np.mean(np.sum(clicks, axis=0)), 
               np.mean(np.sum((valid_costs>0), axis=0)),
               spendings] 
        
    else:    
        res = np.sum(clicks, axis=0)[-1], np.sum((valid_costs>0), axis=0)[-1], np.sum(valid_costs, axis=0)[-1]
    
    return res

In [8]:
def build_custom_ortb2(cc, lamb, pCTR):
    
    return np.round(cc * (
                      (pCTR + np.sqrt(cc**2*lamb**2+pCTR**2)/(cc*lamb))**(1/3) 
                      - (cc*lamb/(pCTR + np.sqrt(cc**2*lamb**2+pCTR**2)))**(1/3)
                     ))

In [143]:
def build_time_preference(nsamp):
    
    passive_time = pd.DataFrame(np.zeros(shape=(nsamp, 7)), columns=['t1','t2','t3','t4','t5', 't6', 't7'])
    

    passive_time.iloc[:np.int(nsamp*0.5), 0] = 0.5
    
    
    
    passive_time.iloc[:np.int(nsamp*0.75), 1] = 0.5
    passive_time.iloc[:np.int(nsamp*0.5), 2] = 0.75
    
    passive_time.iloc[:np.int(nsamp*0.75), 3] = 0.75
    
    passive_time.iloc[:np.int(nsamp*0.5), 4] = 0.5
    passive_time.iloc[np.int(nsamp*0.5):np.int(nsamp*0.75), 4] = 0.75
    
    passive_time.iloc[:np.int(nsamp*0.85), 5] = 0.5
    
    passive_time.iloc[:np.int(nsamp*0.85), 6] = 0.25
        
    
    time_factor = pd.DataFrame() 
    
    for col in passive_time.columns:
        if col == 't1':
            start = 0.5
            end = 1
            current = 0.5
            time_factor[col + '_step1'] = add_increments(passive_time['t1'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t1'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t1'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t1'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t1'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t1'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t1'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't2':
            start = 0.75
            end = 1
            current = 0.5
            time_factor[col + '_step1'] = add_increments(passive_time['t2'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t2'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t2'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t2'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t2'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t2'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t2'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't3':
            start = 0.5
            end = 1
            current = 0.75
            time_factor[col + '_step1'] = add_increments(passive_time['t3'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t3'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t3'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t3'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t3'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t3'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t3'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't4':
            start = 0.75
            end = 1
            current = 0.75
            time_factor[col + '_step1'] = add_increments(passive_time['t4'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t4'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t4'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t4'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t4'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t4'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t4'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't5':
            start = 0.75
            end = 1
            current = 0.75
            time_factor[col + '_step1'] = add_increments(passive_time['t5'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t5'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t5'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t5'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t5'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t5'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t5'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't6':
            start = 0.85
            end = 1
            current = 0.5
            time_factor[col + '_step1'] = add_increments(passive_time['t6'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t6'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t6'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t6'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t6'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t6'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t6'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
        elif col == 't7':
            start = 0.85
            end = 1
            current = 0.25
            time_factor[col + '_step1'] = add_increments(passive_time['t7'], splits=5, addition=0.2, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step2'] = add_increments(passive_time['t7'], splits=5, addition=0.4, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step3'] = add_increments(passive_time['t7'], splits=10, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step4'] = add_increments(passive_time['t7'], splits=20, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step5'] = add_increments(passive_time['t7'], splits=2, addition=0.5, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step6'] = add_increments(passive_time['t7'], splits=2, addition=0.25, nsamp=nsamp, start=start, end=end, current=current)    
            time_factor[col + '_step7'] = add_increments(passive_time['t7'], splits=5, addition=0.1, nsamp=nsamp, start=start, end=end, current=current)    
               
    time_factor['regular'] = np.ones(nsamp)
        
    return time_factor

In [137]:
def add_increments(factors, splits, addition, nsamp, start, end, current):
    
    split_ids = np.linspace(start*nsamp, end*nsamp, splits+1, dtype=np.int)
    
    for i in range(len(split_ids)-1):
        
        factors.iloc[split_ids[i]:split_ids[i+1]] = current + addition 
        current += addition
        
    return factors

In [127]:
time_fact = build_time_preference(nsamp=CTR_models_val['ensemble1'].shape[0])
time_fact.mul(CTR_models_val['ensemble1'], axis=0)

Unnamed: 0,t1_step1,t1_step2,t1_step3,t1_step4,t1_step5,t1_step6,t1_step7,t2_step1,t2_step2,t2_step3,...,t5_step5,t5_step6,t5_step7,t6_step1,t6_step2,t6_step3,t6_step4,t6_step5,t6_step6,t6_step7
0,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,...,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100,0.000100
1,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,...,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,0.000035
2,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,...,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036
3,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,...,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046,0.000046
4,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,...,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030
5,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,...,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153,0.000153
6,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,...,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239,0.000239
7,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,...,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314
8,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,...,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166,0.000166
9,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,...,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041,0.000041


In [129]:
def build_fitted_bids(pCTR, cc, lamb, bid_type):

    if bid_type == 'ortb2':
        final_bids = cc * (
                          (pCTR + np.sqrt(cc**2*lamb**2+pCTR**2)/(cc*lamb))**(1/3) 
                          - (cc*lamb/(pCTR + np.sqrt(cc**2*lamb**2+pCTR**2)))**(1/3)
                         ) 

    fitted_bids = np.round(final_bids)
    
    return fitted_bids

In [145]:
## Run ensemble1 for 3-4 parameter combinations and search for best time settings
# ===================== BOOTSTRAPPING FOR FITTING ORTB2 ===================== #

start = time.time()
res = []
spending_info = False
t_payprice = np.array(val['payprice'])
t_clicks = np.array(val['click'])

res_bt_ids = []
res_bt_click = []
res_bt_win = []
res_bt_cost = []
res_bt = []

pCTR = CTR_models_val['ensemble1']

base_sub = [0,1,2,3,4,10,11,12,13,14,15,16,17,18,19] # [5:9] are rev_lin_bid and not desired here
bids_subset = bid_look_up.iloc[:,base_sub] 
bids_subset['extra_id0'] = bids_subset.iloc[:,0]
bids_subset['extra_id1'] = bids_subset.iloc[:,1]
bids_subset['extra_id2'] = bids_subset.iloc[:,2]
bids_subset['extra_id3'] = bids_subset.iloc[:,3]
bids_subset['extra_id4'] = bids_subset.iloc[:,4]

nrand = t_payprice.shape[0]
nsims = 50
nsamp = bid_look_up.shape[0]
nstrats = 19 # we take 19 rather than 14, as we have added 5x "extra_id" to align probability of selecting linear/non-linear

fitted_bids = build_fitted_bids(pCTR, cc=20, lamb=1.000000e-08, bid_type='ortb2')
time_adj_bids = build_time_preference(nsamp=nsamp)
time_adj_bids = time_adj_bids.mul(pd.Series(fitted_bids), axis=0)

for isim in range(nsims):
    for agents in [30]: 
        
        strategy_ids = np.random.randint(0, nstrats+1, agents-1)
        resamp_ids = np.random.randint(0, nsamp, nsamp)

        t_bids = bids_subset.iloc[resamp_ids,strategy_ids]
        t_bids['custom'] = np.zeros(nsamp)
        t_bids = np.array(t_bids)

        budget = 6250 * 1000
        
        # tune cc and/or lambda:
        for col in time_adj_bids.columns:
            
            t_bids[:,-1] = time_adj_bids.loc[resamp_ids,col]     
            out = simulate_auction_ORTB(t_bids, t_payprice, t_clicks, agents, budget, spending_info, verbose=0)
            res_bt.append([isim, col, out])
            
        print('===== Simulation %i done, runtime so far: %.f ======' % (isim, time.time()-start))
    
print(time.time()-start) 

pd.DataFrame(res_bt).to_csv('output_simulation/ortb_multi/r3_time_acer_ens1_params1.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

10582.422132253647
