In [23]:
import pandas as pd
import preprocessing as preprop
from sklearn import preprocessing, linear_model, svm, model_selection, metrics, ensemble
import matplotlib.pyplot as plt
import numpy as np
import timeit
import multiprocessing
import util
from collections import defaultdict

In [29]:
# NOTE: this notebook was run when the coursework was first released. A lot of
# preprocessing and feature engineering (including the 'per-advertiser' trick)
# has been introduced by other team members since then.
t0 = timeit.default_timer()
validation_data = pd.read_csv("./data/validation.csv")
# Only the first 500000 rows of the training file are read.
train_data = pd.read_csv("./data/train.csv", nrows=500000)
test_data = pd.read_csv("./data/test.csv")
print("Loading data took {:.3f}s".format(timeit.default_timer() - t0))
# Mean of the 'click' column over the loaded training rows.
avg_ctr = train_data['click'].mean()

Loading data took 7.873s


In [None]:
TARGET_COLS = ['payprice', 'bidprice', 'click']
# Columns that are never used as model features (targets plus the listed id/url-like columns).
EXCLUDED_COLS = TARGET_COLS + ['bidid', 'logtype', 'usertag', 'userid', 'urlid', 'slotid', 'IP', 'url', 'domain']
X_train = train_data[[x for x in train_data if x not in EXCLUDED_COLS]]

t0 = timeit.default_timer()

label_encoder = preprocessing.LabelEncoder
# One LabelEncoder per feature column, created lazily the first time a column name is looked up.
d = defaultdict(label_encoder)

# Fit the encoders on the training data only.
X_train = X_train.apply(lambda x: d[x.name].fit_transform(x))

print("Preprocessing training data took {:.3f}s".format(timeit.default_timer() - t0))


def _encode_with_fitted(column):
    """Encode `column` with the encoder already fitted on the training column of the same name.

    Labels that were not seen during training are mapped to -1 instead of
    raising, so the validation frame can always be encoded.
    """
    encoder = d[column.name]
    known = column.isin(encoder.classes_)
    encoded = pd.Series(-1, index=column.index, dtype='int64')
    encoded.loc[known] = encoder.transform(column[known])
    return encoded


t0 = timeit.default_timer()
X_val = validation_data[[x for x in validation_data if x not in EXCLUDED_COLS]]
# Reuse the training-fitted encoders: calling fit_transform here would re-fit them
# on the validation data, producing integer codes that do not match the ones used
# for X_train and overwriting the fitted encoders stored in `d`.
X_val = X_val.apply(_encode_with_fitted)

print("Preprocessing validation data took {:.3f}s".format(timeit.default_timer() - t0))
Y_train_click = train_data[TARGET_COLS[2]]
Y_val_click = validation_data[TARGET_COLS[2]]


In [None]:
# Hyper-parameter grids for the grid search, keyed by a human-readable model name.
# NOTE(review): several option names below ('n_iter', loss 'log', loss 'deviance',
# max_features 'auto') look tied to an older scikit-learn release - verify them
# against the installed version before running.
parameters = {
    'log regression': {
        'class_weight': ['balanced', None],
        'C': np.logspace(-8, 5, 20),
    },
    'SGD classifier': {
        'class_weight': ['balanced', None],
        'eta0': np.logspace(-8, -4, 10),
        'loss': ['log', 'perceptron', 'hinge'],
        'verbose': [0],
        'l1_ratio': np.linspace(0, 1, 11),
        'n_iter': [5, 6, 7, 8, 9, 10],
        'learning_rate': ['constant', 'optimal', 'invscaling'],
    },
    'Gradient Boosting': {
        'loss': ['exponential', 'deviance'],
        'learning_rate': np.logspace(-8, -1, 30),
        'verbose': [1],
        'max_features': ['auto', 'sqrt', 'log2', None],
        'max_depth': [3, 4],
        'n_estimators': [200, 300, 400],
    },
}

#WARNING - TRAINING ONE SINGLE ALGORITHM TAKES DAYS ON A 4-CORE MACBOOK PRO 2016. MIGHT WANT TO TURN OFF VERBOSE IN PARAMS.
def train_ctr_pred(sklearn_model, parameters, cv, scoring, X_train=X_train, Y_train_click=Y_train_click):
    """Grid-search `sklearn_model` over `parameters` and return the fitted search object.

    Parameters
    ----------
    sklearn_model : estimator instance handed to GridSearchCV.
    parameters : parameter grid for that estimator.
    cv : cross-validation setting forwarded to GridSearchCV.
    scoring : scoring setting forwarded to GridSearchCV.
    X_train, Y_train_click : features and click labels to fit on.
        NOTE: the defaults are bound when this cell is executed; re-run the
        cell after rebuilding the global X_train / Y_train_click.

    Returns
    -------
    The fitted GridSearchCV instance.
    """
    t0 = timeit.default_timer()

    model = model_selection.GridSearchCV(
        sklearn_model,
        parameters,
        cv=cv,  # previously hard-coded to 3, which silently ignored the argument
        verbose=True,
        n_jobs=multiprocessing.cpu_count(),
        pre_dispatch="1*n_jobs",
        scoring=scoring,
    )
    model.fit(X_train, Y_train_click)
    print("Training {} took {:.2f}s".format(sklearn_model, timeit.default_timer() - t0))

    return model
            
# Run the grid search for the gradient boosting classifier (3 folds, ROC AUC scoring).
model = train_ctr_pred(
    ensemble.GradientBoostingClassifier(),
    parameters['Gradient Boosting'],
    cv=3,
    scoring="roc_auc",
)
            
    


In [None]:
# Report the outcome of the grid search: best parameters plus the mean/spread
# of the cross-validated score for every parameter combination tried.
model_scores = [model.best_score_]

print("=============================================")
print('Gradient Boosting Classifier')
print("=============================================")
print("Best parameters set found on training set:")
print()
print(model.best_params_)
print()
print("Grid scores on training set:")
print()
cv_results = model.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    # The spread shown is twice the standard deviation of the fold scores.
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print("=============================================")
print("=============================================")
print('THE BEST SCORE IS ', model_scores)

In [None]:
# Predicted class probabilities for the training features; the value at index 1
# of each row is kept as the predicted CTR (presumably the click class - confirm
# against model.classes_).
probs = model.predict_proba(X_train)
pred_ctr = [proba[1] for proba in probs]
pred_ctr_df = pd.DataFrame({'predCTR': pred_ctr})
pred_ctr_df

In [None]:
# NOTE: this overwrites the avg_ctr computed from the 'click' column when the
# data was loaded; from here on avg_ctr is the mean *predicted* CTR.
avg_ctr = sum(pred_ctr) / len(pred_ctr)
# Training features with the predicted CTR appended as an extra column.
X_train_with_ctr = pd.concat([X_train, pred_ctr_df], axis=1)


In [81]:
def linear(base_bid,round_bids=True):
    """Build a linear bidding function: bid = base_bid * (ctr / avg_ctr).

    `avg_ctr` is read from the notebook's global namespace each time the
    returned function is called. The bid is rounded with np.rint only when
    `round_bids` is exactly True.
    """
    def raw_bid(ctr):
        return base_bid * (ctr / avg_ctr)

    if round_bids is not True:
        return raw_bid
    return lambda ctr: np.rint(raw_bid(ctr))
    
def ortb1(c,alpha,round_bids=True):
    """Build the ORTB1 bidding function: bid = sqrt((c / alpha) * ctr + c**2) - c.

    The bid is rounded with np.rint only when `round_bids` is exactly True.
    """
    def raw_bid(ctr):
        return np.sqrt((c / alpha) * ctr + c**2) - c

    if round_bids is not True:
        return raw_bid
    return lambda ctr: np.rint(raw_bid(ctr))
    
def ortb2(c,alpha,round_bids=True):
    """Build the ORTB2 bidding function.

    With s = ctr + sqrt(c**2 * alpha**2 + ctr**2) the bid is
    c * ((s / (c * alpha)) ** (1/3) - ((c * alpha) / s) ** (1/3)).
    The bid is rounded with np.rint only when `round_bids` is exactly True.
    """
    def raw_bid(ctr):
        first = np.power(((ctr + np.sqrt(c ** 2 * alpha ** 2 + ctr ** 2)) / (c * alpha)), 1 / 3)
        second = np.power((c * alpha) / (ctr + np.sqrt(c ** 2 * alpha ** 2 + ctr ** 2)), 1 / 3)
        return c * (first - second)

    if round_bids is not True:
        return raw_bid
    return lambda ctr: np.rint(raw_bid(ctr))

# Lookup table from strategy name to the factory that builds its bidding function.
strategies = dict(linear=linear, ortb1=ortb1, ortb2=ortb2)
                                         
def generate_bidprice(pred_ctr_df, strategy, round_bids=True):
    """Apply a bidding function to every predicted CTR.

    `strategy` is called once per value of the 'predCTR' column of
    `pred_ctr_df`. The result is a one-column DataFrame named 'bidprice'
    sharing the input's index. `round_bids` is accepted but not used here.
    """
    bids = pred_ctr_df['predCTR'].apply(strategy)
    return pd.DataFrame({'bidprice': bids})



In [None]:

def count_clicks(data,budget,verbose=False):
    """Replay the auctions in `data` in row order and tally the outcome.

    Every row must expose `payprice`, `bidprice` and `click`. An auction is
    won when `payprice < bidprice`; a won auction costs `payprice`. Bidding
    stops at the first won auction that the remaining budget cannot pay for,
    so the total paid never exceeds `budget` (previously the overshooting
    impression, and any click on it, was still counted).

    Parameters
    ----------
    data : DataFrame iterated with `itertuples()`.
    budget : total amount that may be spent, in the same unit as `payprice`.
    verbose : when truthy, print the amount paid when the budget runs out and
        the time the replay took.

    Returns
    -------
    dict with keys 'clicks', 'won', 'lost' and 'paid'.
    """
    t0 = timeit.default_timer()
    clicks = 0
    won = 0
    lost = 0
    paid = 0
    for row in data.itertuples():
        if row.payprice < row.bidprice:
            if row.payprice > budget:
                # Check affordability *before* buying so the budget is never exceeded.
                if verbose:
                    print('paid ', paid)
                break
            won += 1
            budget -= row.payprice
            paid += row.payprice
            if row.click == 1:
                clicks += 1
        else:
            lost += 1
    elapsed_time = timeit.default_timer() - t0
    if verbose:
        print("Counting clicks took {}s".format(elapsed_time))
    return {'clicks': clicks, 'won': won, 'lost': lost, 'paid': paid}

def optimise_strategy(data,strategy,budget,round_bids=True):
    """Search the strategy's parameter grid for the setting that wins the most clicks.

    Only the 'linear' strategy is searched: 25 base bids evenly spaced between
    1 and 5 are tried, and each one is evaluated with `count_clicks` on `data`
    extended by the generated 'bidprice' column.

    Parameters
    ----------
    data : DataFrame with the columns 'predCTR', 'payprice' and 'click'.
    strategy : key into the global `strategies` table.
    budget : budget handed to `count_clicks` for every candidate.
    round_bids : forwarded to the strategy factory (it used to be ignored).

    Returns
    -------
    dict with 'best_clicks', 'best_base_bid' and 'best_res' (the
    `count_clicks` result of the best candidate, or None when no candidate
    obtained a click).

    Raises
    ------
    NotImplementedError
        For any strategy other than 'linear'; previously such a call silently
        returned an all-zero result without evaluating anything.
    """
    t0 = timeit.default_timer()
    if strategy != 'linear':
        # Grids that were sketched here for both ORTB strategies but never searched:
        # C in np.linspace(40, 60, 5), alpha in np.linspace(1e-10, 1e-2, 50).
        raise NotImplementedError(
            "Parameter search is only implemented for the 'linear' strategy, got {!r}".format(strategy)
        )

    base_bid_range = np.linspace(1, 5, 25)
    best_clicks = 0
    best_base_bid = 0
    best_res = None
    for base_bid in base_bid_range:
        print('base bid is ', base_bid)
        bid_function = strategies[strategy](base_bid, round_bids)
        bidprice_col = generate_bidprice(data, bid_function, round_bids)

        # We only need 4 columns (predCTR, bidprice, payprice, click), hence the name.
        four_columns = pd.concat([data, bidprice_col], axis=1)
        res = count_clicks(four_columns, budget, verbose=True)
        print('Got {} clicks'.format(res['clicks']))

        if res['clicks'] > best_clicks:
            best_clicks = res['clicks']
            best_base_bid = base_bid
            best_res = res

    # Report the timing before returning; it used to sit after the return
    # statement and was therefore never executed.
    elapsed_time = timeit.default_timer() - t0
    print("Optimising {} took {}s".format(strategy, elapsed_time))
    return {'best_clicks': best_clicks, 'best_base_bid': best_base_bid, 'best_res': best_res}

# Assemble the columns the bidding simulation reads: predicted CTR, the price
# actually paid and the click label, aligned on the row index.
data = pd.concat([pred_ctr_df, train_data['payprice'], Y_train_click], axis=1)

res = optimise_strategy(data, strategy='linear', budget=6250)
print(res)

In [None]:
# Display the count_clicks tallies stored for the best base bid (None when no base bid obtained a click).
res['best_res']