In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import numpy as np

In [None]:
# Read the three splits in one pass; the order of the paths matches the
# order of the unpacked names (train, test, val).
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/validation.csv")
)

In [None]:

# group = pd.read_csv("data/Group_xx.csv")

# Number of positive (click) labels in each split.
print("train total clicks", train["click"].sum())
print("val total clicks", val["click"].sum())

# Bid/pay prices are kept separately; they are used later for evaluation.
train_prices = train.loc[:, ["bidprice", "payprice"]]
val_prices = val.loc[:, ["bidprice", "payprice"]]

# Placeholders for the feature matrices built further down.
x_train = x_val = x_train_features = x_val_features = None

In [None]:
# Smallest bid price in the training split.
train_prices["bidprice"].min()

In [None]:
# Raw columns used as model features.
# Excluded: usertag, ip, urlid, bidprice and slotprice (for now); the integer
# columns bidprice and slotprice cause errors when converting to CSR later on.
train_cols = ['weekday', 'hour', 'useragent', 'region', 'city',
              'adexchange', 'domain', 'url', 'slotid', 'slotwidth', 'slotheight',
              'slotvisibility', 'slotformat', 'creative', 'keypage',
              'advertiser']

# Every selected feature is one-hot encoded, so derive the list from
# train_cols instead of maintaining a second identical copy.
hot_enc_cols = list(train_cols)

# .copy() detaches the feature frames from the full frames: the cells below
# edit them, and editing a slice of another frame is what triggers pandas'
# SettingWithCopyWarning.
x_train = train[train_cols].copy()
y_train = train['click']

x_val = val[train_cols].copy()
y_val = val['click']

x_test = test[train_cols].copy()

# Drop the references to the full frames (presumably to release memory).
# Cells further down must not use train / val / test any more.
train = None
val = None
test = None

x_val.head()

In [None]:
# Preview the first rows of the validation bid/pay prices.
val_prices.head()

### Feature Creation / Loading Pre-processed Features

In [None]:
# Distinct useragent values present in the training features.
x_train['useragent'].unique()

In [None]:
# Replace missing keypage values with the explicit category "unknown".
# The column is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and is not guaranteed to reach the
# parent frame (it never does with pandas Copy-on-Write enabled).
x_train['keypage'] = x_train['keypage'].fillna("unknown")
x_val['keypage'] = x_val['keypage'].fillna("unknown")
x_test['keypage'] = x_test['keypage'].fillna("unknown")

print(x_train['keypage'].unique())
print(x_val['keypage'].unique())
print(x_test['keypage'].unique())

In [None]:
# Replace missing adexchange values with 0.
# The column is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and is not guaranteed to reach the
# parent frame (it never does with pandas Copy-on-Write enabled).
x_train['adexchange'] = x_train['adexchange'].fillna(0)
x_val['adexchange'] = x_val['adexchange'].fillna(0)
x_test['adexchange'] = x_test['adexchange'].fillna(0)

print(x_train['adexchange'].unique())
print(x_val['adexchange'].unique())
print(x_test['adexchange'].unique())


x_train['adexchange'].shape

In [None]:
# Replace missing domain values with the explicit category 'unknown'.
# Assign the result back instead of using a chained fillna(inplace=True),
# which is not guaranteed to update the parent frame.
x_train['domain'] = x_train['domain'].fillna('unknown')
x_val['domain'] = x_val['domain'].fillna('unknown')
x_test['domain'] = x_test['domain'].fillna('unknown')

In [None]:
# Replace missing url values with the explicit category 'unknown'.
# Assign the result back instead of using a chained fillna(inplace=True),
# which is not guaranteed to update the parent frame.
x_train['url'] = x_train['url'].fillna('unknown')
x_val['url'] = x_val['url'].fillna('unknown')
x_test['url'] = x_test['url'].fillna('unknown')

In [None]:
# Row counts per split; train_objs_num is used below to cut the encoded
# matrix back into its parts.
train_objs_num, val_objs_num, test_objs_num = len(x_train), len(x_val), len(x_test)

# Encode train and validation together so both share one dummy-column layout.
# NOTE(review): x_test is not part of this concatenation, so it is not encoded here.
dataset = pd.concat([x_train, x_val], axis=0)
dataset[hot_enc_cols] = dataset[hot_enc_cols].astype(str)
dataset_preprocessed = pd.get_dummies(
    dataset,
    columns=hot_enc_cols,
    sparse=True,
)

In [None]:
from scipy import sparse
def sparse_df_to_csr(df):
    """Convert a sparse pandas frame into a SciPy CSR matrix.

    Parameters
    ----------
    df : object
        Either an object exposing ``to_coo()`` directly (the legacy pandas
        ``SparseDataFrame``) or a ``DataFrame`` whose columns are all sparse,
        which exposes it as ``df.sparse.to_coo()``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The same data in CSR format.
    """
    # Prefer the direct method when it exists so legacy frames behave exactly
    # as before; fall back to the .sparse accessor otherwise.
    to_coo = getattr(df, "to_coo", None)
    if to_coo is None:
        to_coo = df.sparse.to_coo
    return sparse.csr_matrix(to_coo())

In [None]:
# Replace the one-hot encoded frame with its CSR matrix form (same name is reused).
dataset_preprocessed = sparse_df_to_csr(dataset_preprocessed)

In [None]:
# Rows were concatenated as [train, val]; cut them apart again by position.
# The validation slice is bounded explicitly so it stays correct if further
# rows are ever appended to the concatenated dataset.
x_train = dataset_preprocessed[:train_objs_num]
x_val = dataset_preprocessed[train_objs_num:train_objs_num + val_objs_num]
# Only valid once x_test is part of the concatenated dataset (no "-1": the
# test rows start right after the validation rows):
#x_test = dataset_preprocessed[train_objs_num+val_objs_num:]

In [None]:
# Shapes of the validation labels and the validation feature matrix.
print(y_val.shape)
print(x_val.shape)

#### Save/Load Dataset

###### Saving train/val Data

In [None]:
import os
import pickle

save_folder='pickled_data/'
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs(save_folder, exist_ok=True)


def _dump_pickle(obj, filename):
    """Pickle ``obj`` to ``save_folder + filename`` with the highest protocol."""
    with open(save_folder + filename, 'wb') as outfile:
        pickle.dump(obj, outfile, pickle.HIGHEST_PROTOCOL)


_dump_pickle(x_train, 'x_train_csr.dat')
_dump_pickle(x_val, 'x_val_csr.dat')
_dump_pickle(y_train, 'y_train.dat')
_dump_pickle(y_val, 'y_val.dat')
_dump_pickle(train_prices, 'train_prices.dat')
_dump_pickle(val_prices, 'val_prices.dat')

# The encoded test matrix is not produced by the cells above, so it is not saved:
#_dump_pickle(x_test, 'x_test_csr.dat')

###### Loading train/val Data

In [None]:
import os
import pickle

save_folder='pickled_data/'


def _load_pickle(filename):
    """Unpickle and return the object stored in ``save_folder + filename``."""
    # pickle.load can execute arbitrary code: only load files written by this
    # notebook, never files from an untrusted source.
    with open(save_folder + filename, 'rb') as infile:
        return pickle.load(infile)


x_train = _load_pickle('x_train_csr.dat')
x_val = _load_pickle('x_val_csr.dat')
y_train = _load_pickle('y_train.dat')
y_val = _load_pickle('y_val.dat')
train_prices = _load_pickle('train_prices.dat')
val_prices = _load_pickle('val_prices.dat')

# The saving cell has its x_test dump commented out, so this file only exists
# if it was produced some other way; do not abort the whole load without it.
if os.path.exists(save_folder + 'x_test_csr.dat'):
    x_test = _load_pickle('x_test_csr.dat')
else:
    print("x_test_csr.dat not found in", save_folder, "- x_test was not loaded")

In [None]:
# Number of distinct domain values.
# NOTE(review): by this point x_train has been replaced by a CSR matrix (both
# in the split cell and in the loading cell), which cannot be indexed by
# column name — this only works on the pre-encoding DataFrame.
len(x_train["domain"].unique())

In [None]:
# Shapes of the validation features and labels.
print(x_val.shape, y_val.shape)

#### Class Imbalance

In [None]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
# Resample the training data with imblearn's RandomUnderSampler (fixed seed).
# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample — confirm against the installed version.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(x_train, y_train)
# Per-class label counts after resampling.
print(sorted(Counter(y_resampled).items()))

In [None]:
from imblearn.combine import SMOTEENN
# Resample the training data with imblearn's SMOTEENN (fixed seed).
# This overwrites X_resampled / y_resampled from the under-sampling cell, and
# relies on Counter having been imported there.
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_sample(x_train, y_train)
# Per-class label counts after resampling.
print(sorted(Counter(y_resampled).items()))

#### CTR Estimation 

###### Logistic Regression Classifier

In [None]:
# Logistic regression with balanced class weights, trained on whichever
# resampled training set was produced last.
lr = linear_model.LogisticRegression(class_weight="balanced", verbose=1)
lr.fit(X_resampled, y_resampled)

In [None]:
# Shape of the validation label vector.
y_val.shape

In [None]:
# Hard class predictions of the logistic regression on the validation set.
y_val_pred = lr.predict(x_val)
# accuracy_score is already imported in the import cell; re-imported here.
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
from sklearn.metrics import mean_squared_error
# mean_squared_error returns the MSE; take the square root so the printed
# value actually matches its "rmse" label.
print("rmse:", np.sqrt(mean_squared_error(y_val, y_val_pred)))

In [None]:
from sklearn import metrics
# The ROC curve must be built from continuous scores. Hard 0/1 labels from
# predict() give a single operating point, so the resulting "AUC" is not the
# ranking quality of the model. Use the predicted probability of the
# positive class (second column of predict_proba).
y_val_score = lr.predict_proba(x_val)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_score)
print("auc:", metrics.auc(fpr, tpr))

In [None]:
# Number of positive labels versus total rows in the validation labels.
print("nr clicks:", y_val.sum(), "/", y_val.shape[0])

In [None]:
# Total number of validation rows.
print("nr of impressions:", y_val.shape[0])

In [None]:
from sklearn.metrics import precision_score
# Precision of the hard predictions currently stored in y_val_pred.
print("precision:", precision_score(y_val, y_val_pred))                              

In [None]:
from sklearn.metrics import recall_score
# Recall of the hard predictions currently stored in y_val_pred.
print("recall:", recall_score(y_val, y_val_pred))  

###### Bayes Regression

In [None]:
# NOTE(review): VBLinearRegression / EBLinearRegression are imported but not
# used in this cell.
from skbayes.linear_models import VBLinearRegression, EBLinearRegression
from sklearn.linear_model import BayesianRidge

# Fit a Bayesian ridge regressor on the (non-resampled) training data.
# NOTE(review): x_train is a sparse CSR matrix here — confirm that the
# installed scikit-learn's BayesianRidge accepts sparse input.
clf = BayesianRidge()
clf.fit(x_train, y_train)

In [None]:
# Overwrites y_val_pred (previously the logistic-regression labels) with the
# Bayesian ridge predictions.
y_val_pred = clf.predict(x_val)

In [None]:
# Mean squared error of the predictions currently in y_val_pred, two decimals.
print("Mean squared error: %.2f"
      % mean_squared_error(y_val, y_val_pred))

### Bidding Strategies

###### ORTB

In [None]:
# Project-local bidding simulator; sim.run / sim.evaluate are used by the
# strategy cells below.
from simulation import Simulation
sim = Simulation()

In [None]:
# faster iteration
val_prices=val_prices.to_records()

In [None]:
# Class probabilities from the logistic regression for every validation row.
p_ctr=lr.predict_proba(x_val)

In [None]:
import numpy as np
# predict_proba orders its columns by class label, so column 0 is the
# probability of "no click" and column 1 the probability of a click.
# The average predicted CTR is therefore the mean of column 1.
avg_ctr = np.mean(p_ctr, axis=0)[1]
avg_ctr

In [None]:
# Start with an empty list of experiment results.
res=[]

In [None]:
# Shape of the predicted-probability array.
p_ctr.shape

In [None]:
from joblib import Parallel, delayed

budget = 6250
# Share of clicks among the training rows, expressed in percent.
avg_ctr = (float(y_train.sum()) / float(x_train.shape[0])) * 100.0
alpha = budget / val_prices.shape[0]

# Parameter grids for the sweep.
alphas_range = list(np.arange(0.00005, 0.00050, 0.00002))
const_range = [*range(150, 600, 20)]

p_ctr = lr.predict_proba(x_val)
res = []  # experiment results


def simulate(a, c):
    """Run one 'ortb' simulation and evaluate it on the validation prices.

    a and c are echoed as "alpha" and "const" and forwarded to sim.run.
    Returns a tuple (third element of the evaluation result, a).
    """
    print("alpha:", a, "const:", c)
    bids = sim.run(None, 'ortb', None, p_ctr, avg_ctr, a, c)
    outcome = sim.evaluate(val_prices, bids, budget, y_val)
    return outcome[2], a

In [None]:
# For each const value, evaluate every alpha in alphas_range in parallel
# (10 workers) and store the runs as (const, [results...]) in res.
# Re-running this cell appends the same sweeps again, since res is not reset here.
for con in [100, 125, 150, 175, 200, 225, 250, 300]:
    const_res = Parallel(n_jobs=10)(delayed(simulate)(a, con) for a in alphas_range)  
    res.append((con, const_res))

In [None]:
# Column-wise minimum of the predicted probabilities.
p_ctr.min(axis=0)

In [None]:
# Display the collected experiment results.
res

In [None]:
# res is a list of (const, [(metric, alpha), ...]) tuples, so each entry has
# only two elements: indexing [2] raised IndexError. Take the best metric of
# each sweep, then the best overall. -inf is the result when nothing was run.
highest_ctr = [
    max((run[0] for run in const_res), default=float('-inf'))
    for _, const_res in res
]
print(max(highest_ctr, default=float('-inf')))

In [None]:
# Single simulation run with a=0.0001 and c=120.
simulate(0.0001, 120)

In [None]:
# Display the collected experiment results.
res

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# One trace per selected sweep (second, sixth and last entry of res).
# Within a sweep every run is (metric, alpha): alpha goes on x, metric on y.
data = [
    go.Scatter(
        x=[point[1] for point in series[1]],
        y=[point[0] for point in series[1]],
        mode='lines+markers',
        name=str(series[0]),
    )
    for series in (res[1], res[5], res[-1])
]

layout = go.Layout(
    xaxis={
        'title': 'Alpha',
        'showticklabels': True,
        'range': [0.00009, 0.00046],
        'dtick': 0.00005,
        'tickformat': ".5f",
    },
    yaxis={
        'title': 'CTR',
        'showticklabels': True,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

###### ORTB 2

In [None]:
# Discard the results of the previous strategy before the next sweep.
res=[] # experiment results

In [None]:
from joblib import Parallel, delayed

budget=6250
# Share of clicks among the training rows, expressed in percent.
avg_ctr=100.0*(float(y_train.sum())/float(x_train.shape[0]))
# The first alpha assignment is immediately overwritten by the constant below.
alpha=budget/val_prices.shape[0]
alpha=0.00040

# Parameter grids for the sweep.
alphas_range = list(np.arange(0.00010, 0.00050, 0.00005))
const_range = list(range(100, 500, 50))

p_ctr=lr.predict_proba(x_val)

# Redefines simulate() from the ORTB section; the only difference is the
# strategy name passed to sim.run ('ortb2').
def simulate(a, c):
    """Run one 'ortb2' simulation and return (evaluation result[2], a)."""
    print("alpha:", a, "const:", c)
    sim_bids=sim.run(None, 'ortb2', None, p_ctr, avg_ctr, a, c)
    expr_result=sim.evaluate(val_prices, sim_bids, budget, y_val)
    return (expr_result[2], a)
    #return (a, c, expr_result)

In [None]:
# Evaluate every alpha in alphas_range in parallel (10 workers) with c=300
# and store the sweep as a single (300, [results...]) entry.
const_res = Parallel(n_jobs=10)(delayed(simulate)(a, 300) for a in alphas_range)  
res.append((300, const_res))

In [None]:
# Display the collected experiment results.
res

In [None]:
# Sum of the validation labels.
y_val.sum()

In [None]:
# Column-wise mean of the predicted probabilities.
p_ctr.mean(axis=0)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# res was reset for this strategy and only holds the sweeps appended since
# then, so fixed positions such as res[4] raise IndexError. Plot every sweep
# that is available instead.
# Within a sweep every run is (metric, alpha): alpha goes on x, metric on y.
data = [
    go.Scatter(
        x=[point[1] for point in series[1]],
        y=[point[0] for point in series[1]],
        mode='lines+markers',
        name=str(series[0]),
    )
    for series in res
]

layout = go.Layout(
    xaxis=dict(
        title='Alpha',
        showticklabels=True,
        range=[0.00009, 0.00046],
        dtick=0.00005,
        tickformat=".5f"
    ),
    yaxis=dict(
        title='CTR',
        showticklabels=True
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Single simulation run with a=0.00001 and c=150.
simulate(0.00001, 150)

In [None]:
# Display the collected experiment results.
res

###### ORTB Modified

In [None]:
from joblib import Parallel, delayed

budget = 6250
# Share of clicks among the training rows, expressed in percent.
avg_ctr = (float(y_train.sum()) / float(x_train.shape[0])) * 100.0
alpha = budget / val_prices.shape[0]

# Parameter grids for the sweep.
alphas_range = list(np.arange(0.00010, 0.00050, 0.00005))
const_range = [*range(100, 500, 50)]

p_ctr = lr.predict_proba(x_val)


def simulate(a, c):
    """Run one 'ortb-modified' simulation on x_val and evaluate it.

    Prints the parameters, the number of bids and the validation sizes, then
    returns a tuple (third element of the evaluation result, a).
    """
    print("alpha:", a, "const:", c)
    bids = sim.run(x_val, 'ortb-modified', 200, p_ctr, avg_ctr, a, c)
    print(len(bids))
    print(y_val.shape[0], val_prices.shape[0])
    outcome = sim.evaluate(val_prices, bids, budget, y_val)
    return outcome[2], a
    # return (a, c, outcome)

In [None]:
# Single simulation run with a=0.0001 and c=150.
simulate(0.0001, 150)

In [None]:
# Sum of the validation labels.
y_val.sum()

##### Submission

In [None]:
# Overwrites p_ctr with the class probabilities for the test set.
# NOTE(review): x_test must be the encoded matrix loaded from
# x_test_csr.dat; the raw x_test DataFrame is never one-hot encoded in this notebook.
p_ctr=lr.predict_proba(x_test)

In [None]:
def submit(a, c):
    """Return the bids produced by sim.run for the 'ortb' strategy.

    Uses the module-level p_ctr and passes None for the first, third and
    fifth arguments of sim.run; a and c are forwarded unchanged.
    """
    return sim.run(None, 'ortb', None, p_ctr, None, a, c)

In [None]:
# Bids for the test set with a=0.00001 and c=105.
test_bids = submit(0.00001, 105)

In [None]:
# Number of generated bids.
len(test_bids)

In [None]:
# Display the test feature object.
x_test

In [None]:
# `test` was set to None after the feature split, so test.shape raises
# AttributeError; inspect the test feature matrix instead.
x_test.shape

In [None]:
# write submission to file
# `test` was set to None after the feature split, so the bid ids are re-read
# from the test CSV (only the needed column) rather than taken from test["bidid"].
submission = pd.read_csv("data/test.csv", usecols=["bidid"])
submission["bidprice"] = np.asarray(test_bids)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Number of generated bids.
len(test_bids)

In [None]:
# Shape of the submission frame.
submission.shape

In [None]:
# `test` was set to None after the feature split; the bid ids now live in
# the submission frame.
submission["bidid"].shape

In [None]:
# Write the submission as comma-separated values without the index column.
submission.to_csv("data/submission.csv", sep=',', index=False)

In [None]:
# train / val / test were set to None after the feature split, so calling
# .shape on them raises AttributeError. Report the derived feature matrices
# and label vectors only.
print(x_val.shape, y_val.shape)
print(x_train.shape, y_train.shape)
print(x_test.shape)