# Web Economics

## Bidding Strategy
#### Gerard Cardoso

Date: 11th April 2017

## Overview
The purpose of this project is to bid for stuff.

In [1]:
#Step1: Importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns #seaborn makes plots cleaner and extends functionality
import itertools
pd.options.mode.chained_assignment = None  # default='warn'

#Import additional required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

pd.options.mode.chained_assignment = None  # default='warn'

#magic to show the plots within the notebook
%matplotlib inline

import time
import random
import re

# Data PreProcessing

In [37]:
#Step 2: Importing the dataset
df_train = pd.read_csv("dataset/train.csv", low_memory=False)
df_val = pd.read_csv("dataset/validation.csv", low_memory=False)
df_test = pd.read_csv("dataset/test.csv", low_memory=False)

#remove error data
df_train = df_train[df_train.bidprice >= df_train.payprice]

In [3]:
#df_train.info()

### Scoring Function

In [3]:
def calc_score(n_impressions, n_clicks, n_rows_in_budget, cash_in_bank):
    #alphas = [0.8, 0.15, 0, 0.05, 0.005, 0.005]
    alphas = [1, 0, 0, 0, 0, 0]
    normalizers = [1, 100, 1, glob_cash_in_bank, 1/500, 1/100]
    alphas = [x/y for x, y in zip(alphas, normalizers)]
    
    if n_impressions > 0:
        ctr = n_clicks/n_impressions
    else:
        ctr = 0
    conversions = n_clicks
    cvr = 0
    spend = glob_cash_in_bank - cash_in_bank
    
    if n_impressions > 0:
        avg_cpm = spend/(n_impressions*1000)
    else:
        avg_cpm = 1
    if n_clicks > 0:
        avg_cpc = spend/n_clicks
    else:
        avg_cpc = 1
    
    score_components = [
                        alphas[0]*ctr,
                        alphas[1]*conversions,
                        alphas[2]*cvr,
                        -1*alphas[3]*(spend),
                        -1*alphas[4]*(avg_cpm), 
                        -1*alphas[5]*(avg_cpc)
                       ]
    score = sum(score_components)
    return score, score_components

### Constant Bidding Strategy

In [4]:
#Global Variables
glob_cash_in_bank = 6250000 #6250
glob_results = dict()
columns = ['strategy_name', 'strategy_params', 'strategy_impressions', 'strategy_clicks', 'n_rows_in_budget', 'cash_in_bank', \
           'score', 'score_components']
#del(df_results)
df_results = pd.DataFrame(columns = columns)
np.random.seed(27)

In [5]:
def apply_strategy(df, strategy, params):
    col_name = 'strategy_' + strategy + '_bid'
    col_name_validate = 'strategy_' + strategy + '_validate'
    if col_name not in df.keys():
        df[col_name] = int(0)
    if col_name_validate not in df.keys():
        df[col_name_validate] = int(0)
        
    if strategy == 'constant':
        df[col_name] = int(params)
    elif strategy == 'random':        
        df[col_name] = np.random.choice(range(100, params), df.shape[0])
    elif strategy == 'linear':
        df[col_name] = params * df['clickpred']/0.08
        
    df[col_name_validate] = df[col_name] - df['payprice']
    
    return df


def validate_strategy(df, strategy, params):
    col_name = 'strategy_' + strategy + '_bid'
    col_name_validate = 'strategy_' + strategy + '_validate'
    
    df_temp = df[df[col_name_validate] > 0]
    
    cash_in_bank = glob_cash_in_bank    
    strategy_impressions = 0
    strategy_clicks = 0
    n_rows_in_budget = 0
    
    if strategy == 'constant' or strategy == 'random' or strategy == 'linear':
        for row in df_temp.iterrows():
            row = row[1]
            if cash_in_bank > 0:
                n_rows_in_budget += 1
                if row[col_name] > row['payprice']:
                    strategy_impressions += 1
                    strategy_clicks += int(row['click'])
                    cash_in_bank -= row['payprice']
            else:
                break

        score, score_components = calc_score(strategy_impressions, strategy_clicks, n_rows_in_budget, cash_in_bank)
        dfr_index = len(df_results)
        df_results.loc[dfr_index] = [strategy, params, strategy_impressions, strategy_clicks, n_rows_in_budget, cash_in_bank, score, score_components]

In [6]:
c_list = list(np.arange(100,500,25))
for i, c in enumerate(c_list):
    start_time = time.time()
    strategy = 'constant'
    validate_strategy(apply_strategy(df, strategy, c), strategy, c)
    print ("C: {}  |  Score: {:.4f}  |  Time taken: {:.2f}s".format(c, float(df_results.tail(1).score), time.time() - start_time))

C: 100  |  Score: 0.0006  |  Time taken: 14.66s
C: 125  |  Score: 0.0006  |  Time taken: 12.87s
C: 150  |  Score: 0.0006  |  Time taken: 12.48s
C: 175  |  Score: 0.0006  |  Time taken: 13.54s
C: 200  |  Score: 0.0007  |  Time taken: 15.06s
C: 225  |  Score: 0.0008  |  Time taken: 12.54s
C: 250  |  Score: 0.0008  |  Time taken: 11.67s
C: 275  |  Score: 0.0008  |  Time taken: 11.78s
C: 300  |  Score: 0.0008  |  Time taken: 12.42s
C: 325  |  Score: 0.0008  |  Time taken: 10.36s
C: 350  |  Score: 0.0008  |  Time taken: 10.44s
C: 375  |  Score: 0.0008  |  Time taken: 10.75s
C: 400  |  Score: 0.0008  |  Time taken: 13.64s
C: 425  |  Score: 0.0008  |  Time taken: 12.29s
C: 450  |  Score: 0.0008  |  Time taken: 11.68s
C: 475  |  Score: 0.0008  |  Time taken: 11.05s


In [8]:
c_list = list(np.arange(200,1000,50))
for i, c in enumerate(c_list):
    start_time = time.time()
    strategy = 'random'
    validate_strategy(apply_strategy(df, strategy, c), strategy, c)
    print ("C: {}  |  Score: {:.4f}  |  Time taken: {:.2f}s".format(c, float(df_results.tail(1).score), time.time() - start_time))

C: 200  |  Score: 0.0000  |  Time taken: 0.65s
C: 250  |  Score: 0.0000  |  Time taken: 0.37s
C: 300  |  Score: 0.0000  |  Time taken: 1.28s
C: 350  |  Score: 0.0000  |  Time taken: 2.18s
C: 400  |  Score: 0.0000  |  Time taken: 1.99s
C: 450  |  Score: 0.0000  |  Time taken: 1.89s
C: 500  |  Score: 0.0000  |  Time taken: 2.08s
C: 550  |  Score: 0.0000  |  Time taken: 2.27s
C: 600  |  Score: 0.0000  |  Time taken: 2.36s
C: 650  |  Score: 0.0000  |  Time taken: 2.48s
C: 700  |  Score: 0.0000  |  Time taken: 2.59s
C: 750  |  Score: 0.0000  |  Time taken: 2.77s
C: 800  |  Score: 0.0000  |  Time taken: 3.06s
C: 850  |  Score: 0.0000  |  Time taken: 4.08s
C: 900  |  Score: 0.0000  |  Time taken: 4.10s
C: 950  |  Score: 0.0000  |  Time taken: 3.39s


In [146]:
c_list = list(np.arange(10,50,5))
for i, c in enumerate(c_list):
    start_time = time.time()
    strategy = 'linear'
    validate_strategy(apply_strategy(df, strategy, c), strategy, c)
    print ("C: {}  |  Score: {:.4f}  |  Time taken: {:.2f}s".format(c, float(df_results.tail(1).score), time.time() - start_time))

C: 10  |  Score: 0.0000  |  Time taken: 0.09s
C: 15  |  Score: 0.0000  |  Time taken: 0.10s
C: 20  |  Score: 0.0370  |  Time taken: 0.11s
C: 25  |  Score: 0.0300  |  Time taken: 0.31s
C: 30  |  Score: 0.0309  |  Time taken: 0.47s
C: 35  |  Score: 0.0316  |  Time taken: 0.84s
C: 40  |  Score: 0.0204  |  Time taken: 1.09s
C: 45  |  Score: 0.0105  |  Time taken: 1.31s


In [147]:
df_results

Unnamed: 0,strategy_name,strategy_params,strategy_impressions,strategy_clicks,n_rows_in_budget,cash_in_bank,score,score_components
0,constant,100.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
1,constant,125.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
2,constant,150.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
3,constant,175.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
4,constant,200.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
5,constant,225.0,0.0,0.0,0.0,25000.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
6,constant,250.0,106.0,0.0,106.0,-169.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
7,constant,275.0,105.0,0.0,105.0,-220.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
8,constant,300.0,99.0,0.0,99.0,-259.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"
9,constant,325.0,92.0,0.0,92.0,-15.0,0.0,"[0.0, 0.0, 0.0, -0.0, -0.0, -0.0]"


In [91]:
df_results.to_csv('df_results.csv', encoding='utf8')

# Feature Engineering

In [38]:
def get_buckets(price):
    if price == 0:
        return [1,0,0,0,0]
    elif price > 0 and price <= 10:
        return [0,1,0,0,0]
    elif price > 10 and price <= 50:
        return [0,0,1,0,0]
    elif price > 50 and price <=100:
        return [0,0,0,1,0]
    elif price > 100:
        return [0,0,0,0,1]
    
def get_slotdim(row):
    return str(row['slotwidth']) + str(row['slotheight'])

# Turning slotdimension into new feature
df_train["slotdim"] = df_train.apply(get_slotdim, axis=1)

# Adding a OS, browser feature and a bucketed slotprice feature
df_train["os"],df_train["browser"] = zip(*df_train.useragent.map(lambda x: x.split("_")))
df_train["slotprice_0"],df_train["slotprice_1_10"],df_train["slotprice_11_50"],df_train["slotprice_50_100"],df_train["slotprice_50"] = zip(*df_train.slotprice.map(get_buckets))
df = df_train.copy()

df_val["os"],df_val["browser"] = zip(*df_val.useragent.map(lambda x: x.split("_")))
df_val["slotprice_0"],df_val["slotprice_1_10"],df_val["slotprice_11_50"],df_val["slotprice_50_100"],df_val["slotprice_50"] = zip(*df_val.slotprice.map(get_buckets))
df_val["slotdim"] = df_val.apply(get_slotdim, axis=1)
df2 = df_val.copy()

In [60]:
def get_num_usertag(x):
    if pd.isnull(x):
        return 0.0
    else:
        return len(x)
    
df_train['num_usertag'] = df_train.usertag.apply(get_num_usertag)
df_val['num_usertag'] = df_val.usertag.apply(get_num_usertag)

# Linear CTR Prediction

In [3]:
def get_slotdim(row):
    return str(row['slotwidth']) + str(row['slotheight'])

df["slotdim"] = df.apply(get_slotdim, axis=1)
#df2['slotdim'] = df2.apply(get_slotdim, axis=1)

In [4]:
# Imputation
df.slotvisibility.fillna(value=df.slotvisibility.value_counts().index[0], inplace=True)
df.slotformat.fillna(value=df.slotformat.value_counts().index[0], inplace=True)
df.keypage.fillna(value=df.keypage.value_counts().index[0], inplace=True)
df.adexchange.fillna(value=df.adexchange.value_counts().index[0], inplace=True)
df.url.fillna(value=df.url.value_counts().index[0], inplace=True)
df.domain.fillna(value=df.domain.value_counts().index[0], inplace=True)

# df2.slotvisibility.fillna(value=df.slotvisibility.value_counts().index[0], inplace=True)
# df2.slotformat.fillna(value=df.slotformat.value_counts().index[0], inplace=True)
# df2.keypage.fillna(value=df.keypage.value_counts().index[0], inplace=True)
# df2.adexchange.fillna(value=df.adexchange.value_counts().index[0], inplace=True)
# df2.url.fillna(value=df.url.value_counts().index[0], inplace=True)
# df2.domain.fillna(value=df2.domain.value_counts().index[0], inplace=True)

In [5]:
features = list(df.columns)
features_remove = ['click', 'bidid', 'logtype', 'userid', 'urlid', 'bidprice', 'payprice', 'usertag']
features = [x for x in features if x not in features_remove]

In [6]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


categorical = ['useragent', 'slotid', 'slotvisibility', 'creative', 
               'keypage', 'advertiser', 'slotdim', 'os', 'browser', 'url', 'domain', 'IP']

le_dict = defaultdict(LabelEncoder)

df[categorical] = df[categorical].apply(lambda x: le_dict[x.name].fit_transform(x))

# for c in categorical:
#     df2[c] = df2[c].map(lambda s: '<unknown>' if s not in le_dict[c].classes_ else s)
#     le_dict[c].classes_ = np.append(le_dict[c].classes_, '<unknown>')

# df2[categorical] = df2[categorical].apply(lambda x: le_dict[x.name].transform(x))

# mms_dict = defaultdict(MinMaxScaler)

# df[numerical] = df[numerical].apply(lambda x: mms_dict[x.name].fit_transform(x))
# df2[numerical] = df2[numerical].apply(lambda x: mms_dict[x.name].transform(x))

train_X = df[features].as_matrix()
train_Y = df['click'].as_matrix()

# val_X = df2[features].as_matrix()
# val_Y = df2['click'].as_matrix()

### Model Training

In [2]:
df = pd.read_csv('C:\\Users\\gerar\\Dropbox\\Web Economics - Shared\\data\\train_ohe.csv')

In [11]:
joblib.dump(train_X, 'train_X.pkl')

['train_X.pkl']

In [2]:
import pandas as pd
from sklearn.externals import joblib
df_new = pd.DataFrame()
#train_X = joblib.load('train_X.pkl')
#rf = joblib.load('random_forest.pkl')
df_new['rf_pred'] = rf.predict(train_X)
df_new['rf_prob'] = rf.predict_proba(train_X)[:,1]

lr = joblib.load('logistic_regression.pkl')
df_new['lr_pred'] = lr.predict(train_X)
df_new['lr_prob'] = lr.predict_proba(train_X)[:,1]

df_new[['bidid', 'rf_pred', 'rf_prob', 'lr_pred', 'lr_prob']].to_csv('rf_lr_train_predictions.csv', encoding='utf-8')

MemoryError: 

In [16]:
df2 = pd.read_csv('C:\\Users\\gerar\\Dropbox\\Web Economics - Shared\\data\\validation_ohe.csv')

In [17]:
train_ind = list(df.loc[df.click==1].index) + list(df.loc[df.click==0].index)[:len(df.loc[df.click==1])]
np.random.shuffle(train_ind)

In [18]:
df = df.loc[train_ind]

In [14]:
train_fit = train_X[train_ind]
train_labels = train_Y[train_ind]

In [79]:
rf = RandomForestClassifier(n_estimators=500, random_state=1994)
rf.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=1994,
            verbose=0, warm_start=False)

In [53]:
lr = LogisticRegression(C=10, random_state=1994)
lr.fit(train_X, train_Y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1994, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
rf.score(val_X, val_Y)

0.84131723542030168

In [54]:
lr.score(val_X, val_Y)

0.85279016777370398

In [74]:
lr = LogisticRegression(random_state=1994)
param_grid = {'C': [0.001,0.01,0.1,1,10]}
clf = GridSearchCV(lr, param_grid)
clf.fit(train_X, train_Y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1994, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [5, 10, 15, 20, 25]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [75]:
clf.best_estimator_

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1994, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
from sklearn.metrics import confusion_matrix
pred = lr.predict(val_X)
confusion_matrix(val_Y, pred)

array([[255470,  44053],
       [    73,    153]])

In [81]:
from sklearn.metrics import confusion_matrix
pred = rf.predict(val_X)
confusion_matrix(val_Y, pred)

array([[252015,  47508],
       [    57,    169]])

In [82]:
from sklearn.externals import joblib
joblib.dump(rf, 'random_forest.pkl')
joblib.dump(lr, 'logistic_regression.pkl')

['logistic_regression.pkl']

In [83]:
num_outliers = len(np.where(val_Y==1)[0])
num_inliers = len(np.where(val_Y==0)[0])
pred = rf.predict(val_X)
outliers = 0
inliers = 0
for p, t in zip(pred, val_Y):
    if t==1:
        if p == t:
            outliers += 1
    else:
        if p == 1:
            inliers += 1
print('Random Forest Model:')
print('-----------------------')
print('Percentage of outliers predicted as outliers: {:.2f}%'.format(outliers*100/num_outliers))
print('Percentage of inliers predicted as outliers: {:.2f}%'.format(inliers*100/num_inliers))

Random Forest Model:
-----------------------
Percentage of outliers predicted as outliers: 74.78%
Percentage of inliers predicted as outliers: 15.86%


In [56]:
num_outliers = len(np.where(val_Y==1)[0])
num_inliers = len(np.where(val_Y==0)[0])
pred = lr.predict(val_X)
outliers = 0
inliers = 0
for p, t in zip(pred, val_Y):
    if t==1:
        if p == t:
            outliers += 1
    else:
        if p == 1:
            inliers += 1
print('Logistic Regression Model:')
print('-----------------------')
print('Percentage of outliers predicted as outliers: {:.2f}%'.format(outliers*100/num_outliers))
print('Percentage of inliers predicted as outliers: {:.2f}%'.format(inliers*100/num_inliers))

Logistic Regression Model:
-----------------------
Percentage of outliers predicted as outliers: 67.70%
Percentage of inliers predicted as outliers: 14.71%


In [118]:
df['clickpred'] = lr.predict_proba(train_X)[:,1]

In [58]:
df2['clickprob'] = lr.predict_proba(val_X)[:,1]
df2['clickpred'] = lr.predict(val_X)

In [84]:
df2['clickprob'] = rf.predict_proba(val_X)[:,1]
df2['clickpred'] = pred
df2[['bidid', 'clickprob', 'clickpred']].to_csv('rf_validation_results.csv', encoding='utf-8')

In [60]:
df2[['bidid','clickprob','clickpred']].to_csv('lr_validation_results.csv', encoding='utf-8')

In [61]:
df3 = pd.read_csv('C:\\Users\\gerar\\Dropbox\\Web Economics - Shared\\data\\test_ohe.csv')

In [62]:
df3['slotdim'] = df3.apply(get_slotdim, axis=1)

In [64]:
categorical = ['useragent', 'slotid', 'slotvisibility', 'creative', 
               'keypage', 'advertiser', 'slotdim', 'os', 'browser', 'url', 'domain', 'IP']

for c in categorical:
    df3[c] = df3[c].map(lambda s: '<unknown>' if s not in le_dict[c].classes_ else s)

df3[categorical] = df3[categorical].apply(lambda x: le_dict[x.name].transform(x))



In [70]:
df3.slotvisibility.fillna(value=df.slotvisibility.value_counts().index[0], inplace=True)
df3.slotformat.fillna(value=df.slotformat.value_counts().index[0], inplace=True)
df3.keypage.fillna(value=df.keypage.value_counts().index[0], inplace=True)
df3.adexchange.fillna(value=df.adexchange.value_counts().index[0], inplace=True)
df3.url.fillna(value=df.url.value_counts().index[0], inplace=True)
df3.domain.fillna(value=df.domain.value_counts().index[0], inplace=True)

In [71]:
test_X = df3[features].as_matrix()
df3['clickpred'] = lr.predict(test_X)
df3['clickprob'] = lr.predict_proba(test_X)[:,1]
df3[['bidid','clickprob','clickpred']].to_csv('lr_test_results.csv', encoding='utf-8')

In [85]:
df3['clickpred'] = rf.predict(test_X)
df3['clickprob'] = rf.predict_proba(test_X)[:,1]
df3[['bidid','clickprob','clickpred']].to_csv('rf_test_results.csv', encoding='utf-8')