In [1]:
import numpy as np
import pandas as pd
import sklearn as skl

In [2]:
#Wenjie's 
ds_test = pd.read_csv("we_data/test.csv")
ds_train = pd.read_csv("we_data/train.csv")
ds_val = pd.read_csv("we_data/validation.csv")

# Helper functions

In [3]:
def preprocess(df_proc):
    # useragent replaced by os and browser
    df = df_proc.copy()
    useless = ['bidid', 'userid', 'IP', 'slotid', 'useragent', 'creative', 'region',
               'city', 'slotwidth', 'slotheight']

    check_fields = ['bidprice', 'payprice']
    for field in check_fields:
        if field in df_proc.columns:
            useless.append(field)
            
    dum_req = ['os', 'browser', 'weekday', 'hour', 'slotvisibility', 'slotformat', 'advertiser', 'slotprice']

    x = df['useragent'].str.split('_', n=1, expand=True)
    df['os'] = x[0] 
    df['browser'] = x[1]
    df['os'] = pd.Categorical(df['os'], categories=os_values)
    df['browser'] = pd.Categorical(df['browser'], categories=browser_values)
#     df['slotformat'] = pd.Categorical(df['slotformat'], categories=slotf_values)
#     df['slotvisibility'] = pd.Categorical(df['slotvisibility'], categories=slotv_values)
   
    # Slot price
    bins = [-np.inf, 10, 50, 100, 150, 200, 250, np.inf]
    labels = ['sp_1','sp_2','sp_3','sp_4', 'sp_5', 'sp_6', 'sp_7']
    df['slotprice'] = pd.cut(df['slotprice'], bins=bins, labels=labels)

    df = pd.get_dummies(df, columns=dum_req)
    df.dropna(axis=1, inplace=True)
    df.drop(useless, axis=1, inplace=True)

    return df

In [4]:
def get_bids(model, df, base_bid, base_ctr):
    bidids = df['bidid']
    if 'click' in df.columns:
        df = df.drop('click', axis=1)
    processed_df = preprocess(df)
    prob = pd.DataFrame(model.predict_proba(processed_df))
    prob['bidprice'] = base_bid * prob[1] / base_ctr
    submit_bids = pd.concat([bidids, prob['bidprice']], axis=1)
    return submit_bids.round(2)

In [5]:
def check_clicks(df_val, df_bids, budget=6250):
    df_val = df_val[['bidid', 'payprice', 'click']]
    df_val = pd.merge(df_val, df_bids, on='bidid')
    return df_val

# Gathering categorical and essential data from training set

In [6]:
df = ds_train

In [7]:
ua_split = df['useragent'].str.split('_', n=1, expand=True)
os_values = ua_split[0].unique()
browser_values = ua_split[1].unique()
ua_values = df['useragent'].unique()
hour_values = df['hour'].unique()
weekday_values = df['weekday'].unique()
region_values = df['region'].unique()
city_values = df['city'].unique()
slotv_values = df['slotvisibility'].unique()
slotf_values = df['slotformat'].unique()

In [8]:
# Average clickthrough rate
base_ctr = ds_train['click'].mean()
base_pay = ds_train['payprice'].mean()
base_bid = ds_train['bidprice'].mean()
print(f"Base CTR: {base_ctr}, Base Bid: {base_bid}, Base Pay: {base_pay}")

Base CTR: 0.0007375623256619447, Base Bid: 272.9620461862927, Base Pay: 78.15141623895867


# Train Model

### Preparing data for training

In [9]:
train_df = preprocess(ds_train)
val_df = preprocess(ds_val)
X_train = train_df.drop('click', axis=1)
y_train = train_df['click']
# X_test = val_df.drop('click', axis=1)
# y_test = val_df['click']

### Logistic Regression with SGD

In [10]:
from sklearn.linear_model import SGDClassifier
sgdmodel = SGDClassifier(alpha=0.00005, loss='log', max_iter=10000, tol=1e-3)
sgdmodel.fit(X_train, y_train)

SGDClassifier(alpha=5e-05, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=10000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

### Logistic Regression 

In [11]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# Using trained models

In [12]:
# Testing both models on validation dataset
models = [('sgd', sgdmodel), ('log', logmodel)]

for name, model in models:
    df = ds_val

    df = check_clicks(df, get_bids(model, df, base_pay, base_ctr))
    df[['bidid', 'bidprice']].to_csv('./val_bids.csv', index=False, header=True)

    df['win'] = (df['bidprice'] - df['payprice']) >= 0

    total = df.shape[0]
    total_clicks = df[df['click'] == 1].shape[0]
    won_clicks = df[(df['click'] == 1) & (df['win'] == 1)].shape[0]
    total_won = df[df['win'] == True].shape[0]

    print(f'Model: {name} | Clicks: {won_clicks} - {won_clicks/total_clicks * 100:.2f}%, Won: {total_won} - {total_won/total*100:.2f}%')

Model: sgd | Clicks: 130 - 64.36%, Won: 162933 - 53.61%
Model: log | Clicks: 125 - 61.88%, Won: 146028 - 48.05%


In [13]:
# Create test bid file for Deepmining website
df = ds_test
model = logmodel # select model to use

df = get_bids(logmodel, df, base_pay, base_ctr)
df[['bidid', 'bidprice']].to_csv('./test_bids.csv', index=False, header=True)