## A Neural Network (Multilayer perceptron) application for Click Through Rate (CTR) prediction

# 1. Data Preparation


### 1.1 Imports

In [2]:
import numpy as np
import pandas as pd
from pickle import dump, load

# Seed NumPy's global random number generator.
np.random.seed(42)

# Mount Google Drive so the data sets can be read from it (Colab only).
# Change the path below to wherever the data sets are stored.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because later cells build every file path from it.
dir = 'drive/My Drive/Colab Notebooks/data/RTB-data/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### 1.2 Data Loading

In [2]:
# The data sets were converted to pickle files beforehand because they load
# much faster than the original csv files.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created yourself.
# `with` closes each file handle as soon as the frame has been read.
with open(dir + 'train.pkl', 'rb') as f:
    train_df = load(f)
with open(dir + 'validation.pkl', 'rb') as f:
    val_df = load(f)
with open(dir + 'test.pkl', 'rb') as f:
    test_df = load(f)

print(train_df.shape, val_df.shape, test_df.shape)

(2430981, 25) (303925, 25) (303375, 22)


In [6]:
train_df.head()

Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,...,slotheight,slotvisibility,slotformat,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,...,200,2,0,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,NaN
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,...,250,FourthView,Na,0,10722,294,23,,2821,NaN
2,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,41,2.0,...,250,2,0,5,798b2d49952d77f1eace9f23c210d0b5,238,24,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110
3,0,6,23,0348beeae93e561584c3b50fc9e7746a33048ad7,0d6eaf2259699990e38a1fc5116f112070b9ecdc,windows_ie,114.250.226.*,1,1,1.0,...,600,2,1,0,cb7c76e7784031272e37af8e7e9b062c,300,25,bebefa5efe83beee17a3d245e7c5085b,1458,138661006310111
4,0,5,6,268149c1789bce2bc9798ffd97ec431219bafeb3,a239d9bb642460d974ba67f85e63b8d3e214da0e,windows_ie,183.63.192.*,216,233,2.0,...,90,OtherView,Na,133,7330,277,133,,2259,NaN


In [0]:
def impute_nan(dataframe):
  """Fill the missing values of the columns that are used as model features.

  * ``adexchange``: filled with the most frequent non-missing value of the
    given frame (left untouched when the column holds no value at all).
  * ``usertag``: filled with the placeholder ``"no_tag"``.
  * ``domain``: filled with the empty string.

  The frame is modified in place and also returned.
  """
  # Series.mode() ignores NaN by default, so no explicit dropna() is needed.
  adexchange_mode = dataframe["adexchange"].mode()
  if not adexchange_mode.empty:  # empty when every value is missing
    dataframe["adexchange"] = dataframe["adexchange"].fillna(adexchange_mode.iloc[0])
  dataframe["usertag"] = dataframe["usertag"].fillna("no_tag")
  dataframe["domain"] = dataframe["domain"].fillna("")
  return dataframe

# Impute each split independently (every frame uses its own adexchange mode).
train_df, val_df, test_df = (
    impute_nan(split) for split in (train_df, val_df, test_df)
)

Let's exclude features that we do not want to include in the MLP training.

In [0]:
# Price columns are dropped from train/val only; the test frame is not asked to drop them.
features_delete = ['bidprice', 'payprice']
# Identifier-like columns that are dropped from every split.
features_nan = ['userid', 'url', 'urlid', 'keypage', 'bidid', 'IP']

labelled_drop = features_delete + features_nan
train_df = train_df.drop(columns=labelled_drop)
val_df = val_df.drop(columns=labelled_drop)
test_df = test_df.drop(columns=features_nan)

In [0]:
# Separate the 'click' target from the features; the test frame is used as-is.
y_train = train_df['click']
X_train = train_df.drop(columns='click')
y_val = val_df['click']
X_val = val_df.drop(columns='click')
X_test = test_df

In [7]:
# Stack train, val and test (in that order) so that the feature engineering
# below (dummy coding) yields the same columns for every split.
print(X_train.shape, X_val.shape, X_test.shape)
splits = [X_train, X_val, X_test]
data = pd.concat(splits, axis=0)
expected_rows = sum(part.shape[0] for part in splits)
print(data.shape[0] == expected_rows)


(2430981, 16) (303925, 16) (303375, 16)
True


### 1.3 Feature Engineering

One-hot encoding of usertags:


In [8]:
# Step 1: split the comma-separated usertag string into a list of tags.
# Missing tags were replaced by "no_tag" during imputation, so every value is a string here.
data["usertag"] = data["usertag"].str.split(",")

'\nX_train.usertag = X_train.usertag.apply(lambda x: x.split(","))\nX_val.usertag = X_val.usertag.apply(lambda x: x.split(","))\nX_test.usertag = X_test.usertag.apply(lambda x: x.split(","))\n'

In [9]:
# Peek at the first rows to confirm each usertag is now a list.
print(data["usertag"].head(5))

0                        [no_tag]
1                        [no_tag]
2    [10052, 10006, 13866, 10110]
3           [13866, 10063, 10111]
4                        [no_tag]
Name: usertag, dtype: object


Let's create dummy variables for slotid and domain, keeping only the values that account for a non-negligible share of the data sets:

In [10]:
from collections import Counter
from collections import defaultdict as dd  # alias kept in case other cells still reference it

# Occurrences of every slot id / domain over the combined train + val + test frame.
# A Counter returns 0 for unseen keys, like the defaultdict(int) it replaces.
slot_id = Counter(data.slotid)
domain = Counter(data.domain)

# There are too many distinct "domain" and "slotid" values to dummy-code all of
# them, so only the ones occurring more than `n` times in the combined frame are kept.
n = 6000

keep_slot_id = {value for value, count in slot_id.items() if count > n}
keep_domain = {value for value, count in domain.items() if count > n}

'\nfrom collections import defaultdict as dd\n\nslid = dd(int)\ndom = dd(int)\n\nfor x in X_train.slotid:\n    slid[x]+=1\n    \nfor x in X_train.domain:\n    dom[x]+=1\n    \n    \nn = 3500\n\n# there are too many "domain" and "slotid"\n# we only keep the ones with frequency over 3500 in the training set\n\nkeep_slotid = set()\nkeep_domain = set()\n\nfor a,b in slid.items():\n    if b>n:\n        keep_slotid |= {a}\n        \nfor a,b in dom.items():\n    if b>n:\n        keep_domain |= {a}\n'

In [11]:
# Number of domains / slot ids that pass the frequency threshold.
print(len(keep_domain))
print(len(keep_slot_id))  # the set is named keep_slot_id; `keep_slotid` is not defined by the cell above

63
76


In [12]:
def feature_map(x, S):
    """Return ``x`` unchanged when it belongs to ``S``; otherwise return ``""``."""
    return x if x in S else ""

# Map every slot id / domain outside its "keep" set to the shared "" bucket.
for column, frequent_values in (("slotid", keep_slot_id), ("domain", keep_domain)):
    data[column] = data[column].apply(feature_map, args=(frequent_values,))
print(data.shape)
print(data.head())

(3038281, 16)
   weekday  hour       useragent  region  city  adexchange  \
0        5    22      windows_ie       2     2         2.0   
1        1    20  windows_chrome     238   239         1.0   
2        3    13      windows_ie      40    41         2.0   
3        6    23      windows_ie       1     1         1.0   
4        5     6      windows_ie     216   233         2.0   

                             domain                       slotid  slotwidth  \
0              trqRTvKaXTKfgg24JKTI                                     200   
1  20fc675468712705dbf5d3eda94126da   mm_10982364_973726_8930541        300   
2                trqRTJn7O95I1mKYUV                   1720123646        250   
3           5F97t5E0BTK7XhNrUMpENpn  mm_10027070_118039_10308280        160   
4  13625cb070ffb306b425cd803c4b7ab4                                     728   

   slotheight slotvisibility slotformat  slotprice  \
0         200              2          0          5   
1         250     FourthView  

'\nX_train.slotid = X_train.slotid.apply(lambda x: my_map(x, keep_slotid))\nX_val.slotid = X_val.slotid.apply(lambda x: my_map(x, keep_slotid))\nX_test.slotid = X_test.slotid.apply(lambda x: my_map(x, keep_slotid))\n\nX_train.domain = X_train.domain.apply(lambda x: my_map(x, keep_domain))\nX_val.domain = X_val.domain.apply(lambda x: my_map(x, keep_domain))\nX_test.domain = X_test.domain.apply(lambda x: my_map(x, keep_domain))\n'

In [13]:
# Step 2: turn the usertag lists into one binary column per distinct tag
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

usertag_matrix = mlb.fit_transform(data.usertag)
usertag_dummies = pd.DataFrame(usertag_matrix, columns=mlb.classes_, index=data.index)
# Append the binary tag columns, then drop the original list-valued column.
data = pd.concat([data, usertag_dummies], axis=1).drop(columns=['usertag'])

"\nX_train = X_train.join(pd.DataFrame(mlb.fit_transform(X_train.pop('usertag')),\n                          columns=mlb.classes_,\n                          index=X_train.index))\n"

In [14]:
# Sanity check: the usertags should now appear as binary columns and the
# original 'usertag' column should be gone from the dataframe.
for summary in (data.shape, data.columns):
    print(summary)

(3038281, 84)
Index(['weekday', 'hour', 'useragent', 'region', 'city', 'adexchange',
       'domain', 'slotid', 'slotwidth', 'slotheight', 'slotvisibility',
       'slotformat', 'slotprice', 'creative', 'advertiser', '10006', '10024',
       '10031', '10048', '10052', '10057', '10059', '10063', '10067', '10074',
       '10075', '10076', '10077', '10079', '10083', '10093', '10102', '10110',
       '10111', '10114', '10115', '10116', '10117', '10118', '10120', '10123',
       '10125', '10126', '10127', '10129', '10130', '10131', '10133', '10138',
       '10140', '10142', '10145', '10146', '10147', '10148', '10149', '10684',
       '11092', '11278', '11379', '11423', '11512', '11576', '11632', '11680',
       '11724', '11944', '13042', '13403', '13496', '13678', '13776', '13800',
       '13866', '13874', '14273', '15398', '16593', '16617', '16661', '16706',
       '16751', '16753', 'no_tag'],
      dtype='object')


In [0]:
# (Removed a string-quoted dead-code block that binarised the usertags of X_val
# and X_test separately; the combined `data` frame is already encoded in one
# pass by the MultiLabelBinarizer cell.)

Replace categorical features by dummy variables:

In [0]:
# Reload the feature-engineered frame from its pickle file.
# NOTE(review): no visible cell writes 'data.pkl' -- presumably `data` was dumped
# right after the usertag encoding step; confirm before a Restart & Run All.
# `with` closes the file handle once the frame has been read.
with open(dir + 'data.pkl', 'rb') as f:
    data = load(f)

In [3]:
# Show shape, column names and the first rows of the reloaded frame.
for summary in (data.shape, data.columns, data.head()):
    print(summary)

(3038281, 84)
Index(['weekday', 'hour', 'useragent', 'region', 'city', 'adexchange',
       'domain', 'slotid', 'slotwidth', 'slotheight', 'slotvisibility',
       'slotformat', 'slotprice', 'creative', 'advertiser', '10006', '10024',
       '10031', '10048', '10052', '10057', '10059', '10063', '10067', '10074',
       '10075', '10076', '10077', '10079', '10083', '10093', '10102', '10110',
       '10111', '10114', '10115', '10116', '10117', '10118', '10120', '10123',
       '10125', '10126', '10127', '10129', '10130', '10131', '10133', '10138',
       '10140', '10142', '10145', '10146', '10147', '10148', '10149', '10684',
       '11092', '11278', '11379', '11423', '11512', '11576', '11632', '11680',
       '11724', '11944', '13042', '13403', '13496', '13678', '13776', '13800',
       '13866', '13874', '14273', '15398', '16593', '16617', '16661', '16706',
       '16751', '16753', 'no_tag'],
      dtype='object')
   weekday  hour       useragent  region  city  adexchange  \
0        5   

In [0]:
# Categorical features that are replaced by dummy variables
categorical_features = ['useragent', 'region', 'city', 'adexchange', 'domain', 'slotid', 'slotvisibility', 'slotformat', 'creative', 'advertiser']
# drop_first=True yields k-1 dummies for k categorical levels by removing the first level.
# The result is stored as `data_dummy`, the name every later cell reads; keeping
# `data` untouched also lets this cell be re-run safely.
data_dummy = pd.get_dummies(data, columns=categorical_features, drop_first=True)

Normalise the train, validation and test sets with a scaler fitted on the concatenation of train + val.
The scaler is fitted on the train and validation data only (not on the test data).

In [0]:
from sklearn.preprocessing import StandardScaler

# Numeric columns that get standardised.
scaling_features = ['weekday', 'hour', 'slotwidth', 'slotheight', 'slotprice']
scaler = StandardScaler()
# Rows of the train and validation splits come first in data_dummy; the scaler
# is fitted on those rows only.
n_labelled = len(train_df) + len(val_df)
data = data_dummy.iloc[:n_labelled]
scaler.fit(data[scaling_features])

Split the data back into train, val and test and apply the transformation

In [0]:
def _dump_pickle(obj, filename):
    """Pickle `obj` to `dir + filename`, closing the file handle afterwards."""
    with open(dir + filename, 'wb') as f:
        dump(obj, f)


n_train, n_val, n_test = train_df.shape[0], val_df.shape[0], test_df.shape[0]

# --- train: first n_train rows of the combined frame ---
X_train = pd.DataFrame(data_dummy.iloc[:n_train])
X_train[scaling_features] = scaler.transform(X_train[scaling_features])
train_nn = pd.concat([X_train, train_df.click], axis=1)

# Split the training set in two for pickle usage.
# Integer division replaces np.int(...), which was removed from NumPy.
half = train_nn.shape[0] // 2
train_nn1 = train_nn.iloc[:half]
train_nn2 = train_nn.iloc[half:]
_dump_pickle(train_nn1, 'train_nn1.pkl')
_dump_pickle(train_nn2, 'train_nn2.pkl')

# --- validation: the n_val rows that follow the training rows ---
X_val = pd.DataFrame(data_dummy.iloc[n_train:n_train + n_val])
X_val[scaling_features] = scaler.transform(X_val[scaling_features])
val_nn = pd.concat([X_val, val_df.click], axis=1)
_dump_pickle(val_nn, 'val_nn.pkl')

# --- test: the last n_test rows ---
# An explicit start offset is used because a `[-0:]` slice would select every row.
test_nn = pd.DataFrame(data_dummy.iloc[data_dummy.shape[0] - n_test:])
test_nn[scaling_features] = scaler.transform(test_nn[scaling_features])
_dump_pickle(test_nn, 'test_nn.pkl')

# 2. Multilayer Perceptron (MLP) Training

### 2.1 Install Keras

In [0]:
# Install / upgrade the packages needed for training via the notebook's shell.
# NOTE(review): no versions are pinned, so results may differ between runs.
# NOTE(review): numpy and pandas are already imported at the top of the notebook;
# presumably the runtime must be restarted for upgraded versions to take effect -- confirm.
!pip install -q --upgrade numpy
!pip install -q --upgrade pandas # Upgrade pandas otherwise can't read the dataset
!pip install -q keras


### 2.2 Run Multilayer perceptron with Keras


In [5]:
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.initializers import glorot_normal
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

# Reload the preprocessed frames written by the data-preparation part.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created yourself.
# `with` closes each file handle as soon as the frame has been read.
with open(dir + 'train_nn1.pkl', 'rb') as f:
    train_nn1 = load(f)
with open(dir + 'train_nn2.pkl', 'rb') as f:
    train_nn2 = load(f)
# The training set was stored in two halves; stitch them back together.
train_df = pd.concat([train_nn1, train_nn2])
with open(dir + 'val_nn.pkl', 'rb') as f:
    val_df = load(f)
with open(dir + 'test_nn.pkl', 'rb') as f:
    test_df = load(f)

# Separate the 'click' target from the features (the test frame has no target column to drop).
X_train, y_train = train_df.drop('click', axis=1), train_df.click
X_val, y_val = val_df.drop('click', axis=1), val_df.click


# MLP with two hidden ReLU layers of 100 units and a sigmoid output unit.
# The input width is taken from the training matrix instead of being hard-coded,
# so the model keeps working when the number of dummy columns changes.
n_features = X_train.shape[1]

net = Sequential()
net.add(Dense(100, kernel_initializer=glorot_normal(seed=42), input_dim=n_features, activation='relu'))
net.add(Dense(100, kernel_initializer=glorot_normal(seed=42), activation='relu'))
net.add(Dense(1, kernel_initializer=glorot_normal(seed=42), activation='sigmoid'))


class roc_callback(Callback):
    """Keras callback that prints the ROC-AUC on the training and validation data after every epoch.

    Parameters
    ----------
    training_data : tuple
        ``(X, y)`` used for the training ROC-AUC.
    validation_data : tuple
        ``(X, y)`` used for the validation ROC-AUC.
    """

    def __init__(self, training_data, validation_data):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; None replaces the mutable `{}` default.
        # Each call runs a full prediction pass over both data sets.
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        # '\r' returns to the line start; the padding in `end` overwrites leftover progress-bar text.
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc, 4)), str(round(roc_val, 4))), end=100*' '+'\n')
        return

net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Checkpoint: save the model whenever the monitored validation accuracy improves.
# The file is written below `dir` so that it is the same file the later
# `net.load_weights(dir + 'models/net_keras_test.hdf5')` call reads back.
# NOTE(review): newer Keras releases log this metric as 'val_accuracy' rather
# than 'val_acc' -- confirm against the installed version.
filepath = dir + 'models/net_keras_test.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
earlystop = EarlyStopping(monitor='val_loss', min_delta=0.0001, verbose=1, mode='min')
callbacks_list = [earlystop, checkpoint, roc_callback(training_data=(X_train, y_train), validation_data=(X_val, y_val))]

# Keras expects `class_weight` as a {class label: weight} dict, so the array
# returned by scikit-learn is zipped with its class labels.
# Keyword arguments are used because recent scikit-learn versions no longer
# accept these arguments positionally.
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(classes, balanced_weights)}

net.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=3, callbacks=callbacks_list,
          batch_size=1024, class_weight=class_weights)


# Load the checkpointed weights and predict the click probability (pCTR)
net.load_weights(dir + 'models/net_keras_test.hdf5')
net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# `predict` returns the sigmoid outputs directly; `predict_proba` was only a thin
# wrapper around it and is no longer available in newer Keras versions.
pred_val = net.predict(X_val)
pred_test = net.predict(test_df)



# Reload the original validation / test frames to recover the bid ids.
# `with` closes each file handle as soon as the frame has been read.
with open(dir + 'validation.pkl', 'rb') as f:
    val_df_original = load(f)
with open(dir + 'test.pkl', 'rb') as f:
    test_df_original = load(f)

# Initialise dataframes for pCTR for validation and test set
val_ctr_nn = pd.DataFrame(data={'bidid': val_df_original.bidid, 'pCTR': np.nan})
test_ctr_nn = pd.DataFrame(data={'bidid': test_df_original.bidid, 'pCTR': np.nan})

# The network outputs one column per row; flatten it explicitly to a 1-D
# vector before storing it as the pCTR column.
val_ctr_nn['pCTR'] = np.ravel(pred_val)
test_ctr_nn['pCTR'] = np.ravel(pred_test)

val_ctr_nn.to_csv(dir + 'models/pCTR_nn_keras_validation.csv', index=False)
test_ctr_nn.to_csv(dir + 'models/pCTR_nn_keras_test.csv', index=False)


Using TensorFlow backend.
