# Experiments with Support Vector Machine and Gradient Boosted Machine models

In [1]:
import graphlab as gl
import os
gl.canvas.set_target('ipynb')

In [2]:
# load train data
train_set = gl.SFrame.read_csv('../data/train_data.txt', delimiter='\t', verbose=False)
# load test data
test_set = gl.SFrame.read_csv('../data/test_data.txt', delimiter='\t', verbose=False)
# split train data to train set and validation set
train_set, validation_set = train_set.random_split(0.8, seed=1)

This non-commercial license of GraphLab Create is assigned to hernan.toral.15@ucl.ac.uk and will expire on December 01, 2016. For commercial licensing options, visit https://dato.com/buy/.


2016-04-19 10:27:52,198 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1461058067.log


## Feature engineering functions

In [3]:
from graphlab import feature_engineering as fe


def site_feat(data):
    label = 'site_map'
    if label in data.column_names():
        return data

    data[label] = data['domain'] + data['user_agent']
    return data, label


def slot_feat(data):
    label = 'slot_features'
    if label in data.column_names():
        return data

    data[label] = data['ad_slot_width'] + data['ad_slot_height']
    return data, label


def location_feat(data):
    label = 'location_feat'
    if label in data.column_names():
        return data

#     data.add_column(data.select_column('region').apply(lambda x: str(x)) + \
#                     data.select_column('city').apply(lambda x: str(x)), label)
    data[label] = data['region'] + data['city']
    return data, label

def ad_size_feat(data):
    label = 'ad_size_feat'
    if label in data.column_names():
        return data
#     data.add_column(data.select_column('ad_slot_width').apply(lambda x: str(x)) + \
#                     data.select_column('ad_slot_height').apply(lambda x: str(x)), label)
    data[label] = data['ad_slot_width'] + data['ad_slot_height']
    return data, label


def create_quad_features(train, interaction_columns, label='quadratic_features'):
    return fe.create(train, fe.QuadraticFeatures(features=interaction_columns, output_column_name=label)), label


def apply_quadratic(data, quad, label):

    if label in data.column_names():
        # operation already performed, do nothing
        return data
    # for feature in interaction_columns:
    #     dataset[feature] = data[feature].astype(int)
    return quad.transform(data)


def create_onehot_features(train, interaction_columns, categories=300, label='encoded_features'):
    return fe.create(train, fe.OneHotEncoder(features=interaction_columns,
                                             max_categories=categories, output_column_name=label)), label


def apply_feature_eng(data, impl, label):
    if label in data.column_names():
        return data
    return impl.transform(data)


def apply_dictionary(tags_l):
    if len(tags_l) > 0:
        tags = tags_l.split(',')
        return dict(zip(tags, [1 for tag in tags]))
    else:
        return {}
#     tags_to_dict = lambda tags: dict(zip(tags, [1 for tag in tags]))
    
#     data[label] = data.apply(lambda row: tags_to_dict(col.split(',')))
#     print data[label].head(5)
#     return data


def tags_to_dict(row):
    tags = row.split(',')
    return dict(zip(tags, [1 for _ in tags]))


def apply_separate_uagent(data, regexp, label):
    if label in data.column_names():
        return data
    data[label] = data.apply(lambda row: filter_uagent(row['user_agent'], regexp))
    return data


def filter_uagent(value, regexp):
    return 1 if regexp.search(value) is not None else 0

def floor_price_category(col):
    price = int(col)
    if price > 100:
        return "101+"
    elif price > 50:
        return "51-100"
    elif price > 10:
        return "11-50"
    elif price > 0:
        return "1-10"
    else:
        return "0"


## Platt Scaling for SVM

In [264]:
def platt_scaling(svm_output, svm_prediction, n_1, n_0):
    _a = 0.
    _b = math.log((n_0+1) / (n_1+1))
    hi_target = (n_1+1) / (n_1+2)
    lo_target = 1 / (n_0+2)
    lambda_v = 1e-3
    old_err = 1e300
    pp = gl.SArray(data=[((n_1+1)/(n_0+n_1+2)) for _ in xrange(svm_output.size())], dtype=float)
    count = 0
    for it in xrange(100):
        a = b = c = d = e = 0.
        # compute the Hessian & gradient error function w.r.t. A & B
        for i in xrange(pp.size()):
            t = hi_target if svm_prediction[i] else lo_target
            d1 = pp[i] - t
            d2 = pp[i] * (1 - pp[i])
            a += svm_output[i] * svm_output[i] * d2
            b += d2
            c += svm_output[i] * d2
            d += svm_output[i] * d1
            e += d1

        # if gradient is really tiny, then stop
        if abs(d) < 1e-9 and abs(e) < 1e-9:
            break
        old_a = _a
        old_b = _b
        err = 0.
        while True:
            det = (a + lambda_v) * (b + lambda_v) - c*c
            if det == 0.: # if determinant of Hessian is zero
                # increases stabilizer
                lambda_v *= 10
                continue
            _a = old_a + ((b + lambda_v) * d - c*e) / det
            _b = old_b + ((a + lambda_v) * e - c*d) / det

            # now perform the goodness of fit
            err = 0.
            for j in xrange(pp.size()):
                p = 1 / (1 + math.exp(svm_output[j]*_a + _b))
                pp[j] = p
                if p <= 1.383897e-87:
                    err -= t * (-200) + (1 - t) * math.log(1 - p)
                elif p == 1:
                    err -= t * math.log(p) + (1 - t) * (-200)
                else:
                    err -= t*math.log(p) + (1-t)*math.log(1-p)

            if err < old_err*(1 + 1e-7):
                lambda_v *= 0.1
                break

            # error did not decrease: increase stabilizer by factor of 10 & try again
            lambda_v *= 10
            if lambda_v >= 1e6: # something is broken: give up
                break

        diff = err - old_err
        scale = 0.5 * (err + old_err + 1)
        if diff > -1e-3*scale and diff < 1e-7*scale:
            count += 1
        else:
            count = 0

        old_err = err
        if count == 3:
            break

    return _a, _b


########################################################################################################################

def applyPlatt(x, _a, _b):
    return 1 / (1 + math.exp(x*_a + _b))


########################################################################################################################


## Feature Engineering

In [250]:
trainset, validationset = train_set.random_split(0.8, seed=1)

In [251]:
features = ['weekday', 'hour']#,
                #'ad_slot_id',
    
features_onehot = [
                'user_agent',
                'ad_slot_floor_price', 'ad_slot_width', 'ad_slot_height',
                'ad_slot_format'
                #'domain'
                #'ad_exchange'
                ]

In [None]:
# trainset, _ = site_feat(trainset)
# validationset, site_featl = site_feat(validationset)
# #test_set, site_featl = site_feat(test_set)
# features.append(site_featl)

In [104]:
# trainset, _ = location_feat(trainset)
# validationset, location_featl = location_feat(validationset)
# #test_set, location_feat = location_feat(test_set)
# features.append(location_featl)

In [72]:
# trainset, _ = ad_size_feat(trainset)
# validationset, size_label = ad_size_feat(validationset)
# features.append(size_label)

In [8]:
import re

In [136]:
# uagent_desktop_label = 'uagent_desktop'
# regexp = re.compile(r'windows|linux|mac')
# trainset = apply_separate_uagent(trainset, regexp, uagent_desktop_label)
# validationset = apply_separate_uagent(validationset, regexp, uagent_desktop_label)
# features.append(uagent_desktop_label)

In [137]:
# uagent_mobile_label = 'uagent_mobile'
# regexp = re.compile(r'android|ios|other')
# trainset = apply_separate_uagent(trainset, regexp, uagent_mobile_label)
# validationset = apply_separate_uagent(validationset, regexp, uagent_mobile_label)
# features.append(uagent_mobile_label)

In [None]:
# interaction_columns = ['ad_slot_width', 'ad_slot_height']
# quad, quad_site_label = create_quad_features(trainset, interaction_columns, quad_site_label)
# trainset = apply_quadratic(trainset, quad, quad_site_label)
# validationset = apply_quadratic(validationset, quad, quad_site_label)
# features.append(quad_site_label)

In [252]:
ip_fixed_label = 'ip_fixed'
trainset.add_column(trainset.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
validationset.add_column(validationset.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
test_set.add_column(test_set.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
# features.append(ip_fixed_label)
features_onehot.append(ip_fixed_label)

In [38]:
# price_cat_label = 'price_cat_label'
# trainset.add_column(trainset.select_column('ad_slot_floor_price').apply(lambda x: floor_price_category(x)), 
#                    price_cat_label)
# validationset.add_column(validationset.select_column('ad_slot_floor_price').apply(lambda x: floor_price_category(x)), 
#                    price_cat_label)
# features.append(price_cat_label)

In [253]:
tags_label = 'tags'
trainset.add_column(trainset.select_column('user_tags').apply(lambda x: apply_dictionary(x)), tags_label)
validationset.add_column(validationset.select_column('user_tags').apply(lambda x: apply_dictionary(x)), tags_label)
test_set.add_column(test_set.select_column('user_tags').apply(lambda x: apply_dictionary(x)), tags_label)
# features.append(tags_label)
features_onehot.append(tags_label)

In [254]:
domain_enc_label = 'encoded_domain'
encoded_columns = ['domain', 'url']
onehot, _ = create_onehot_features(trainset, encoded_columns, 100, domain_enc_label)
trainset = apply_feature_eng(trainset, onehot, domain_enc_label)
validationset = apply_feature_eng(validationset, onehot, domain_enc_label)
test_set = apply_feature_eng(test_set, onehot, domain_enc_label)
#features.append(domain_enc_label)
features_onehot.append(domain_enc_label)

In [255]:
timestamp_fixed_label = 'timestamp_fixed'
trainset.add_column(trainset.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
validationset.add_column(validationset.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
test_set.add_column(test_set.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
#features.append(timestamp_fixed_label)
features_onehot.append(timestamp_fixed_label)

In [256]:
features_onehot

['user_agent',
 'ad_slot_floor_price',
 'ad_slot_width',
 'ad_slot_height',
 'ad_slot_format',
 'ip_fixed',
 'tags',
 'encoded_domain',
 'timestamp_fixed']

In [257]:
encoded_feat_label = 'encoded_features'
onehot, _ = create_onehot_features(trainset, features_onehot, 100, encoded_feat_label)
trainset = apply_feature_eng(trainset, onehot, encoded_feat_label)
validationset = apply_feature_eng(validationset, onehot, encoded_feat_label)
test_set = apply_feature_eng(test_set, onehot, encoded_feat_label)
features.append(encoded_feat_label)

In [258]:
features

['weekday', 'hour', 'encoded_features']

In [218]:
trainset[features].head(5)

weekday,hour,encoded_features
6,0,"{160: 1, 192: 1, 34: 1, 196: 1, 293: 1, 134: 1, ..."
6,0,"{160: 1, 35: 1, 310: 1, 198: 1, 135: 1, 201: 1, ..."
6,0,"{160: 1, 35: 1, 135: 1, 201: 1, 204: 1, 142: 1, ..."
6,0,"{35: 1, 135: 1, 142: 1, 310: 1, 151: 1, 185: 1, ..."
6,0,"{160: 1, 34: 1, 196: 1, 135: 1, 203: 1, 142: 1, ..."


# Models

## Logistic Regression

In [223]:
gbm_baseline = gl.logistic_classifier.create(trainset, target='click', features=features,# l2_penalty=0.001, 
                                             validation_set=validationset, max_iterations=20, class_weights='auto')


## Boosted Tree Classifier (GBM)

In [234]:
gbm_baseline = gl.boosted_trees_classifier.create(trainset, target='click', features=features,
                                                  validation_set=validationset, max_depth=5, step_size=0.01,
                                                  max_iterations=100,# min_child_weight=trainset.shape[0]/1000,
                                                  early_stopping_rounds=5)#, 
#                                                   class_weights='auto')

## SVM

In [259]:
gbm_baseline = gl.svm_classifier.create(trainset, target='click', features=features,
                                            validation_set=validationset,
                                            max_iterations=10)

# Evaluation

In [260]:
results = gbm_baseline.evaluate(validationset)
print results

{'f1_score': 0.0, 'recall': 0.0, 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 2

Data:
+--------------+-----------------+--------+
| target_label | predicted_label | count  |
+--------------+-----------------+--------+
|      1       |        0        |  296   |
|      0       |        0        | 455807 |
+--------------+-----------------+--------+
[2 rows x 3 columns]
, 'precision': None, 'accuracy': 0.9993510237819089}


## Output

In [None]:
gbm_predictions = gbm_baseline.predict(test_set, output_type='probability')

with open('./output/gbm_predictions.csv', mode='w') as prediction_file:
    # write headers to file
    prediction_file.write('Id,Prediction\n')
    prediction_id = 1
    for prediction in gbm_predictions:
        prediction_file.write('{},{:.5f}\n'.format(prediction_id, prediction))
        prediction_id += 1

prediction_file.close()

## SVM Output


In [262]:
import math

In [266]:
svm_predictions = gbm_baseline.predict(test_set, output_type='class')
svm_values = gbm_baseline.predict(test_set, output_type='margin')

n_1 = svm_predictions.filter(lambda x: x == 1).size()
n_0 = svm_predictions.filter(lambda x: x == 0).size()

_a, _b = platt_scaling(svm_values, svm_predictions, n_1, n_0)
svm_predictions = svm_predictions.apply(lambda x: applyPlatt(x, _a, _b))

# open support vector machines model predictions file
with open('./output/svm_predictions.csv', mode='w') as svm_prediction_file:
    # write headers to file
    svm_prediction_file.write('Id,Prediction\n')
    # set support vector machines model prediction id to 1
    svm_prediction_id = 1
    # for every support vector machines model prediction
    for svm_prediction in svm_predictions:
        # write support vector machines model prediction to file in requested format
        svm_prediction_file.write('{},{:.5f}\n'.format(svm_prediction_id, svm_prediction))
        # increment support vector machines model prediction id
        svm_prediction_id += 1

# close support vector machines model predictions file
svm_prediction_file.close()

0
545421
