# Experiments with Support Vector Machine and Gradient Boosted Machine models

In [1]:
import graphlab as gl
import os
gl.canvas.set_target('ipynb')

In [89]:
# Load the train and test data; both files are tab-separated and are read
# with identical options.
train_set, test_set = (
    gl.SFrame.read_csv(path, delimiter='\t', verbose=False)
    for path in ('../data/train_data.txt', '../data/test_data.txt')
)
# split train data to train set and validation set
#train_set, validation_set = train_set.random_split(0.8, seed=1337)

## Feature engineering functions

In [86]:
from graphlab import feature_engineering as fe


def site_feat(data):
    """Add a combined site column built from ``domain`` and ``user_agent``.

    The column is named ``site_map`` and holds
    ``data['domain'] + data['user_agent']``.  ``data`` is modified in place.

    Returns:
        A ``(data, label)`` tuple in every case, so callers can always unpack
        two values -- including when the column already exists and nothing is
        recomputed.
    """
    label = 'site_map'
    # Idempotent: only build the column the first time.
    if label not in data.column_names():
        data[label] = data['domain'] + data['user_agent']
    return data, label


def slot_feat(data):
    """Add a combined ad-slot column built from the slot width and height.

    The column is named ``slot_features`` and holds
    ``data['ad_slot_width'] + data['ad_slot_height']``.  ``data`` is modified
    in place.

    Returns:
        A ``(data, label)`` tuple in every case, so callers can always unpack
        two values -- including when the column already exists.
    """
    label = 'slot_features'
    # NOTE(review): if both columns are numeric, ``+`` adds them, so distinct
    # (width, height) pairs can map to the same value -- confirm column types.
    if label not in data.column_names():
        data[label] = data['ad_slot_width'] + data['ad_slot_height']
    return data, label


def location_feat(data):
    """Add a combined location column built from ``region`` and ``city``.

    The column is named ``location_feat`` and holds
    ``data['region'] + data['city']``.  ``data`` is modified in place.

    Returns:
        A ``(data, label)`` tuple in every case, so callers can always unpack
        two values -- including when the column already exists.
    """
    label = 'location_feat'
    # NOTE(review): an earlier variant cast both columns to str before
    # concatenating them; with numeric columns ``+`` adds the values instead
    # -- confirm which behaviour is intended.
    if label not in data.column_names():
        data[label] = data['region'] + data['city']
    return data, label

def ad_size_feat(data):
    """Add a combined ad-size column built from the slot width and height.

    The column is named ``ad_size_feat`` and holds
    ``data['ad_slot_width'] + data['ad_slot_height']`` (the same expression
    ``slot_feat`` uses, stored under a different label).  ``data`` is modified
    in place.

    Returns:
        A ``(data, label)`` tuple in every case, so callers can always unpack
        two values -- including when the column already exists.
    """
    label = 'ad_size_feat'
    # NOTE(review): an earlier variant cast both columns to str before
    # concatenating them; with numeric columns ``+`` adds the values instead
    # -- confirm which behaviour is intended.
    if label not in data.column_names():
        data[label] = data['ad_slot_width'] + data['ad_slot_height']
    return data, label


def create_quad_features(train, interaction_columns, label='quadratic_features'):
    """Create a quadratic-features transformer from ``train``.

    ``interaction_columns`` are the columns handed to ``QuadraticFeatures``
    and ``label`` is the name of its output column.

    Returns:
        ``(transformer, label)``: the object produced by ``fe.create`` and
        the output column name.
    """
    quadratic = fe.QuadraticFeatures(features=interaction_columns,
                                     output_column_name=label)
    created = fe.create(train, quadratic)
    return created, label


def apply_quadratic(data, quad, label):
    """Apply the quadratic transformer ``quad`` to ``data`` at most once.

    When ``data`` already has a column named ``label`` the transformation is
    assumed to have been applied and ``data`` is returned untouched;
    otherwise the result of ``quad.transform(data)`` is returned.
    """
    needs_transform = label not in data.column_names()
    return quad.transform(data) if needs_transform else data


def create_onehot_features(train, interaction_columns, categories=300, label='encoded_features'):
    """Create a one-hot encoder from ``train``.

    ``interaction_columns`` are the columns handed to ``OneHotEncoder``,
    ``categories`` is passed as its ``max_categories`` and ``label`` is the
    name of its output column.

    Returns:
        ``(encoder, label)``: the object produced by ``fe.create`` and the
        output column name.
    """
    encoder = fe.OneHotEncoder(
        features=interaction_columns,
        max_categories=categories,
        output_column_name=label,
    )
    created = fe.create(train, encoder)
    return created, label


def apply_feature_eng(data, impl, label):
    """Run the transformer ``impl`` on ``data`` unless it was already applied.

    A column named ``label`` in ``data`` is taken as the sign that the
    transformation has been done; in that case ``data`` is returned as is.
    """
    existing_columns = data.column_names()
    if label not in existing_columns:
        return impl.transform(data)
    return data


def apply_dictionary(tags_l):
    """Convert a comma-separated tag string into a ``{tag: 1}`` dictionary.

    Args:
        tags_l: string of tags separated by ``','``; may be empty or ``None``.

    Returns:
        A dict mapping every tag to ``1``; an empty dict when there are no
        tags (empty string or ``None``).
    """
    # Truthiness covers both '' and a missing (None) value, which would make
    # len() raise.
    if not tags_l:
        return {}
    return dict.fromkeys(tags_l.split(','), 1)
#     tags_to_dict = lambda tags: dict(zip(tags, [1 for tag in tags]))
    
#     data[label] = data.apply(lambda row: tags_to_dict(col.split(',')))
#     print data[label].head(5)
#     return data


def tags_to_dict(row):
    """Map every comma-separated tag in ``row`` to ``1``.

    No special-casing of the empty string: ``''`` yields ``{'': 1}``.
    """
    return {tag: 1 for tag in row.split(',')}


def apply_separate_uagent(data, regexp, label):
    """Add a 0/1 column telling whether ``user_agent`` matches ``regexp``.

    The column is named ``label`` and is filled row by row with the result of
    ``filter_uagent``.  Nothing happens when the column already exists.
    ``data`` is modified in place and returned.
    """
    if label not in data.column_names():
        def matches_pattern(row):
            return filter_uagent(row['user_agent'], regexp)

        data[label] = data.apply(matches_pattern)
    return data

def apply_separate_uagent2(data, label_os, label_browser):
    """Split ``user_agent`` on ``'_'`` into two new columns.

    The first piece is stored under ``label_os`` and the second under
    ``label_browser``.  The presence of ``label_os`` alone is used as the
    "already done" marker.  ``data`` is modified in place and returned.
    """
    def first_piece(row):
        return row['user_agent'].split('_')[0]

    def second_piece(row):
        return row['user_agent'].split('_')[1]

    if label_os not in data.column_names():
        data[label_os] = data.apply(first_piece)
        data[label_browser] = data.apply(second_piece)
    return data


def filter_uagent(value, regexp):
    """Return 1 when ``regexp.search(value)`` finds a match, otherwise 0."""
    if regexp.search(value) is None:
        return 0
    return 1

def floor_price_category(col):
    """Bucket a floor price into one of "0", "1-10", "11-80" or "81+".

    ``col`` is converted with ``int()`` first; values of zero or below fall
    into the "0" bucket.
    """
    price = int(col)
    # Exclusive lower bounds, checked from the highest bucket downwards.
    buckets = ((80, "81+"), (10, "11-80"), (0, "1-10"))
    for lower_bound, name in buckets:
        if price > lower_bound:
            return name
    return "0"


## Platt Scaling for SVM

In [4]:
def _platt_sigmoid(value, slope, intercept):
    """Return ``1 / (1 + exp(value * slope + intercept))`` without overflowing."""
    try:
        return 1. / (1. + math.exp(value * slope + intercept))
    except OverflowError:
        # exp() overflowed, so the quotient is indistinguishable from zero.
        return 0.


def platt_scaling(svm_output, svm_prediction, n_1, n_0):
    """Fit the sigmoid ``p = 1 / (1 + exp(A * f + B))`` to SVM outputs.

    Regularised Newton (Levenberg-Marquardt style) fit of the two sigmoid
    parameters, following Platt's pseudocode.

    Args:
        svm_output: iterable of raw SVM outputs ``f``, one per sample.
        svm_prediction: iterable of per-sample flags aligned with
            ``svm_output``; a truthy flag marks a positive sample.
        n_1: number of positive samples.
        n_0: number of negative samples.

    Returns:
        ``(A, B)``, the slope and intercept of the fitted sigmoid.
    """
    # Work in floats: under Python 2 the integer counts would otherwise make
    # every ratio below collapse to 0 through integer division.
    n_1 = float(n_1)
    n_0 = float(n_0)

    _a = 0.
    _b = math.log((n_0 + 1) / (n_1 + 1))
    hi_target = (n_1 + 1) / (n_1 + 2)
    lo_target = 1 / (n_0 + 2)
    lambda_v = 1e-3
    old_err = 1e300

    # Plain lists: cheap to index and, unlike the inputs, safe to assign into.
    outputs = list(svm_output)
    targets = [hi_target if flag else lo_target for flag in svm_prediction]
    pp = [(n_1 + 1) / (n_0 + n_1 + 2)] * len(outputs)

    count = 0
    for _ in range(100):
        a = b = c = d = e = 0.
        # compute the Hessian & gradient error function w.r.t. A & B
        for out, t, p in zip(outputs, targets, pp):
            d1 = p - t
            d2 = p * (1 - p)
            a += out * out * d2
            b += d2
            c += out * d2
            d += out * d1
            e += d1

        # if gradient is really tiny, then stop
        if abs(d) < 1e-9 and abs(e) < 1e-9:
            break
        old_a = _a
        old_b = _b
        err = 0.
        while True:
            det = (a + lambda_v) * (b + lambda_v) - c * c
            if det == 0.:  # if determinant of Hessian is zero
                # increases stabilizer
                lambda_v *= 10
                continue
            _a = old_a + ((b + lambda_v) * d - c * e) / det
            _b = old_b + ((a + lambda_v) * e - c * d) / det

            # now perform the goodness of fit, using each sample's own target
            # (not whichever target the gradient loop happened to end on)
            err = 0.
            for j, (out, t) in enumerate(zip(outputs, targets)):
                p = _platt_sigmoid(out, _a, _b)
                pp[j] = p
                # At this step, make sure log(0) returns -200
                if p <= 1.383897e-87:
                    err -= t * (-200) + (1 - t) * math.log(1 - p)
                elif p == 1:
                    err -= t * math.log(p) + (1 - t) * (-200)
                else:
                    err -= t * math.log(p) + (1 - t) * math.log(1 - p)

                if err == -float("inf"):
                    print('==Something is wrong==')

            if err < old_err * (1 + 1e-7):
                lambda_v *= 0.1
                break

            # error did not decrease: increase stabilizer by factor of 10 & try again
            lambda_v *= 10
            if lambda_v >= 1e6:  # something is broken: give up
                print('==Somethig is broken... giving up==')
                break

        # Stop after three consecutive iterations with a negligible change.
        diff = err - old_err
        scale = 0.5 * (err + old_err + 1)
        if diff > -1e-3 * scale and diff < 1e-7 * scale:
            count += 1
        else:
            count = 0
        print(count)
        old_err = err
        if count == 3:
            break

    return _a, _b


########################################################################################################################

def apply_platt(x, _a, _b):
    """Map an SVM output ``x`` to a probability with a fitted Platt sigmoid.

    Computes ``1 / (1 + exp(x * _a + _b))``.

    Args:
        x: raw SVM output for one sample.
        _a: sigmoid slope (``A``).
        _b: sigmoid intercept (``B``).

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the exponent is too large for
        ``math.exp`` to represent.
    """
    try:
        return 1. / (1. + math.exp(x * _a + _b))
    except OverflowError:
        # exp() overflowed, so the quotient is indistinguishable from zero.
        return 0.


########################################################################################################################


## Feature Engineering

In [109]:
# Randomly split the loaded training data into a training part and a
# validation part (split fraction 0.9); the fixed seed keeps the split the
# same across runs.
trainset, validationset = train_set.random_split(0.9, seed=1337)

# /*

In [None]:
# features = ['weekday', 'hour']#,
#                 #'ad_slot_id',
    
# features_onehot = [
#                 'user_agent',
#                 'ad_slot_floor_price', 'ad_slot_width', 'ad_slot_height',
#                 'ad_slot_format'
#                 #'domain'
#                 #'ad_exchange'
#                 ]

# */

In [None]:
# trainset, _ = site_feat(trainset)
# validationset, site_featl = site_feat(validationset)
# #test_set, site_featl = site_feat(test_set)
# features.append(site_featl)

In [None]:
# trainset, _ = location_feat(trainset)
# validationset, location_featl = location_feat(validationset)
# #test_set, location_feat = location_feat(test_set)
# features.append(location_featl)

In [None]:
# trainset, _ = ad_size_feat(trainset)
# validationset, size_label = ad_size_feat(validationset)
# features.append(size_label)

In [None]:
import re

In [None]:
# uagent_desktop_label = 'uagent_desktop'
# regexp = re.compile(r'windows|linux|mac')
# trainset = apply_separate_uagent(trainset, regexp, uagent_desktop_label)
# validationset = apply_separate_uagent(validationset, regexp, uagent_desktop_label)
# features.append(uagent_desktop_label)

In [None]:
# uagent_mobile_label = 'uagent_mobile'
# regexp = re.compile(r'android|ios|other')
# trainset = apply_separate_uagent(trainset, regexp, uagent_mobile_label)
# validationset = apply_separate_uagent(validationset, regexp, uagent_mobile_label)
# features.append(uagent_mobile_label)

In [None]:
# interaction_columns = ['ad_slot_width', 'ad_slot_height']
# quad, quad_site_label = create_quad_features(trainset, interaction_columns, quad_site_label)
# trainset = apply_quadratic(trainset, quad, quad_site_label)
# validationset = apply_quadratic(validationset, quad, quad_site_label)
# features.append(quad_site_label)

# /*

In [None]:
# ip_fixed_label = 'ip_fixed'
# trainset.add_column(trainset.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
# validationset.add_column(validationset.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
# test_set.add_column(test_set.select_column('ip').apply(lambda x: x.index('.', x.index('.')+1)), ip_fixed_label)
# # features.append(ip_fixed_label)
# features_onehot.append(ip_fixed_label)

# */

In [None]:
# price_cat_label = 'price_cat_label'
# trainset.add_column(trainset.select_column('ad_slot_floor_price').apply(lambda x: floor_price_category(x)), 
#                    price_cat_label)
# validationset.add_column(validationset.select_column('ad_slot_floor_price').apply(lambda x: floor_price_category(x)), 
#                    price_cat_label)
# features.append(price_cat_label)

# /*

In [65]:
# Turn the comma-separated 'user_tags' string of every row into a {tag: 1}
# dictionary stored in a new 'tags' column, on all three frames.
tags_label = 'tags'
for frame in (trainset, validationset, test_set):
    frame.add_column(frame.select_column('user_tags').apply(apply_dictionary), tags_label)
# features.append(tags_label)
# 'features_onehot' is only created by the optional (currently commented-out)
# one-hot feature list earlier in the notebook; register the column there
# only when that list exists instead of failing with a NameError.
if 'features_onehot' in globals():
    features_onehot.append(tags_label)

In [66]:
# domain_enc_label = 'encoded_domain'
# encoded_columns = ['domain', 'url']
# onehot, _ = create_onehot_features(trainset, encoded_columns, 100, domain_enc_label)
# trainset = apply_feature_eng(trainset, onehot, domain_enc_label)
# validationset = apply_feature_eng(validationset, onehot, domain_enc_label)
# test_set = apply_feature_eng(test_set, onehot, domain_enc_label)
# #features.append(domain_enc_label)
# features_onehot.append(domain_enc_label)

In [67]:
# timestamp_fixed_label = 'timestamp_fixed'
# trainset.add_column(trainset.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
# validationset.add_column(validationset.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
# test_set.add_column(test_set.select_column('timestamp').apply(lambda x: int(str(x)[:8])), timestamp_fixed_label)
# #features.append(timestamp_fixed_label)
# features_onehot.append(timestamp_fixed_label)

In [68]:
# features_onehot

In [69]:
# encoded_feat_label = 'encoded_features'
# onehot, _ = create_onehot_features(trainset, features_onehot, 100, encoded_feat_label)
# trainset = apply_feature_eng(trainset, onehot, encoded_feat_label)
# validationset = apply_feature_eng(validationset, onehot, encoded_feat_label)
# test_set = apply_feature_eng(test_set, onehot, encoded_feat_label)
# features.append(encoded_feat_label)

# */

### Feature Engineering used in all experiments

In [103]:
# Names of the two columns that receive the pieces of 'user_agent'.
label_os, label_browser = 'uagent_os', 'uagent_browser'
# Split the user agent identically on the train, validation and test frames.
trainset, validationset, test_set = (
    apply_separate_uagent2(frame, label_os, label_browser)
    for frame in (trainset, validationset, test_set)
)
# features.append([label_os, label_browser])

In [110]:
# Columns dropped from the train and validation frames before modelling.
features_to_remove = ['city',
                              'region',
                              'user_id',
                              'log_type',
                              'timestamp',
                              'user_tags',
                              'creative_id',
                              'key_page_url',
                              'advertiser_id',
                              'anonymous_url_id']
trainset.remove_columns(features_to_remove)
validationset.remove_columns(features_to_remove)
# The test frame deliberately keeps these columns (removal is disabled).
# test_set.remove_columns(features_to_remove)

# Log columns that are one-hot encoded together into a single column.
log_enc_features = ['url',
                'domain',
                'user_agent',
                'ad_slot_id',
                'ad_exchange',
                'ad_slot_width',
                'ad_slot_height',
                'ad_slot_format',
                'ad_slot_visibility',
                'ad_slot_floor_price']#, label_os, label_browser]

# Create the encoder from the training frame only (max_categories=108), then
# apply that same encoder to the train, validation and test frames.
onehot, _ = create_onehot_features(trainset, log_enc_features, 108, 'log_encoded_features')
trainset = apply_feature_eng(trainset, onehot, 'log_encoded_features')
validationset = apply_feature_eng(validationset, onehot, 'log_encoded_features')
test_set = apply_feature_eng(test_set, onehot, 'log_encoded_features')


# Feature columns handed to the models below.
features = ['hour', 'weekday', 'log_encoded_features']




In [18]:
# Optional: also train on the 'tags' column. It only exists if the user-tags
# cell in the feature engineering section above has been run first, so
# enable and run that cell before this one.
features.append('tags')

### Just for SVM

In [110]:
# SVM-only feature set. This REPLACES the 'features' list defined earlier:
# only 'url' and 'domain' are one-hot encoded (into 'svm_encoded_features'),
# while the remaining log columns are listed as individual features.
# The commented-out fragments inside the literal are leftovers from when the
# list was split in two; they are inert.
features = ['hour', 'weekday', 'svm_encoded_features',#]
#svm_enc_features = [
#                 'url',
#                 'domain',
                'user_agent',
                'ad_slot_id',
                'ad_exchange',
                'ad_slot_width',
                'ad_slot_height',
                'ad_slot_format',
                'ad_slot_visibility',
                'ad_slot_floor_price']
svm_enc_features = [
                'url',
                'domain']
# Create the encoder from the training frame only (max_categories=100), then
# apply that same encoder to the train, validation and test frames.
onehot, _ = create_onehot_features(trainset, svm_enc_features, 100, 'svm_encoded_features')
trainset = apply_feature_eng(trainset, onehot, 'svm_encoded_features')
validationset = apply_feature_eng(validationset, onehot, 'svm_encoded_features')
test_set = apply_feature_eng(test_set, onehot, 'svm_encoded_features')


### Some output

In [111]:
features

['hour', 'weekday', 'log_encoded_features']

In [112]:
trainset[features].head(5)

hour,weekday,log_encoded_features
0,6,"{385: 1, 360: 1, 204: 1, 493: 1, 373: 1, 250: 1, ..."
0,6,"{385: 1, 295: 1, 361: 1, 494: 1, 370: 1, 377: 1, ..."
0,6,"{385: 1, 289: 1, 168: 1, 361: 1, 494: 1, 369: 1, ..."
0,6,"{385: 1, 168: 1, 361: 1, 494: 1, 369: 1, 378: 1, ..."
0,6,"{385: 1, 327: 1, 360: 1, 199: 1, 493: 1, 368: 1, ..."


# Models (Run one model)

## Logistic Regression

In [27]:
# Parameter search for the logistic classifier on the (train, validation)
# pair. Only the target column is fixed; the 'features' and 'class_weights'
# entries are commented out, so they are not passed to the search.
params = {'target': 'click'}#, 'features':features, 'class_weights': 'auto'}
lparam_search = gl.toolkits.model_parameter_search.create((trainset, validationset),
                                    gl.logistic_classifier.create,params)
# Fetch and print the table of searched parameter combinations.
ps_results = lparam_search.get_results()
print ps_results

2016-04-21 11:22:53,261 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 11:22:54,717 [INFO] graphlab.deploy.job, 36: Creating a LocalAsync environment called 'async'.
2016-04-21 11:22:54,786 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-11-22-5300000' ready for execution
2016-04-21 11:23:20,677 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-21-2016-11-22-5300000' scheduled.
2016-04-21 11:27:21,422 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 11:27:21,611 [INFO] graphlab.deploy.map_job, 220: A job with name 'Model-Parameter-Search-Apr-21-2016-11-22-5300000' already exists. Renaming the job to 'Model-Parameter-Search-Apr-21-2016-11-22-5300000-aae36'.
2016-04-21 11:27:21,620 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-11-22-5300000-aae36' ready for execution
2016-04-21 11:27:32,225 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Param

+----------+------------+------------+--------+-------------------+
| model_id | l1_penalty | l2_penalty | target | training_accuracy |
+----------+------------+------------+--------+-------------------+
|    9     |    1.0     |    1.0     | click  |   0.999288254006  |
|    8     |    0.01    |    0.0     | click  |   0.999289814852  |
|    1     |    1.0     |    10.0    | click  |   0.999289814852  |
|    0     |    1.0     |   100.0    | click  |   0.999287863794  |
|    3     |    1.0     |   0.0001   | click  |   0.999288254006  |
|    2     |    0.1     |    0.1     | click  |   0.999286693159  |
|    5     |    0.0     |    0.0     | click  |   0.998943306934  |
|    4     |    10.0    |    0.1     | click  |   0.999287863794  |
|    7     |   0.0001   |    0.01    | click  |   0.999287863794  |
|    6     |    1.0     |   0.0001   | click  |   0.999285522525  |
+----------+------------+------------+--------+-------------------+
+---------------------+
| validation_accuracy |


In [107]:
# Fit the logistic classifier on the selected features.
# NOTE: the model is stored under the name 'gbm_baseline' even though it is
# not a GBM, because the Evaluation and Output cells below read the model
# from that name ("run one model").
gbm_baseline = gl.logistic_classifier.create(trainset, target='click', features=features,# l2_penalty=0.01, 
                                             validation_set=validationset, max_iterations=20, class_weights='auto')


## Boosted Tree Classifier (GBM)

In [117]:
# Parameter search for the boosted trees classifier on the (train,
# validation) pair. Only the target column is fixed; the 'features' and
# 'class_weights' entries are commented out, so they are not passed.
params = {'target': 'click'}#, 'features':features, 'class_weights': 'auto'}
bparam_search = gl.toolkits.model_parameter_search.create((trainset, validationset),
                                    gl.boosted_trees_classifier.create,params)
# Fetch and print the table of searched parameter combinations.
ps_results = bparam_search.get_results()
print ps_results

2016-04-21 00:39:11,306 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 00:39:11,765 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-00-39-1100000' ready for execution
2016-04-21 00:39:17,787 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-21-2016-00-39-1100000' scheduled.
2016-04-21 00:50:18,651 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 00:50:18,821 [INFO] graphlab.deploy.map_job, 220: A job with name 'Model-Parameter-Search-Apr-21-2016-00-39-1100000' already exists. Renaming the job to 'Model-Parameter-Search-Apr-21-2016-00-39-1100000-ab55a'.
2016-04-21 00:50:18,848 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-00-39-1100000-ab55a' ready for execution
2016-04-21 00:50:25,410 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-21-2016-00-39-1100000-ab55a' scheduled.


+----------+------------------+-----------+----------------+------------------+
| model_id | column_subsample | max_depth | max_iterations | min_child_weight |
+----------+------------------+-----------+----------------+------------------+
|    9     |       1.0        |     8     |       10       |        1         |
|    8     |       0.8        |     8     |       10       |        16        |
|    1     |       1.0        |     6     |      100       |        16        |
|    0     |       0.9        |     4     |      100       |        16        |
|    3     |       1.0        |     6     |      100       |        8         |
|    2     |       1.0        |     6     |       10       |        2         |
|    5     |       0.9        |     6     |      100       |        2         |
|    4     |       0.9        |     6     |       50       |        16        |
|    7     |       0.9        |     6     |       10       |        16        |
|    6     |       0.9        |     4   

In [113]:
# Fit the boosted trees classifier on the selected features with explicit
# hyper-parameters. Early stopping is disabled (commented out);
# class_weights='auto' is passed to the classifier.
gbm_baseline = gl.boosted_trees_classifier.create(trainset, target='click', features=features,
                                                  validation_set=validationset, max_depth=8, step_size=0.5,
                                                  max_iterations=10, min_child_weight=1, column_subsample=1,
                                                  row_subsample=0.9,
                                                  #early_stopping_rounds=5)#, 
                                                   class_weights='auto')

## SVM

In [34]:
# Parameter search for the SVM classifier on the (train, validation) pair.
# Only the target column is fixed; the 'features' and 'class_weights'
# entries are commented out, so they are not passed to the search.
params = {'target': 'click'}#, 'features':features, 'class_weights': 'auto'}
param_search = gl.toolkits.model_parameter_search.create((trainset, validationset),
                                    gl.svm_classifier.create,params)
# Fetch and print the table of searched parameter combinations.
ps_results = param_search.get_results()
print ps_results

2016-04-21 12:21:24,298 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 12:21:24,774 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-12-21-2400000' ready for execution
2016-04-21 12:21:32,324 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-21-2016-12-21-2400000' scheduled.
2016-04-21 12:24:08,500 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-21 12:24:08,724 [INFO] graphlab.deploy.map_job, 220: A job with name 'Model-Parameter-Search-Apr-21-2016-12-21-2400000' already exists. Renaming the job to 'Model-Parameter-Search-Apr-21-2016-12-21-2400000-58e9e'.
2016-04-21 12:24:08,732 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-21-2016-12-21-2400000-58e9e' ready for execution
2016-04-21 12:24:16,156 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-21-2016-12-21-2400000-58e9e' scheduled.


+----------+---------+--------+-------------------+---------------------+
| model_id | penalty | target | training_accuracy | validation_accuracy |
+----------+---------+--------+-------------------+---------------------+
|    9     |   1.0   | click  |   0.999416633629  |    0.99883545547    |
|    8     |   10.0  | click  |   0.999401805587  |    0.998817917149   |
|    1     |  0.001  | click  |   0.999490383625  |    0.999003823354   |
|    0     |   0.01  | click  |   0.999410000031  |    0.998856501456   |
|    3     |  0.001  | click  |   0.999480628334  |    0.998961731383   |
|    2     |   1.0   | click  |   0.999423657438  |    0.99888456277    |
|    5     |   1.0   | click  |   0.999397123048  |    0.998789855835   |
|    4     |   0.1   | click  |   0.999416243417  |    0.998842470799   |
|    7     |   0.1   | click  |   0.999425218284  |    0.998898593427   |
|    6     |   10.0  | click  |   0.999415853206  |    0.99886000912    |
+----------+---------+--------+-------

In [62]:
# Fit the SVM classifier on the selected features.
# NOTE: the model is stored under the name 'gbm_baseline' even though it is
# not a GBM, because the Evaluation and Output cells below read the model
# from that name ("run one model").
gbm_baseline = gl.svm_classifier.create(trainset, target='click', features=features, 
                                         penalty=1,
                                            validation_set=validationset, class_weights='auto',
                                            max_iterations=10)

# Evaluation

In [114]:
# Evaluate whichever model is currently stored in 'gbm_baseline' on the
# validation frame and print the resulting metrics dictionary.
results = gbm_baseline.evaluate(validationset)
print results

{'f1_score': 0.003549378858699728, 'auc': 0.7828071837814041, 'recall': 0.6859903381642513, 'precision': 0.0017792925432606162, 'log_loss': 0.5360333657749382, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-----+--------+
| threshold | fpr | tpr |  p  |   n    |
+-----------+-----+-----+-----+--------+
|    0.0    | 1.0 | 1.0 | 207 | 284883 |
|   1e-05   | 1.0 | 1.0 | 207 | 284883 |
|   2e-05   | 1.0 | 1.0 | 207 | 284883 |
|   3e-05   | 1.0 | 1.0 | 207 | 284883 |
|   4e-05   | 1.0 | 1.0 | 207 | 284883 |
|   5e-05   | 1.0 | 1.0 | 207 | 284883 |
|   6e-05   | 1.0 | 1.0 | 207 | 284883 |
|   7e-05   | 1.0 | 1.0 | 207 | 284883 |
|   8e-05   | 1.0 | 1.0 | 207 | 284883 |
|   9e-05   | 1.0 | 1.0 | 207 | 284883 |
+-----------+-----+-----+-----+--------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matr

In [61]:
# Export only the false/true positive rate columns of the ROC curve from the
# evaluation results to a CSV file.
results['roc_curve'].select_columns(['fpr', 'tpr']).save('./output/gbm_roc_curve.csv')

## Output

In [None]:
# Predicted probability for every row of the test frame.
gbm_predictions = gbm_baseline.predict(test_set, output_type='probability')

# Write one "Id,Prediction" line per test row; ids start at 1 and follow the
# order of the predictions.
with open('./output/gbm_predictions.csv', mode='w') as prediction_file:
    # write headers to file
    prediction_file.write('Id,Prediction\n')
    for prediction_id, prediction in enumerate(gbm_predictions, 1):
        prediction_file.write('{},{:.5f}\n'.format(prediction_id, prediction))
# No explicit close(): leaving the with-block already closes the file.

## SVM Output (Using Platt scaling)

In [53]:
import math

In [63]:
# Hard class predictions and raw margins of the SVM for the test frame.
svm_predictions = gbm_baseline.predict(test_set, output_type='class')
svm_values = gbm_baseline.predict(test_set, output_type='margin')

# Class counts used to initialise the Platt sigmoid fit.
n_1 = svm_predictions.filter(lambda x: x == 1).size()
n_0 = svm_predictions.filter(lambda x: x == 0).size()

# NOTE(review): the sigmoid is fitted against the model's own predicted
# classes on the test frame; Platt scaling is normally fitted against true
# labels on held-out data -- confirm this is intended.
_a, _b = platt_scaling(svm_values, svm_predictions, n_1, n_0)
print(_a)
print(_b)
# The fitted sigmoid maps margins to probabilities, so it must be applied to
# the margins; applying it to the 0/1 classes would yield only two distinct
# values.
svm_predictions = svm_values.apply(lambda x: apply_platt(x, _a, _b))

# Write one "Id,Prediction" line per test row; ids start at 1 and follow the
# order of the predictions.
with open('./output/svm_predictions.csv', mode='w') as svm_prediction_file:
    # write headers to file
    svm_prediction_file.write('Id,Prediction\n')
    for svm_prediction_id, svm_prediction in enumerate(svm_predictions, 1):
        # write support vector machines model prediction to file in requested format
        svm_prediction_file.write('{},{:.5f}\n'.format(svm_prediction_id, svm_prediction))
# No explicit close(): leaving the with-block already closes the file.

### AUC for SVM

In [51]:
# AUC of the model stored in 'gbm_baseline' on the validation frame.
# NOTE(review): the score is computed from hard class predictions
# (output_type='class'), not from a continuous score, so the ROC curve behind
# it presumably has a single operating point -- confirm this is intended.
targets = validationset['click']
val_predictions = gbm_baseline.predict(validationset, output_type='class')
svm_auc = gl.evaluation.auc(targets, val_predictions)
print svm_auc

0.610000366114
