In [1]:
import datetime
import pandas as pd
import numpy as np
import geopy as gp
from geopy.distance import vincenty
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
from operator import itemgetter
import zipfile
from sklearn.metrics import roc_auc_score
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Seed Python's built-in `random` module so any use of it is repeatable.
# This call only affects the stdlib generator; it does not seed NumPy or XGBoost.
random.seed(2016)

def intersect(a, b):
    """Return the elements that occur in both ``a`` and ``b`` as a list.

    The result is de-duplicated and keeps the order in which the elements
    first appear in ``a``. Building the list straight from a set intersection
    would give an arbitrary order (for strings it can differ between
    interpreter runs because of hash randomization), which makes anything
    derived from the ordering, such as a feature column order, irreproducible.

    Parameters
    ----------
    a, b : iterable of hashable items

    Returns
    -------
    list
        Common elements, ordered by first appearance in ``a``.
    """
    members_of_b = set(b)
    already_emitted = set()
    common = []
    for item in a:
        if item in members_of_b and item not in already_emitted:
            already_emitted.add(item)
            common.append(item)
    return common

def get_features(train, test):
    """Return the column names shared by ``train`` and ``test``, minus the item IDs.

    Both arguments must expose ``.columns.values``. The two identifier
    columns ``itemID_1`` and ``itemID_2`` are dropped from the shared set;
    ``list.remove`` raises ``ValueError`` if either one is not present.
    """
    shared = intersect(list(train.columns.values), list(test.columns.values))
    for id_column in ('itemID_1', 'itemID_2'):
        shared.remove(id_column)
    return shared


def create_feature_map(features):
    """Write a feature-map file named ``xgb.fmap`` in the current directory.

    One tab-separated line is written per feature: its zero-based index, its
    name, and the type code ``q`` (XGBoost's feature-map marker for a
    quantitative feature). An existing file is overwritten.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs from ``gbm``, highest score first.

    The feature map file is (re)written from ``features`` first, then the
    scores are read through ``gbm.get_fscore`` using that map.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    # Stable descending sort on the score, i.e. the second item of each pair.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def print_features_importance(imp):
    """Print each importance entry as a comment line plus an ``output.remove`` line.

    ``imp`` is a sequence of entries whose first item is the feature name
    (a string) and whose second item is its score. For every entry the score
    is printed as ``# <score>`` followed by a ready-to-paste
    ``output.remove('<name>')`` statement.
    """
    for entry in imp:
        feature_name = entry[0]
        feature_score = entry[1]
        print("# " + str(feature_score))
        print("output.remove('" + feature_name + "')")


def run_default_test(train, test, features, target, random_state=0,
                     eta=0.1, max_depth=5, subsample=0.8, colsample_bytree=0.8,
                     num_boost_round=300, early_stopping_rounds=20, test_size=0.1):
    """Train an XGBoost binary classifier, validate it, and predict on ``test``.

    ``train`` is split into a training part and a hold-out part
    (``test_size`` fraction). The booster is trained with AUC as the
    evaluation metric and early stopping, scored on the hold-out part with
    ``roc_auc_score``, and then applied to ``test``.

    Parameters
    ----------
    train : table indexable by column name / list of column names
        Labelled data containing ``features`` and ``target``.
    test : table indexable by a list of column names
        Unlabelled data containing ``features``.
    features : list
        Names of the columns used as model inputs.
    target : str
        Name of the label column in ``train``.
    random_state : int, default 0
        Seed for both the train/validation split and XGBoost.
    eta, max_depth, subsample, colsample_bytree
        XGBoost booster parameters; the defaults are the values that were
        previously hard-coded.
    num_boost_round : int, default 300
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 20
        Passed to ``xgb.train`` as ``early_stopping_rounds``.
    test_size : float, default 0.1
        Fraction of ``train`` held out for validation.

    Returns
    -------
    (list, float)
        Predictions for ``test`` and the hold-out ROC AUC.
    """
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    # The hold-out set is listed last in the watchlist.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    # Reuse the validation DMatrix instead of building a second copy of it.
    check = gbm.predict(dvalid, ntree_limit=gbm.best_ntree_limit)
    # NOTE: despite the "error" wording in the message below, this is ROC AUC.
    score = roc_auc_score(y_valid.values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    """Write the predictions to a timestamped submission CSV in the current directory.

    The file is named ``submission_<score>_<YYYY-MM-DD-HH-MM>.csv`` and holds
    a header line ``id,probability`` followed by one ``<id>,<probability>``
    line per entry of ``test['id']``.

    Parameters
    ----------
    score : object
        Value embedded in the file name via ``str(score)``.
    test : mapping or table
        Must provide an iterable ``test['id']``.
    prediction : sequence
        Indexed by position; ``prediction[k]`` is paired with the k-th id.

    Raises
    ------
    IndexError
        If ``prediction`` has fewer entries than ``test['id']``.
    """
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager closes the file even if a write or lookup fails.
    with open(sub_file, 'w') as f:
        f.write('id,probability\n')
        for row, item_id in enumerate(test['id']):
            f.write(str(item_id) + ',' + str(prediction[row]) + '\n')

    # print('Creating zip-file...')
    # z = zipfile.ZipFile(sub_file + ".zip", "w", zipfile.ZIP_DEFLATED)
    # z.write(sub_file)
    # z.close()



In [2]:
def read_test_train(train_path='input/train_merged.csv', test_path='input/test_merged.csv'):
    """Load the train and test CSV files and derive the feature list.

    A version without preprocessing: the files are read as-is with
    ``pd.read_csv``.

    Parameters
    ----------
    train_path : str, default 'input/train_merged.csv'
        Location of the training CSV.
    test_path : str, default 'input/test_merged.csv'
        Location of the test CSV.

    Returns
    -------
    (train, test, features)
        The two loaded tables and the result of ``get_features(train, test)``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    features = get_features(train, test)
    return train, test, features
        

In [None]:
# Load the train/test tables together with the list of feature columns.
train, test, features = read_test_train()
# Train with 'isDuplicate' as the target; returns test predictions and the validation score.
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
print('Real score = {}'.format(score))
# Write the predictions to a timestamped submission CSV.
create_submission(score, test, test_prediction)

XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.726801	eval-auc:0.726486
[1]	train-auc:0.735297	eval-auc:0.734875
[2]	train-auc:0.751493	eval-auc:0.751567
[3]	train-auc:0.750622	eval-auc:0.750656
[4]	train-auc:0.755380	eval-auc:0.755278
[5]	train-auc:0.756313	eval-auc:0.756266
[6]	train-auc:0.757541	eval-auc:0.757354
[7]	train-auc:0.759855	eval-auc:0.759600
[8]	train-auc:0.763878	eval-auc:0.763547
[9]	train-auc:0.765845	eval-auc:0.765479
[10]	train-auc:0.765878	eval-auc:0.765387
[11]	train-auc:0.768155	eval-auc:0.767723
[12]	train-auc:0.769464	eval-auc:0.769084
[13]	train-auc:0.769999	eval-auc:0.769578
[14]	train-auc:0.772055	eval-auc:0.771668
[15]	train-auc:0.772509	eval-auc:0.772112
[16]	train-auc:0.773249	eval-auc:0.772866
[17]	train-auc:0.773723	eval-auc:0.773329
[18]	train-auc:0.774321	eval-auc:0.773899
[19]	train-auc:0.774609	eval-auc:0.774183
[20]	train-auc:0.774855	eval-auc