In [20]:
import datetime
import os
import pandas as pd
import numpy as np
import geopy as gp
import xgboost as xgb
import random
import zipfile
import time

from geopy.distance import vincenty
from ml_metrics import auc
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Seed Python's built-in RNG so anything drawing from `random` is repeatable.
# The literal carries no `L` suffix: that suffix is Python-2-only syntax, while
# the plain integer has the same value on both Python 2 and Python 3.
random.seed(4029861075)


def create_feature_map(features):
    """Write the feature-map file 'xgb.fmap' into the current working directory.

    One tab-separated line is written per feature: its zero-based position in
    `features`, its name, and the type field 'q'. An existing file is
    overwritten.

    NOTE(review): names are written verbatim. Some XGBoost versions may not
    parse names that contain whitespace (e.g. 'Unnamed: 0') -- confirm against
    the installed version.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return the booster's (feature name, f-score) pairs, highest score first.

    The feature-map file is (re)written from `features` first so that
    `gbm.get_fscore` can resolve the names through 'xgb.fmap'.
    """
    create_feature_map(features)
    fscores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = sorted(fscores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked

def print_features_importance(imp):
    """Print each (name, score) entry as a comment line plus an `output.remove` line.

    For every entry the score is printed after '# ', followed by a line of the
    form output.remove('<name>') that can be pasted into feature-selection code.
    """
    for entry in imp:
        name, score = entry[0], entry[1]
        print("# " + str(score))
        print("output.remove('" + name + "')")


def run_default_test(train, test, features, target, random_state=963254170, create_submission=False):
    """Train an XGBoost binary classifier on fixed folds or on the whole training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column in `features` plus `target`.
    test : pandas.DataFrame
        Rows to predict; only read when `create_submission` is true.
    features : list of str
        Column names fed to the model.
    target : str
        Name of the label column in `train`.
    random_state : int
        Passed to XGBoost as its "seed" parameter.
    create_submission : bool
        False (default): for every file in the "folds" directory, load it as a
        boolean row mask (True marks validation rows), train on the remaining
        rows with early stopping, and score the validation rows with AUC.
        True: fit on all of `train` and predict `test`.

    Returns
    -------
    dict or list
        Fold file name -> AUC when `create_submission` is false; otherwise the
        list of predicted probabilities for `test`.

    Notes
    -----
    Pass `create_submission` by keyword: the fifth positional argument is
    `random_state`. Inside this function the flag also shadows the
    module-level `create_submission` function.
    """
    eta = 0.5
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    num_boost_round = 50
    early_stopping_rounds = 20
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))

    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }

    if not create_submission:
        # Sorted so folds are processed in the same order on every run;
        # os.listdir() gives no ordering guarantee.
        folds = sorted(os.listdir("folds"))
        scores = {k: 0 for k in folds}

        for fold in folds:
            # The builtin `bool` is what the removed `np.bool` alias pointed to.
            foldnp = np.loadtxt(os.path.join('folds', fold), dtype=bool)

            X_train, X_valid = train[~foldnp], train[foldnp]
            y_train = X_train[target]
            y_valid = X_valid[target]
            dtrain = xgb.DMatrix(X_train[features], y_train)
            dvalid = xgb.DMatrix(X_valid[features], y_valid)

            # Early stopping monitors the last entry of the watchlist ('eval').
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

            print("Validating...")
            check = gbm.predict(dvalid, ntree_limit=gbm.best_ntree_limit)
            # Use the same metric as kaggle instead of sklearn's one.
            score = auc(y_valid.values, check)
            scores[fold] = score
            print('FOLD: ' + fold)
            print('Check error value: {:.6f}'.format(score))

            imp = get_importance(gbm, features)
            print('Importance array: ', imp)

        print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
        return scores
    else:
        print("Train on whole set")
        dtrain_full = xgb.DMatrix(train[features], train[target])
        # No held-out rows exist here, so early stopping is not used: the model
        # is trained for the full num_boost_round and the training matrix is
        # watched only for progress logging.
        watchlist = [(dtrain_full, 'train')]
        gbm = xgb.train(params, dtrain_full, num_boost_round, evals=watchlist, verbose_eval=True)

        print("Predict test set...")
        # Without early stopping every trained tree is used for prediction.
        test_prediction = gbm.predict(xgb.DMatrix(test[features]))

        print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
        return test_prediction.tolist()


def create_submission(prediction, ids=None):
    """Write a two-column CSV ('id,probability') named after the current time.

    Parameters
    ----------
    prediction : sequence
        One predicted value per id, in the same order as `ids`.
    ids : iterable, optional
        Row identifiers. When omitted, the 'id' column of the module-level
        `test` frame is used.

    Raises
    ------
    ValueError
        If there are fewer predictions than ids. This is checked before the
        file is opened so that no partial file is left behind.

    The file is created in the current working directory as
    'submission_<YYYY-mm-dd-HH-MM>.csv'.
    """
    if ids is None:
        ids = test['id']
    ids = list(ids)
    if len(prediction) < len(ids):
        raise ValueError('Got {} predictions for {} ids'.format(len(prediction), len(ids)))

    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager closes the file even if a write fails midway.
    with open(sub_file, 'w') as f:
        f.write('id,probability\n')
        # zip stops at the last id; surplus predictions are ignored.
        for row_id, probability in zip(ids, prediction):
            f.write(str(row_id) + ',' + str(probability) + '\n')

    # Zipping the submission file is intentionally not done here.

In [8]:

def get_features(train, test):
    """Return the column names shared by `train` and `test`, minus the item ids.

    Raises ValueError (from list.remove) if 'itemID_1' or 'itemID_2' is not
    among the shared columns.

    NOTE(review): every other shared column is kept, including 'Unnamed: 0',
    which looks like a row index written out with the CSV -- confirm that it is
    meant to be a model feature.
    """
    shared = intersect(list(train.columns.values), list(test.columns.values))
    for id_column in ('itemID_1', 'itemID_2'):
        shared.remove(id_column)
    return shared

def intersect(a, b):
    """Return the distinct items present in both `a` and `b`, in `a`'s order.

    The result is deterministic: items appear in the order of their first
    occurrence in `a`. A plain set intersection would give an arbitrary order
    that can differ between interpreter runs, which in turn would change the
    column order handed to the model.
    """
    in_b = set(b)
    seen = set()
    common = []
    for item in a:
        if item in in_b and item not in seen:
            seen.add(item)
            common.append(item)
    return common

def read_test_train(train_path='input/train_merged.csv', test_path='input/test_merged.csv'):
    """Load the merged train/test CSVs and derive the shared feature list.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the two CSV files. The defaults are the paths this
        notebook has always read, so calling with no arguments is unchanged.

    Returns
    -------
    tuple
        (train frame, test frame, list of feature column names as returned by
        get_features).
    """
    # A version without preprocessing

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    features = get_features(train, test)
    return train, test, features
        

In [9]:

# Load both merged CSVs and the list of feature columns they share, then
# report how many rows and features were read.
train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
# Names are sorted for display only; `features` itself keeps its own order.
print('Features [{}]: {}'.format(len(features), sorted(features)))


('Length of train: ', 2991396)
('Length of test: ', 1044196)
Features [28]: ['Unnamed: 0', 'categoryID_1', 'categoryID_2', 'categoryID_same', 'distance', 'lat_1', 'lat_2', 'lat_same', 'len_attrsJSON', 'len_description', 'len_title', 'locationID_1', 'locationID_2', 'locationID_same', 'lon_1', 'lon_2', 'lon_same', 'metroID_1', 'metroID_2', 'metroID_same', 'parentCategoryID_1', 'parentCategoryID_2', 'price_1', 'price_2', 'price_same', 'regionID_1', 'regionID_2', 'regionID_same']


In [None]:
# Cross-validate on the predefined folds. The flag is passed by keyword because
# the fifth positional parameter of run_default_test is `random_state`.
scores = run_default_test(train, test, features, 'isDuplicate', create_submission=False)
print(scores)
# Summarise the per-fold AUCs as their mean.
# NOTE(review): assumed to be what "Real score" is meant to report -- confirm.
score = np.mean(list(scores.values())) if scores else float('nan')
print('Real score = {}'.format(score))

In [None]:
# Fit on the whole training set and write the submission file. The flag must be
# given by keyword: passed positionally it would bind to `random_state`, the
# cross-validation branch would run, and a dict of scores would be returned
# instead of the prediction list.
test_prediction = run_default_test(train, test, features, 'isDuplicate', create_submission=True)
create_submission(test_prediction)