In [1]:
import datetime
import os
import pandas as pd
import numpy as np
import geopy as gp
import xgboost as xgb
import random
import zipfile
import time

from geopy.distance import vincenty
from ml_metrics import auc
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

random.seed(4029861075L)


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance

def print_features_importance(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')


def run_default_test(train, test, features, target, random_state=963254170L, create_submission=False,
                     eta=0.5, rounds=50):
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()
    
    folds = os.listdir("folds")

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = rounds
    early_stopping_rounds = 20
    test_size = 0.1
    
    if not create_submission:
        scores = {k: 0 for k in folds}

        for fold in folds:

            foldnp = np.loadtxt('folds/' + fold, dtype=np.bool)

            X_train, X_valid = train[~foldnp], train[foldnp]
            # X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
            y_train = X_train[target]
            y_valid = X_valid[target]
            dtrain = xgb.DMatrix(X_train[features], y_train)
            dvalid = xgb.DMatrix(X_valid[features], y_valid)

            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

            print("Validating...")
            check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)
            # Use the same metric as kaggle instead of sklearn's one.
            # score = roc_auc_score(X_valid[target].values, check)
            score = auc(X_valid[target].values, check)
            scores[fold] = score
            print('FOLD: ' + fold)
            print('Check error value: {:.6f}'.format(score))

            imp = get_importance(gbm, features)
            print('Importance array: ', imp)
        
        print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
        return scores
    else:

        print("Train on whole set")
        dtrain_full = xgb.DMatrix(train[features], train[target])
        watchlist = [(dtrain_full, 'train')]
        gbm = xgb.train(params, dtrain_full, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

        print("Predict test set...")
        test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

        
        print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
        return test_prediction.tolist()


def create_submission(prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id,probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()

    # print('Creating zip-file...')
    # z = zipfile.ZipFile(sub_file + ".zip", "w", zipfile.ZIP_DEFLATED)
    # z.write(sub_file)
    # z.close()

In [2]:

def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('itemID_1')
    output.remove('itemID_2')
    return output

def intersect(a, b):
    return list(set(a) & set(b))

def read_test_train():
    
    # A version without preprocessing

    train = pd.read_csv('input/train_merged.csv')
    test = pd.read_csv('input/test_merged.csv')
    
    features = get_features(train, test)
    return train, test, features
        

In [3]:

train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))


('Length of train: ', 2991396)
('Length of test: ', 1044196)
Features [17]: ['Unnamed: 0', 'category', 'description_dist', 'distance', 'lat_diff', 'lat_same', 'lat_sum', 'location', 'lon_diff', 'lon_same', 'lon_sum', 'metro', 'price_abs', 'price_diff', 'price_sum', 'region', 'title_dist']


In [4]:
scores = run_default_test(train, test, features, 'isDuplicate', False)
print(scores)

XGBoost params. ETA: 0.5, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.780933	eval-auc:0.783026
[1]	train-auc:0.788174	eval-auc:0.789685
[2]	train-auc:0.801107	eval-auc:0.802603
[3]	train-auc:0.808234	eval-auc:0.809484
[4]	train-auc:0.811185	eval-auc:0.812281
[5]	train-auc:0.814788	eval-auc:0.815796
[6]	train-auc:0.816918	eval-auc:0.817963
[7]	train-auc:0.817967	eval-auc:0.818902
[8]	train-auc:0.822484	eval-auc:0.823381
[9]	train-auc:0.824798	eval-auc:0.825546
[10]	train-auc:0.828976	eval-auc:0.829573
[11]	train-auc:0.830210	eval-auc:0.830795
[12]	train-auc:0.830918	eval-auc:0.831451
[13]	train-auc:0.831338	eval-auc:0.831885
[14]	train-auc:0.832160	eval-auc:0.832640
[15]	train-auc:0.834090	eval-auc:0.834518
[16]	train-auc:0.835021	eval-auc:0.835368
[17]	train-auc:0.835835	eval-auc:0.836213
[18]	train-auc:0.837489	eval-auc:0.837889
[19]	train-auc:0.837914	eval-auc:0.838274
[20]	train-auc:0.839332	eval-auc:0.839725
[21]	train-auc:0.840343	eval-auc:0.840757
[22]	train-auc:0.840796	eva

Validating...
FOLD: fold_end.txt
Check error value: 0.851347
('Importance array: ', [('category', 320), ('price_abs', 203), ('description_dist', 184), ('title_dist', 175), ('lat_sum', 140), ('lon_sum', 133), ('Unnamed: 0', 69), ('distance', 61), ('metro', 56), ('region', 51), ('location', 50), ('lat_diff', 33), ('lon_diff', 30), ('len_description', 7), ('len_attrsJSON', 3), ('len_title', 1)])


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.782619	eval-auc:0.781324
[1]	train-auc:0.797197	eval-auc:0.796052
[2]	train-auc:0.804474	eval-auc:0.803490
[3]	train-auc:0.807260	eval-auc:0.806180
[4]	train-auc:0.815332	eval-auc:0.814067
[5]	train-auc:0.817670	eval-auc:0.816307
[6]	train-auc:0.823089	eval-auc:0.821910
[7]	train-auc:0.824828	eval-auc:0.823568
[8]	train-auc:0.827394	eval-auc:0.826166
[9]	train-auc:0.828965	eval-auc:0.827662
[10]	train-auc:0.830739	eval-auc:0.829502
[11]	train-auc:0.831477	eval-auc:0.830305
[12]	train-auc:0.833393	eval-auc:0.832191
[13]	train-auc:0.834295	eval-auc:0.833021
[14]	train-auc:0.835942	eval-auc:0.834662
[15]	train-auc:0.836998	eval-auc:0.835725
[16]	train-auc:0.837945	eval-auc:0.836586
[17]	train-auc:0.838214	eval-auc:0.836787
[18]	train-auc:0.838526	eval-auc:0.837084
[19]	train-auc:0.839303	eval-auc:0.837802
[20]	train-auc:0.840111	eval-auc:0.838686
[21]	train-auc:0.841474	eval-auc:0.839934
[22]	train-auc:0.841786	eva

Validating...
FOLD: fold_start.txt
Check error value: 0.847107
('Importance array: ', [('category', 279), ('title_dist', 190), ('price_abs', 184), ('description_dist', 164), ('lat_sum', 133), ('lon_sum', 129), ('location', 73), ('distance', 67), ('metro', 58), ('Unnamed: 0', 52), ('lon_diff', 51), ('region', 47), ('lat_diff', 41), ('len_attrsJSON', 14), ('len_description', 12), ('len_title', 6)])


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.751639	eval-auc:0.752743
[1]	train-auc:0.794018	eval-auc:0.793997
[2]	train-auc:0.794410	eval-auc:0.794557
[3]	train-auc:0.810616	eval-auc:0.810545
[4]	train-auc:0.815379	eval-auc:0.815170
[5]	train-auc:0.816286	eval-auc:0.816034
[6]	train-auc:0.818590	eval-auc:0.818390
[7]	train-auc:0.821277	eval-auc:0.821139
[8]	train-auc:0.826478	eval-auc:0.826098
[9]	train-auc:0.829091	eval-auc:0.828723
[10]	train-auc:0.831320	eval-auc:0.830854
[11]	train-auc:0.832626	eval-auc:0.832161
[12]	train-auc:0.833674	eval-auc:0.833191
[13]	train-auc:0.834374	eval-auc:0.833854
[14]	train-auc:0.835302	eval-auc:0.834859
[15]	train-auc:0.835740	eval-auc:0.835274
[16]	train-auc:0.836376	eval-auc:0.835920
[17]	train-auc:0.836797	eval-auc:0.836307
[18]	train-auc:0.838446	eval-auc:0.837930
[19]	train-auc:0.838656	eval-auc:0.838068
[20]	train-auc:0.838822	eval-auc:0.838203
[21]	train-auc:0.839173	eval-auc:0.838506
[22]	train-auc:0.840110	eva

Validating...
FOLD: fold_middle.txt
Check error value: 0.849534
('Importance array: ', [('category', 307), ('price_abs', 174), ('description_dist', 162), ('lat_sum', 157), ('title_dist', 153), ('lon_sum', 137), ('Unnamed: 0', 76), ('distance', 70), ('region', 65), ('location', 53), ('metro', 52), ('lat_diff', 42), ('lon_diff', 39), ('len_description', 12), ('len_title', 10), ('len_attrsJSON', 7)])


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.782231	eval-auc:0.782282
[1]	train-auc:0.792934	eval-auc:0.793058
[2]	train-auc:0.796961	eval-auc:0.797130
[3]	train-auc:0.809050	eval-auc:0.809095
[4]	train-auc:0.815280	eval-auc:0.815399
[5]	train-auc:0.819125	eval-auc:0.819364
[6]	train-auc:0.820312	eval-auc:0.820514
[7]	train-auc:0.821669	eval-auc:0.821921
[8]	train-auc:0.826348	eval-auc:0.826431
[9]	train-auc:0.828137	eval-auc:0.828132
[10]	train-auc:0.830139	eval-auc:0.830113
[11]	train-auc:0.832297	eval-auc:0.832254
[12]	train-auc:0.832889	eval-auc:0.832850
[13]	train-auc:0.834144	eval-auc:0.834081
[14]	train-auc:0.834913	eval-auc:0.834817
[15]	train-auc:0.836433	eval-auc:0.836314
[16]	train-auc:0.837291	eval-auc:0.837147
[17]	train-auc:0.838168	eval-auc:0.838024
[18]	train-auc:0.838955	eval-auc:0.838803
[19]	train-auc:0.839423	eval-auc:0.839267
[20]	train-auc:0.840314	eval-auc:0.840140
[21]	train-auc:0.840625	eval-auc:0.840421
[22]	train-auc:0.841341	eva

Validating...
FOLD: fold_random.txt
Check error value: 0.850802
('Importance array: ', [('category', 305), ('price_abs', 186), ('description_dist', 176), ('title_dist', 164), ('lat_sum', 152), ('lon_sum', 137), ('Unnamed: 0', 80), ('distance', 67), ('region', 54), ('location', 54), ('metro', 48), ('lon_diff', 40), ('lat_diff', 22), ('len_description', 15), ('len_attrsJSON', 12), ('len_title', 11)])
Training time: 6.86 minutes
{'fold_start.txt': 0.8471074034499643, 'fold_random.txt': 0.8508019482857087, 'fold_end.txt': 0.851347432022137, 'fold_middle.txt': 0.8495338735010312}


In [5]:
test_prediction = run_default_test(train, test, features, 'isDuplicate', create_submission=True, eta=0.1, rounds=500)
create_submission(test_prediction)

XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
Train on whole set


Will train until train error hasn't decreased in 20 rounds.
[0]	train-auc:0.661892
[1]	train-auc:0.795420
[2]	train-auc:0.803870
[3]	train-auc:0.808281
[4]	train-auc:0.810293
[5]	train-auc:0.813578
[6]	train-auc:0.814400
[7]	train-auc:0.814605
[8]	train-auc:0.815193
[9]	train-auc:0.816434
[10]	train-auc:0.816806
[11]	train-auc:0.817613
[12]	train-auc:0.817928
[13]	train-auc:0.818085
[14]	train-auc:0.818295
[15]	train-auc:0.819936
[16]	train-auc:0.820363
[17]	train-auc:0.820953
[18]	train-auc:0.821263
[19]	train-auc:0.822157
[20]	train-auc:0.822497
[21]	train-auc:0.823705
[22]	train-auc:0.824569
[23]	train-auc:0.825251
[24]	train-auc:0.826221
[25]	train-auc:0.827104
[26]	train-auc:0.827429
[27]	train-auc:0.827764
[28]	train-auc:0.828379
[29]	train-auc:0.829125
[30]	train-auc:0.829822
[31]	train-auc:0.830555
[32]	train-auc:0.830902
[33]	train-auc:0.831600
[34]	train-auc:0.832151
[35]	train-auc:0.832765
[36]	train-auc:0.833066
[37]	train-auc:0.833418
[38]	train-auc:0.833750
[39]	train-auc

Predict test set...
Training time: 23.01 minutes
('Writing submission: ', 'submission_2016-07-01-16-49.csv')


In [None]:
# Należy przepuścić przez filtry predykcję -> logit?