In [None]:
import datetime
import pandas as pd
import numpy as np
import geopy as gp
from geopy.distance import vincenty
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
from operator import itemgetter
import zipfile
from sklearn.metrics import roc_auc_score
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Seed Python's built-in `random` module for repeatability.
# NOTE(review): this does not seed NumPy's RNG; the XGBoost and split seeds
# are passed separately via `random_state` in run_default_test.
random.seed(2016)


def create_feature_map(features):
    """Write an XGBoost feature-map file named ``xgb.fmap`` in the current directory.

    One tab-separated line is written per feature, in iteration order:
    ``<index>\\t<feature name>\\tq``.

    Args:
        features: iterable of feature names.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return the booster's per-feature scores, highest score first.

    Writes the feature map file via ``create_feature_map`` and then asks
    ``gbm.get_fscore`` to resolve feature names through it.

    Args:
        gbm: trained booster exposing ``get_fscore(fmap=...)``.
        features: iterable of feature names, in training column order.

    Returns:
        List of ``(feature_name, score)`` pairs sorted by score, descending.
    """
    create_feature_map(features)
    fscores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = sorted(fscores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked

def print_features_importance(imp):
    """Print each feature's score followed by an ``output.remove('<name>')`` line.

    Args:
        imp: sequence of entries where index 0 is the feature name and
            index 1 is its score (as returned by ``get_importance``).
    """
    for entry in imp:
        feature_name = entry[0]
        feature_score = entry[1]
        print("# " + str(feature_score))
        print("output.remove('" + feature_name + "')")


def run_default_test(train, test, features, target, random_state=0):
    """Train an XGBoost binary classifier with fixed settings and predict on ``test``.

    A 10% hold-out split of ``train`` is used both for early stopping and for
    the reported validation score.

    Args:
        train: table indexable by column name, holding ``features`` and ``target``.
        test: table indexable by column name, holding ``features``.
        features: list of feature column names.
        target: name of the label column in ``train``.
        random_state: seed used for the split and passed to XGBoost as ``seed``.

    Returns:
        Tuple ``(test_prediction, score)``: predictions for ``test`` as a plain
        list, and the ROC AUC measured on the hold-out split.
    """
    # Fixed hyper-parameters for this baseline run.
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    booster_params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }

    # Split off the hold-out part used for early stopping and scoring.
    fit_part, holdout_part = train_test_split(train, test_size=0.1, random_state=random_state)
    holdout_labels = holdout_part[target]
    dtrain = xgb.DMatrix(fit_part[features], fit_part[target])
    dvalid = xgb.DMatrix(holdout_part[features], holdout_labels)

    gbm = xgb.train(
        booster_params,
        dtrain,
        300,
        evals=[(dtrain, 'train'), (dvalid, 'eval')],
        early_stopping_rounds=20,
        verbose_eval=True,
    )

    print("Validating...")
    holdout_matrix = xgb.DMatrix(holdout_part[features])
    check = gbm.predict(holdout_matrix, ntree_limit=gbm.best_ntree_limit)
    # Despite the "error" wording in the message, this is the hold-out ROC AUC.
    score = roc_auc_score(holdout_labels.values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_matrix = xgb.DMatrix(test[features])
    test_prediction = gbm.predict(test_matrix, ntree_limit=gbm.best_ntree_limit)

    elapsed_minutes = round((time.time() - start_time)/60, 2)
    print('Training time: {} minutes'.format(elapsed_minutes))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    """Write predictions to a timestamped CSV submission file in the current directory.

    The file is named ``submission_<score>_<YYYY-MM-DD-HH-MM>.csv`` and holds
    a ``id,probability`` header followed by one row per entry of ``test['id']``.

    Args:
        score: validation score, embedded in the file name via ``str``.
        test: mapping/table whose ``'id'`` entry yields the row identifiers.
        prediction: sequence of predicted values, aligned with ``test['id']``.

    Raises:
        ValueError: if ``prediction`` has fewer entries than ``test['id']``.
    """
    ids = test['id']
    # Fail before creating the file rather than leaving a truncated submission.
    if len(prediction) < len(ids):
        raise ValueError(
            'prediction has {} values but test has {} ids'.format(len(prediction), len(ids)))

    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager closes the file even if a write raises.
    with open(sub_file, 'w') as f:
        f.write('id,probability\n')
        for item_id, probability in zip(ids, prediction):
            f.write(str(item_id) + ',' + str(probability) + '\n')

    # print('Creating zip-file...')
    # z = zipfile.ZipFile(sub_file + ".zip", "w", zipfile.ZIP_DEFLATED)
    # z.write(sub_file)
    # z.close()

In [None]:
def read_test_train():
    """Load the pre-merged train and test CSVs without any preprocessing.

    NOTE(review): ``get_features`` is not defined in the cells visible here;
    it must already exist in the kernel for this function to run.

    Returns:
        Tuple ``(train, test, features)`` where ``features`` is whatever
        ``get_features(train, test)`` returns.
    """
    train = pd.read_csv('input/train_merged.csv')
    test = pd.read_csv('input/test_merged.csv')
    return train, test, get_features(train, test)
        

In [None]:

# Load the merged train/test tables and report their sizes and the
# selected feature names (sorted for readability).
train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))


In [2]:
# End-to-end baseline: reload the data, train on the 'isDuplicate' target,
# report the hold-out score and write the submission CSV.
# NOTE(review): this re-reads both CSVs even if the previous cell already loaded them.
train, test, features = read_test_train()
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
print('Real score = {}'.format(score))
create_submission(score, test, test_prediction)

XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.728439	eval-auc:0.728526
[1]	train-auc:0.730408	eval-auc:0.730204
[2]	train-auc:0.739963	eval-auc:0.739905
[3]	train-auc:0.742056	eval-auc:0.742282
[4]	train-auc:0.743453	eval-auc:0.743584
[5]	train-auc:0.751931	eval-auc:0.751844
[6]	train-auc:0.754318	eval-auc:0.753984
[7]	train-auc:0.757511	eval-auc:0.757395
[8]	train-auc:0.757756	eval-auc:0.757617
[9]	train-auc:0.759601	eval-auc:0.759497
[10]	train-auc:0.761117	eval-auc:0.761103
[11]	train-auc:0.762935	eval-auc:0.762893
[12]	train-auc:0.766299	eval-auc:0.766104
[13]	train-auc:0.767053	eval-auc:0.766810
[14]	train-auc:0.768141	eval-auc:0.767943
[15]	train-auc:0.770951	eval-auc:0.770802
[16]	train-auc:0.771522	eval-auc:0.771406
[17]	train-auc:0.773014	eval-auc:0.772954
[18]	train-auc:0.773419	eval-auc:0.773363
[19]	train-auc:0.773779	eval-auc:0.773648
[20]	train-auc:0.774094	eval-auc:0.773996
[21]	train-auc:0.774920	eval-auc:0.774827
[22]	train-auc:0.775079	eva

Validating...
Check error value: 0.825286
('Importance array: ', [('categoryID_2', 1410), ('price_1', 1173), ('price_2', 1001), ('parentCategoryID_1', 718), ('distance', 590), ('lat_2', 512), ('lon_2', 507), ('lon_1', 487), ('lat_1', 443), ('locationID_1', 376), ('locationID_2', 329), ('categoryID_1', 237), ('price_same', 186), ('metroID_2', 182), ('metroID_1', 160), ('parentCategoryID_2', 128), ('regionID_same', 110), ('locationID_same', 98), ('Unnamed: 0', 91), ('regionID_1', 82), ('regionID_2', 47), ('len_attrsJSON', 44), ('len_description', 38), ('len_title', 34), ('metroID_same', 32), ('lat_same', 18), ('lon_same', 11)])
Predict test set...
Training time: 16.16 minutes
Real score = 0.825286316733
('Writing submission: ', 'submission_0.825286316733_2016-05-17-13-15.csv')


In [3]:
# Locations of the raw training files.
# NOTE(review): dataset_pairs_path and types1 are defined here but not used
# in the cells visible in this notebook.
dataset_pairs_path = "input/ItemPairs_train.csv"
dataset_info_path = "input/ItemInfo_train.csv"

# Column dtypes for the item-pairs file.
types1 = {
    'itemID_1': np.dtype(int),
    'itemID_2': np.dtype(int),
    'isDuplicate': np.dtype(int),
    'generationMethod': np.dtype(int),
}

# Column dtypes for the item-info file.
# metroID is read as float; presumably because it contains missing values
# (the preview below shows empty metroID cells) -- TODO confirm.
types2 = {
    'itemID': np.dtype(int),
    'categoryID': np.dtype(int),
    'title': np.dtype(str),
    'description': np.dtype(str),
    'images_array': np.dtype(str),
    'attrsJSON': np.dtype(str),
    'price': np.dtype(float),
    'locationID': np.dtype(int),
    'metroID': np.dtype(float),
    'lat': np.dtype(float),
    'lon': np.dtype(float),
}

# Load the item-info table with the explicit dtypes above.
# NOTE(review): an earlier comment here mentioned adding an 'id' column for
# merging, but no such column is added by this statement.
items = pd.read_csv(dataset_info_path, dtype=types2)

In [4]:
# Preview the first rows of the item-info table.
items.head()

Unnamed: 0,itemID,categoryID,title,description,images_array,attrsJSON,price,locationID,metroID,lat,lon
0,1,81,Продам Камаз 6520,Продам Камаз 6520 20 тонн,"1064094, 5252822, 6645873, 6960145, 9230265","{""Вид техники"":""Грузовики""}",300000.0,648140,,64.686946,30.815924
1,3,14,Yamaha r6,Весь в тюнинге.,"11919573, 14412228, 3204180, 6646877","{""Вид техники"":""Мотоциклы"", ""Вид мотоцикла"":""С...",300000.0,639040,,55.678037,37.256548
2,4,84,iPhone 3gs 8gb,"Телефон в хорошем состоянии, трещин и сколов н...","14384831, 6102021","{""Вид телефона"":""iPhone""}",3500.0,640650,,56.239398,43.460458
3,7,84,Xiaomi Mi4 3гб RAM + 16гб ROM белый,"Отличный подарок на новый год от ""китайской ap...",,"{""Вид телефона"":""Другие марки""}",13500.0,662210,,55.77717,37.586194
4,8,39,Лыжные ботинки,"Лыжные ботинки в хорошем состоянии, 34 размер","13718854, 4787310","{""Вид товара"":""Зимние виды спорта""}",500.0,624360,,55.77717,37.586194


In [13]:
# Fit a TF-IDF model on the descriptions of the first rows only (items.head()),
# i.e. a small smoke test rather than the full corpus.
# NOTE(review): `stopwords` is not imported in the cells visible here --
# presumably `from nltk.corpus import stopwords`; confirm and add it to the import cell.
tfidf = TfidfVectorizer(stop_words = stopwords.words('russian')).fit_transform(items.head().description)

In [14]:
# Read the shape straight from the sparse matrix; converting to dense first
# would allocate the full (documents x vocabulary) array just to inspect it.
tfidf.shape

(5, 128)

In [None]:
# TODO: check the results with and without stop words