# <img src="../images/sberbank.png">
# <center> Финальное задание </center>
## <center> Предсказание пола клиента по транзакциям</center>

## Описание задачи 
### В рамках финального задания будет необходимо предсказывать пол клиента, основываясь на его транзакционных исторических данных. Выполнение финального задания - это маленький шаг в большую Data Science-всесенную, поэтому отнеситесь к нему максимально серьёзно :)
### Вы будете строить предиктивные модели и отправлять результаты своего моделирования на платформу [Kaggle](https://www.kaggle.com/t/e8a939488d274dab9051cce14d5ca952), где и будет оцениваться каждое решение и положение участников. Но переживать не стоит - код, связанный с построением модели мы уже написали, поэтому вашим основным заданием будет создание новых переменных для генерации новых инсайдов из данных, которые смогут улучшить полученные значения метрики.
### В роли метрики выступает [ROC AUC](https://dyakonov.org/2017/07/28/auc-roc-%D0%BF%D0%BB%D0%BE%D1%89%D0%B0%D0%B4%D1%8C-%D0%BF%D0%BE%D0%B4-%D0%BA%D1%80%D0%B8%D0%B2%D0%BE%D0%B9-%D0%BE%D1%88%D0%B8%D0%B1%D0%BE%D0%BA/), который и нужно будет оптимизировать.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import matplotlib.pyplot as plt

from tqdm._tqdm_notebook import tqdm_notebook
from warnings import filterwarnings

%matplotlib inline
filterwarnings('ignore')

In [2]:
# Считываем данные
tr_mcc_codes = pd.read_csv('../data/tr_mcc_codes.csv', sep=';', index_col='mcc_code')
tr_types = pd.read_csv('../data/tr_types.csv', sep=';', index_col='tr_type')

transactions = pd.read_csv('../data/transactions.csv', index_col='customer_id')
gender_train = pd.read_csv('../data/gender_train.csv', index_col='customer_id')
gender_test = pd.read_csv('../data/gender_test.csv', index_col='customer_id')
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')

del transactions

In [None]:
#transactions_test.info()

In [None]:
#transactions_train.head()

In [230]:
# Функции, которыми можно пользоваться для построения классификатора, 
# оценки его результатов и построение прогноза для тестовой части пользователей

# Cross-validation score (среднее значение метрики ROC AUC на тренировочных данных)
def cv_score(params, train, y_true):
    cv_res=xgb.cv(params, xgb.DMatrix(train, y_true),
                  early_stopping_rounds=10, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)
#    print(cv_res)
    index_argmax = cv_res['test-auc-mean'].argmax()
    print('Cross-validation, ROC AUC: {:.3f}+-{:.3f}, Trees: {}'.format(cv_res.loc[index_argmax]['test-auc-mean'],
                                                                        cv_res.loc[index_argmax]['test-auc-std'],
                                                                        index_argmax))
    return cv_res.loc[index_argmax]['test-auc-mean'], cv_res.loc[index_argmax]['test-auc-std']

# Построение модели + возврат результатов классификации тестовых пользователей
def fit_predict(params, num_trees, train, test, target):
    params['learning_rate'] = params['eta']
    clf = xgb.train(params, xgb.DMatrix(train.values, target, feature_names=list(train.columns)), 
                    num_boost_round=num_trees, maximize=True)
    y_pred = clf.predict(xgb.DMatrix(test.values, feature_names=list(train.columns)))
    submission = pd.DataFrame(index=test.index, data=y_pred, columns=['probability'])
    return clf, submission

# Отрисовка важности переменных. Важность переменной - количество разбиений выборки, 
# в которых участвует данная переменная. Чем больше - тем она, вероятно, лучше 
def draw_feature_importances(clf, top_k=10):
    plt.figure(figsize=(10, 10))
    
    importances = dict(sorted(clf.get_score().items(), key=lambda x: x[1])[-top_k:])
    y_pos = np.arange(len(importances))
    
    plt.barh(y_pos, list(importances.values()), align='center', color='green')
    plt.yticks(y_pos, importances.keys(), fontsize=12)
    plt.xticks(fontsize=12)
    plt.xlabel('Feature importance', fontsize=15)
    plt.title('Features importances, Sberbank Gender Prediction', fontsize=18)
    plt.ylim(-0.5, len(importances) - 0.5)
    plt.show()

### Так как код для оценки модели на тренировочных данных и её применения на тестовых данных уже дан, то мы будем работать над тем, чтобы создать переменные для улучшения результатов моделирования. 

### (!) В рамках данного задания Вы можете делать всё, что угодно - использовать другие алгоритмы и/или их комбинации, подбирать гиперпараметры своих моделей, отбирать переменые, etc. Мы создали шаблон для простоты и для Вашего понимания верхнеуровневого процесса разработки модели, опустив при этом большое число деталей.

## Basic features
Начнём с того, что сформируем базовые переменные по каждому пользователю. На этом этапе будем использовать стандартные агрегации, посчитанные на расходах и приходах клиента:
- минимум
- максимум
- среднее
- медиана
- среднеквадратичное отклонение
- количество

Также параметры модели выберем стандартные, запишем их в словарь params, и будем использовать для дальнейшего построения модели (не забывайте, что с этим Вы можете тоже экспериментировать).

In [None]:
params = {
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    
    'gamma': 0,
    'lambda': 0,
    'alpha': 0,
    'min_child_weight': 0,
    
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx'
}

In [4]:
tqdm_notebook.pandas(desc="Progress:")

In [None]:
def features_creation_basic(x): 
    features = []
    features.append(pd.Series(x[x['amount']>0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('positive_transactions_')))
    features.append(pd.Series(x[x['amount']<0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('negative_transactions_')))
 
    return pd.concat(features)

data_train = transactions_train.groupby(transactions_train.index).progress_apply(features_creation_basic)
data_test = transactions_test.groupby(transactions_test.index).progress_apply(features_creation_basic)

In [None]:
#data_test.head()

In [None]:
#data_train.head()

In [None]:
target = data_train.join(gender_train, how='inner')['gender']
cv_score(params, data_train, target)

In [None]:
#target.head()

In [None]:
### Число деревьев для XGBoost имеет смысл выставлять по результатам на кросс-валидации 
clf, submission = fit_predict(params, 88, data_train, data_test, target)
#clf, submission = fit_predict(params, 70, data_train, data_test, target)

In [None]:
draw_feature_importances(clf, 10)

In [None]:
submission.to_csv('../data/basic_features_submission.csv')

### Видим, что результат на кросс-валидации - 62.5% ROC AUC.

## Advanced features
Добавим дополнительные переменные по каждому пользователю в модель. <br>
Для этого будем анализировать дни недели, часы и состояние дня/ночи во время покупки - в каждом из случаев будем считать частоту транзакций в соответствующей категории относитеьно всех остальных категорий. <br>
То есть если, например, клиент в 70% случае совершал ночные траты, то мы получим вектор [0.7, 0.3] для этого случая в качестве частот транзакций ночью/днём.

In [5]:
for df in [transactions_train, transactions_test]:
    df['day'] = df['tr_datetime'].str.split().apply(lambda x: int(x[0]) % 7)
    df['hour'] = df['tr_datetime'].apply(lambda x: re.search(' \d*', x).group(0)).astype(int)
    df['night'] = ~df['hour'].between(6, 22).astype(int)
    df['mcc_tr'] = df['mcc_code'].astype(str)+df['tr_type'].astype(str)
    df['mcc_hour'] = df['mcc_code'].astype(str)+df['tr_datetime'].apply(lambda x: re.search(' \d*', x).group(0)).astype(str)

In [None]:
#transactions_train.head()

In [None]:
#transactions_train.info()

In [None]:
def features_creation_advanced(x): 
    features = []
    features.append(pd.Series(x['day'].value_counts(normalize=True).add_prefix('day_')))
    features.append(pd.Series(x['hour'].value_counts(normalize=True).add_prefix('hour_')))
    features.append(pd.Series(x['night'].value_counts(normalize=True).add_prefix('night_')))
    features.append(pd.Series(x[x['amount']>0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('positive_transactions_')))
    features.append(pd.Series(x[x['amount']<0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('negative_transactions_')))
 
    return pd.concat(features)

In [None]:
data_train = transactions_train.groupby(transactions_train.index)\
                               .progress_apply(features_creation_advanced).unstack(-1)
data_test = transactions_test.groupby(transactions_test.index)\
                             .progress_apply(features_creation_advanced).unstack(-1)

In [None]:
#data_train.head()

In [None]:
target = data_train.join(gender_train, how='inner')['gender']
cv_score(params, data_train, target)

In [None]:
### Число деревьев для XGBoost имеет смысл выятавлять по результатам на кросс-валидации 
clf, submission = fit_predict(params, 148, data_train, data_test, target)
#clf, submission = fit_predict(params, 70, data_train, data_test, target)

In [None]:
draw_feature_importances(clf, 10)

### Добавление новых переменных улучшило наши результаты ROC AUC с 62.5% до 68.2%, на тестовой выборке результат будет аналогичным, так что мы явно не переобучились. При этом есть куда стремиться!

### В итоге можем отправить полученное решение на платформу в Kaggle In-Class Competition. Для этого выгрузим его в *.csv - файл, после чего полученный файл можем загружать в качестве ответа.

In [None]:
submission.to_csv('../data/submission_advanced.csv')

In [None]:
#transactions_train.index

# (!) Цель задания:
## Полученная модель должна иметь ROC AUC на Public-части тестовой выборки (на лидерборде) не менее 80%.

In [259]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score
import gc

In [289]:
params = {
    'eta': 0.1,
    'tree_method': "hist", #"approx", #"hist"
    'grow_policy': "lossguide",
    'max_leaves': 1000,  
    'max_depth': 3, 
    'subsample': 0.9, 
    'alpha':1,
    'objective': 'binary:logistic', 
    'scale_pos_weight':100,
    'eval_metric': 'auc', 
    'nthread':4,
    'silent': 0
}

In [282]:
#dmtrain= xgb.DMatrix(data_train.values, target, feature_names=list(data_train.columns))
#dmvalid= xgb.DMatrix(data_test.values, feature_names=list(data_train.columns))
dmtrain= xgb.DMatrix(data_train[:train_size].values, target[:train_size], feature_names=list(data_train.columns))
dmvalid= xgb.DMatrix(data_train[train_size:].values, target[train_size:], feature_names=list(data_train.columns))

In [65]:
#del dmvalid
#data_train
#list(data_train.columns)
#data_train.info()

In [290]:
def objective(params):
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    watchlist = [(dmtrain, 'train'), (dmvalid, 'valid')]
    model = xgb.train(params, dmtrain, num_round, watchlist, maximize=True, early_stopping_rounds=20, verbose_eval=1)
    pred = model.predict(dmvalid, ntree_limit=model.best_ntree_limit)
    auc = roc_auc_score(dmvalid.get_label(), pred)
    del model, pred
    gc.collect()
    print(f"SCORE: {auc}")
#    print("params: ",params)
    return { 'loss': 1-auc, 'status': STATUS_OK }

In [284]:
# hyperparameter optimization space
# find more parameters in docs https://github.com/dmlc/xgboost/blob/443ff746e9723dcf571769b0d6ea28fbcb3e4a3f/doc/parameter.md
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 600, 50),
    #'n_estimators': 3, # WARNING: increse number of estimators, e.g. uncomment the above line (it's small for the sake of example)
    'eta': hp.quniform('eta', 0.025, 0.1, 0.025),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    'alpha' : hp.quniform('alpha', 0, 10, 1),
    'lambda': hp.quniform('lambda', 1, 2, 0.1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 50, 200, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': "approx", #"hist",
    'booster': 'gbtree',
    'nthread': 4, 
    'silent': 0
}

In [285]:
trials = Trials()

In [286]:
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100, # WARNING: increase number of evaluations (it's small for the sake of example)
    trials=trials
)
print("\n\n\n The best hyperparameters:")
print(best)

                                                                               
SCORE: 0.8644204731182796
params:                                                                        
{'alpha': 8.0, 'booster': 'gbtree', 'colsample_bytree': 0.9, 'eta': 0.07500000000000001, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'lambda': 1.2000000000000002, 'max_depth': 13, 'min_child_weight': 6.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 130.0, 'silent': 0, 'subsample': 0.8500000000000001, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8651339928315411
params:                                                                        
{'alpha': 6.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.6000000000000001, 'lambda': 1.0, 'max_depth': 10, 'min_child_weight': 4.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 140.0, 

{'alpha': 2.0, 'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.07500000000000001, 'eval_metric': 'auc', 'gamma': 0.65, 'lambda': 2.0, 'max_depth': 12, 'min_child_weight': 1.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 100.0, 'silent': 0, 'subsample': 0.8, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8652694480286738
params:                                                                        
{'alpha': 1.0, 'booster': 'gbtree', 'colsample_bytree': 1.0, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.9, 'lambda': 1.6, 'max_depth': 2, 'min_child_weight': 10.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 90.0, 'silent': 0, 'subsample': 0.9, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8606528458781363
params:                                                                        
{'alpha': 4.0, 'boo

{'alpha': 10.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 1.0, 'lambda': 1.6, 'max_depth': 8, 'min_child_weight': 9.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 90.0, 'silent': 0, 'subsample': 0.8, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8658245734767025
params:                                                                        
{'alpha': 6.0, 'booster': 'gbtree', 'colsample_bytree': 0.75, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.9500000000000001, 'lambda': 1.0, 'max_depth': 13, 'min_child_weight': 7.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 70.0, 'silent': 0, 'subsample': 0.9, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8661816200716846
params:                                                                        
{'alp

{'alpha': 1.0, 'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.9, 'lambda': 1.6, 'max_depth': 12, 'min_child_weight': 10.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 160.0, 'silent': 0, 'subsample': 0.8500000000000001, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8597246164874552
params:                                                                        
{'alpha': 2.0, 'booster': 'gbtree', 'colsample_bytree': 0.9500000000000001, 'eta': 0.1, 'eval_metric': 'auc', 'gamma': 1.0, 'lambda': 1.4000000000000001, 'max_depth': 9, 'min_child_weight': 8.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 200.0, 'silent': 0, 'subsample': 0.9500000000000001, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8643174767025089
params:                                 

{'alpha': 7.0, 'booster': 'gbtree', 'colsample_bytree': 0.75, 'eta': 0.025, 'eval_metric': 'auc', 'gamma': 0.5, 'lambda': 1.4000000000000001, 'max_depth': 11, 'min_child_weight': 7.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 70.0, 'silent': 0, 'subsample': 0.9, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8632552831541219
params:                                                                        
{'alpha': 4.0, 'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.1, 'eval_metric': 'auc', 'gamma': 0.55, 'lambda': 1.2000000000000002, 'max_depth': 6, 'min_child_weight': 8.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 50.0, 'silent': 0, 'subsample': 0.8, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8687148960573476
params:                                                               

{'alpha': 0.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.025, 'eval_metric': 'auc', 'gamma': 0.75, 'lambda': 1.0, 'max_depth': 9, 'min_child_weight': 10.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 50.0, 'silent': 0, 'subsample': 0.75, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8678354121863799
params:                                                                        
{'alpha': 1.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.025, 'eval_metric': 'auc', 'gamma': 0.8, 'lambda': 1.0, 'max_depth': 9, 'min_child_weight': 10.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 150.0, 'silent': 0, 'subsample': 0.75, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8657807598566309
params:                                                                        


{'alpha': 1.0, 'booster': 'gbtree', 'colsample_bytree': 0.9500000000000001, 'eta': 0.07500000000000001, 'eval_metric': 'auc', 'gamma': 0.65, 'lambda': 1.2000000000000002, 'max_depth': 4, 'min_child_weight': 9.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 130.0, 'silent': 0, 'subsample': 0.7000000000000001, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8685656774193548
params:                                                                        
{'alpha': 2.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.8, 'lambda': 1.0, 'max_depth': 4, 'min_child_weight': 10.0, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 120.0, 'silent': 0, 'subsample': 0.75, 'tree_method': 'approx'}
                                                                               
SCORE: 0.8525717562724013
params:                               

In [146]:
params = best

In [188]:
params = {
    'eta': 0.1, #0.1
    'max_depth': 13, #3
    'subsample': 0.75, #0.8
    'colsample_bytree': 0.7,
    
    'gamma': 0.5,
    'lambda': 1.4,
    'alpha': 0,
    'min_child_weight': 4, #0
    
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx',
    'nthread': 4,
    'scale_pos_weight': 170.0
}

In [254]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,12)
    for min_child_weight in range(0,8)
]

In [None]:
num_boost_round = 999
len (gridsearch_params)

In [256]:
max_auc = 0
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
#    cv_results = xgb.cv(
#        params,
#        dmtrain,
#        num_boost_round=num_boost_round,
#        seed=42,
#        nfold=5,
#        metrics={'auc'},
#        early_stopping_rounds=73
        #73
 #   )
    cv_results = xgb.cv(params, xgb.DMatrix(data_train, target),
                  early_stopping_rounds=100, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)
    
    # Update best MAE
#    mean_mae = cv_results['test-mae-mean'].min()    
    mean_auc = cv_results['test-auc-mean'].max()
#    boost_rounds = cv_results['test-mae-mean'].argmin()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
#    if mean_mae < min_mae:    
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

CV with max_depth=3, min_child_weight=0
	AUC 0.8777479999999999 for 1008 rounds
CV with max_depth=3, min_child_weight=1
	AUC 0.8792652000000001 for 554 rounds
CV with max_depth=3, min_child_weight=2
	AUC 0.8793759999999999 for 538 rounds
CV with max_depth=3, min_child_weight=3
	AUC 0.8793008 for 599 rounds
CV with max_depth=3, min_child_weight=4
	AUC 0.8795628000000001 for 543 rounds
CV with max_depth=3, min_child_weight=5
	AUC 0.8796772 for 616 rounds
CV with max_depth=3, min_child_weight=6
	AUC 0.8804130000000001 for 535 rounds
CV with max_depth=3, min_child_weight=7
	AUC 0.8794536000000001 for 559 rounds
CV with max_depth=4, min_child_weight=0
	AUC 0.876961 for 766 rounds
CV with max_depth=4, min_child_weight=1
	AUC 0.8786326000000001 for 312 rounds
CV with max_depth=4, min_child_weight=2
	AUC 0.8795912000000001 for 401 rounds
CV with max_depth=4, min_child_weight=3
	AUC 0.8798646 for 372 rounds
CV with max_depth=4, min_child_weight=4
	AUC 0.8792498 for 386 rounds
CV with max_depth=

KeyboardInterrupt: 

In [246]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [252]:
max_auc = 0
best_params = None
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
#    cv_results = xgb.cv(
#        params,
#        dmtrain,
#        num_boost_round=num_boost_round,
#        seed=42,
#        nfold=5,
#        metrics={'auc'},
#        early_stopping_rounds=73
#    )
    cv_results = xgb.cv(params, xgb.DMatrix(data_train, target),
                  early_stopping_rounds=100, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)
    
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (subsample,colsample)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

CV with subsample=1.0, colsample=1.0
	AUC 0.8755928000000001 for 417 rounds
CV with subsample=1.0, colsample=0.9
	AUC 0.8760705999999999 for 383 rounds
CV with subsample=1.0, colsample=0.8
	AUC 0.8759897999999999 for 414 rounds
CV with subsample=1.0, colsample=0.7
	AUC 0.8761713999999999 for 415 rounds
CV with subsample=0.9, colsample=1.0
	AUC 0.8772196000000001 for 360 rounds
CV with subsample=0.9, colsample=0.9
	AUC 0.8769624 for 414 rounds
CV with subsample=0.9, colsample=0.8
	AUC 0.8775906000000001 for 405 rounds
CV with subsample=0.9, colsample=0.7
	AUC 0.8781674 for 382 rounds
CV with subsample=0.8, colsample=1.0
	AUC 0.8778556 for 352 rounds
CV with subsample=0.8, colsample=0.9
	AUC 0.8784606 for 411 rounds
CV with subsample=0.8, colsample=0.8
	AUC 0.8770932 for 311 rounds
CV with subsample=0.8, colsample=0.7
	AUC 0.8784922 for 409 rounds
CV with subsample=0.7, colsample=1.0
	AUC 0.8782272000000001 for 317 rounds
CV with subsample=0.7, colsample=0.9
	AUC 0.8784646 for 315 rounds

In [8]:
def features_creation_my1(x): 
    features = []
#Cross-validation, ROC AUC: 0.876+-0.010, Trees: 309
    features.append(pd.Series(x['mcc_code'].value_counts(normalize=True).add_prefix('mcc_')))

#Cross-validation, ROC AUC: 0.853+-0.010, Trees: 647
#    features.append(pd.Series(x['mcc_hour'].value_counts(normalize=True).add_prefix('mcch_')))

#Cross-validation, ROC AUC: 0.705+-0.019, Trees: 238
#    features.append(pd.Series(x['tr_type'].value_counts(normalize=True).add_prefix('tr_')))
#    features.append(pd.Series(x['mcc_tr'].value_counts(normalize=True).add_prefix('mcc_tr_')))
    features.append(pd.Series(x['day'].value_counts(normalize=True).add_prefix('day_')))
    features.append(pd.Series(x['hour'].value_counts(normalize=True).add_prefix('hour_')))
    features.append(pd.Series(x['night'].value_counts(normalize=True).add_prefix('night_')))
    features.append(pd.Series(x[x['amount']>0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('positive_transactions_')))
    features.append(pd.Series(x[x['amount']<0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('negative_transactions_')))
 
    return pd.concat(features)

In [9]:
data_train = transactions_train.groupby(transactions_train.index)\
                               .progress_apply(features_creation_my1).unstack(-1)
data_test = transactions_test.groupby(transactions_test.index)\
                             .progress_apply(features_creation_my1).unstack(-1)

HBox(children=(IntProgress(value=0, description='Progress:', max=8400, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Progress:', max=3600, style=ProgressStyle(description_width='…




In [280]:
temp_=[0,1,2,3,4,5,6,7,8,9,10]
print (temp_[:6])
print (temp_[6:])
train_size = 4200

[0, 1, 2, 3, 4, 5]
[6, 7, 8, 9, 10]


In [281]:
dmtrain= xgb.DMatrix(data_train[:train_size].values, target[:train_size], feature_names=list(data_train.columns))
dmvalid= xgb.DMatrix(data_train[train_size:].values, target[train_size:], feature_names=list(data_train.columns))

In [232]:
def opt_params(param_range, text_opt):
    max_auc = 0
    best_params = None
    for param_x in param_range:
        print("CV with ", text_opt,"={}".format(param_x))
        # We update our parameters
        params[text_opt] = param_x
        # Run CV
        cv_results = xgb.cv(
            params,
            dmtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics={'auc'},
            early_stopping_rounds=73
            )
        # Update best MAE
        mean_auc = cv_results['test-auc-mean'].max()
        boost_rounds = cv_results['test-auc-mean'].argmax()
        print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
        if mean_auc > max_auc:
            max_auc = mean_auc
            best_params = param_x
    return text_opt, best_params, max_auc

In [236]:
def opt_params_cv(param_range, text_opt):
    max_auc = 0
    best_params = None
    for param_x in param_range:
        print("CV with ", text_opt,"={}".format(param_x))
        # We update our parameters
        params[text_opt] = param_x
        # Run CV
        mean_auc = cv_score(params, data_train, target)[0]
        if mean_auc > max_auc:
            max_auc = mean_auc
            best_params = param_x
    return text_opt, best_params, max_auc

In [239]:
num_boost_round = 999
opt_params_cv([.3, .2, .1, .05, .01, .005], "eta")
#print("Best params: {}, AUC: {}".format(best_params, max_auc))

CV with  eta =0.3
Cross-validation, ROC AUC: 0.867+-0.013, Trees: 75
CV with  eta =0.2
Cross-validation, ROC AUC: 0.872+-0.008, Trees: 101
CV with  eta =0.1
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 189
CV with  eta =0.05
Cross-validation, ROC AUC: 0.880+-0.010, Trees: 372
CV with  eta =0.01
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 1607
CV with  eta =0.005
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 2821


('eta', 0.05, 0.8795428)

In [243]:
print ("best param -",opt_params_cv(np.arange(0.5, 1, 0.05), "gamma")) #'gamma', 0.5, 1, 0.05

CV with  gamma =0.5
Cross-validation, ROC AUC: 0.879+-0.009, Trees: 387
CV with  gamma =0.55
Cross-validation, ROC AUC: 0.879+-0.009, Trees: 389
CV with  gamma =0.6000000000000001
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 394
CV with  gamma =0.6500000000000001
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 376
CV with  gamma =0.7000000000000002
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 339
CV with  gamma =0.7500000000000002
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 339
CV with  gamma =0.8000000000000003
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 339
CV with  gamma =0.8500000000000003
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 339
CV with  gamma =0.9000000000000004
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 369
CV with  gamma =0.9500000000000004
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 330
best param - ('gamma', 0.55, 0.8792262)


In [208]:
print ("best param -",opt_params(range(0, 10, 1), "alpha")) #'alpha', 0, 10, 1

CV with  alpha =0
	AUC 0.882091 for 603 rounds
CV with  alpha =1
	AUC 0.8810466 for 604 rounds
CV with  alpha =2
	AUC 0.880774 for 561 rounds
CV with  alpha =3
	AUC 0.8795006000000001 for 528 rounds
CV with  alpha =4
	AUC 0.8793846000000001 for 480 rounds
CV with  alpha =5
	AUC 0.8782835999999999 for 505 rounds
CV with  alpha =6
	AUC 0.8783915999999999 for 571 rounds
CV with  alpha =7
	AUC 0.8776746000000001 for 570 rounds
CV with  alpha =8
	AUC 0.8780469999999999 for 643 rounds
CV with  alpha =9
	AUC 0.8773896000000001 for 571 rounds
best param - ('alpha', 0, 0.882091)


In [237]:
#print ("best param -",opt_params(np.arange(0, 2, 0.1), "lambda")) #'lambda', 1, 2, 0.1
print ("best param -",opt_params_cv(np.arange(0, 2, 0.1), "lambda")) #'lambda', 1, 2, 0.1

CV with  lambda =0.0
Cross-validation, ROC AUC: 0.880+-0.010, Trees: 372
CV with  lambda =0.1
Cross-validation, ROC AUC: 0.878+-0.009, Trees: 317
CV with  lambda =0.2
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 397
CV with  lambda =0.30000000000000004
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 372
CV with  lambda =0.4
Cross-validation, ROC AUC: 0.878+-0.010, Trees: 333
CV with  lambda =0.5
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 368
CV with  lambda =0.6000000000000001
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 394
CV with  lambda =0.7000000000000001
Cross-validation, ROC AUC: 0.878+-0.009, Trees: 339
CV with  lambda =0.8
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 333
CV with  lambda =0.9
Cross-validation, ROC AUC: 0.880+-0.010, Trees: 422
CV with  lambda =1.0
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 421
CV with  lambda =1.1
Cross-validation, ROC AUC: 0.879+-0.010, Trees: 368
CV with  lambda =1.2000000000000002
Cross-validation, ROC AUC: 0.879+-0.010, T

In [229]:
#print ("best param -",opt_params(range(0, 200, 10), "scale_pos_weight")) #'scale_pos_weight', 50, 200, 10

In [257]:
params = {
    'eta': 0.05, #0.1
    'max_depth': 3, #3
    'subsample': 0.7, #0.8
    'colsample_bytree': 0.8,
    
    'gamma': 0.55, #0
    'lambda': 0,
    'alpha': 0,
    'min_child_weight': 4, #0
    
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx'
}

In [148]:
target = data_train.join(gender_train, how='inner')['gender']
#cv_score(xgb_reg_params, data_train, target)

In [258]:
cv_score(params, data_train, target)

Cross-validation, ROC AUC: 0.879+-0.009, Trees: 389


(0.8792262, 0.009485198308944294)

In [None]:
### Число деревьев для XGBoost имеет смысл выятавлять по результатам на кросс-валидации 
clf, submission = fit_predict(params, 372, data_train, data_test, target)
#clf, submission = fit_predict(params, 70, data_train, data_test, target)

In [None]:
#draw_feature_importances(clf, 10)

In [None]:
submission.to_csv('../data/gender_test_kaggle_sample_submission.csv')