<a href="https://colab.research.google.com/github/alexey9019/data/blob/master/Markdown_Guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Предсказание пола клиента по транзакциям

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook
from warnings import filterwarnings
filterwarnings('ignore')

%matplotlib inline

In [3]:
# Fetch the dataset repository; git clones it into ./data (see "Cloning into 'data'..." in the cell output).
# NOTE(review): the loading cell reads from /content/data, so this presumably relies on /content
# being the working directory (the Colab default) — confirm before running elsewhere.
!git clone https://github.com/alexey9019/data.git

Cloning into 'data'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 15 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (15/15), done.


In [0]:
# Lookup tables: both files are ';'-separated and indexed by their code column.
tr_mcc_codes = pd.read_csv(
    '/content/data/tr_mcc_codes.csv',
    sep=';',
    index_col='mcc_code',
)
tr_types = pd.read_csv(
    '/content/data/tr_types.csv',
    sep=';',
    index_col='tr_type',
)

# Transactions and the two customer files, all indexed by customer_id.
# NOTE(review): gender_train presumably carries the known labels and gender_test
# the customers to score — confirm against the CSV contents.
transactions = pd.read_csv(
    '/content/data/transactions.csv',
    index_col='customer_id',
)
gender_train = pd.read_csv(
    '/content/data/gender_train.csv',
    index_col='customer_id',
)
gender_test = pd.read_csv(
    '/content/data/gender_test.csv',
    index_col='customer_id',
)

# Inner joins on the shared customer_id index keep only the transactions of
# customers that appear in the corresponding customer file.
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')

Создадим вспомогательные функции: для оценки классификатора на кросс-валидации, для обучения модели и построения прогноза для тестовой части пользователей, а также для отображения важности признаков.

In [0]:
# Mean cross-validated ROC AUC on the training data
def cv_score(params, train, y_true):
    """Run 5-fold stratified XGBoost cross-validation and print the best ROC AUC.

    Args:
        params: XGBoost parameter dict. It must make ``xgb.cv`` report AUC
            (e.g. ``eval_metric='auc'``), because the result is read from the
            ``test-auc-mean`` and ``test-auc-std`` columns.
        train: Feature matrix in any form accepted by ``xgb.DMatrix``.
        y_true: Labels passed to ``xgb.DMatrix`` together with ``train``.

    Returns:
        None. A one-line summary (mean AUC, its std and the number of
        boosting rounds of the best iteration) is printed.

    Raises:
        KeyError: If the cross-validation result has no ``test-auc-mean``
            column (AUC was not the evaluated metric).
    """
    cv_res = xgb.cv(params, xgb.DMatrix(train, y_true),
                    early_stopping_rounds=10, maximize=True,
                    num_boost_round=10000, nfold=5, stratified=True)
    # Use a positional index: Series.argmax returned a label in old pandas and a
    # position in new pandas, so mixing it with .loc is version-dependent.
    best_pos = int(np.argmax(cv_res['test-auc-mean'].values))
    best_row = cv_res.iloc[best_pos]
    # Row i of the result holds the metric after i + 1 boosting rounds, so the
    # tree count is the position plus one (the bare index under-reports by one).
    print('Cross-validation, ROC AUC: {:.3f}+-{:.3f}, Trees: {}'.format(best_row['test-auc-mean'],
                                                                        best_row['test-auc-std'],
                                                                        best_pos + 1))

In [0]:
# Fit the model and produce predictions for the test part of the customers
def fit_predict(params, num_trees, train, test, target):
    """Train an XGBoost booster on ``train`` and score ``test``.

    Args:
        params: XGBoost parameter dict. It is not modified; if it contains
            ``'eta'``, the same value is also passed as ``'learning_rate'``.
        num_trees: Number of boosting rounds to train.
        train: DataFrame of training features; its column names become the
            feature names of the model.
        test: DataFrame of test features. Its values are taken positionally
            and labelled with ``train``'s column names, so its columns must be
            in the same order as in ``train``.
        target: Labels for the rows of ``train``.

    Returns:
        tuple: ``(clf, submission)`` where ``clf`` is the trained booster and
        ``submission`` is a DataFrame indexed like ``test`` with a single
        ``'probability'`` column holding the output of ``clf.predict``.
    """
    # Work on a copy so the caller's dict is not silently changed, and only
    # mirror 'eta' when it is actually present (avoids a KeyError otherwise).
    train_params = dict(params)
    if 'eta' in train_params:
        train_params['learning_rate'] = train_params['eta']

    feature_names = list(train.columns)
    dtrain = xgb.DMatrix(train.values, target, feature_names=feature_names)
    clf = xgb.train(train_params, dtrain, num_boost_round=num_trees, maximize=True)

    dtest = xgb.DMatrix(test.values, feature_names=feature_names)
    y_pred = clf.predict(dtest)
    submission = pd.DataFrame(index=test.index, data=y_pred, columns=['probability'])
    return clf, submission

In [0]:
# Plot the importance of the features
def draw_feature_importances(clf, top_k=10):
    """Show a horizontal bar chart of the highest-scoring features.

    Args:
        clf: Object whose ``get_score()`` returns a mapping from feature name
            to importance score.
        top_k: How many of the highest-scoring features to draw.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 10))

    # Sort ascending by score and keep the tail, so the bars are drawn in
    # increasing order of importance.
    ranked = sorted(clf.get_score().items(), key=lambda pair: pair[1])
    top_features = ranked[-top_k:]
    names = [name for name, _ in top_features]
    scores = [score for _, score in top_features]
    bar_positions = np.arange(len(top_features))

    plt.barh(bar_positions, scores, align='center', color='green')
    plt.yticks(bar_positions, names, fontsize=12)
    plt.xticks(fontsize=12)
    plt.xlabel('Важность переменной', fontsize=15)
    # Half a bar of padding below the first and above the last bar.
    plt.ylim(-0.5, len(top_features) - 0.5)
    plt.show()