
###  Essa solucao foi baseada no GitHub
### https://github.com/MiladShahidi/Fraud-Detection-XGBoost/blob/master/XGBoost_Fraud_Detection.ipynb


In [21]:

# Importando as bibliotecas necessarias

import pandas as pd
import numpy as np
# For Bayesian hyper-parameter optimization
import hyperopt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, precision_recall_curve, recall_score, precision_score
from functools import partial
# To supress a deprecation warning caused due to an issue between XGBoost and SciPy
import warnings
import seaborn as sns

MIN_PRECISION = 0.05

# The current version of XGBoost uses a conditional statement that
# the current version SciPy (internally used by XGBoost) doesn't like.
# This supresses SciPy's deprecation warning message
warnings.filterwarnings('ignore', category = DeprecationWarning)



In [8]:
# Importando as bases de dados

bdatrasos=pd.read_csv(r'C:\Base - Hackapan\BASE_ATRASOS_HACKAPAN.txt',sep=";",encoding='cp1252')
bdtrans=pd.read_csv(r'C:\Base - Hackapan\BASE_TRANSACOES_HACKAPAN.txt',sep=";",encoding='cp1252')
bdcartoes=pd.read_csv(r'C:\Base - Hackapan\BASE_CARTOES_HACKAPAN.txt',sep=";",encoding='cp1252')


In [11]:
bdcartoes.head(3)

Unnamed: 0,ID,FRAUDE,DT_CONTRATACAO,DATA_ATIVACAO,LIMITE_CREDITO_CARTAO,ORIGEM_VENDA_CONTRATO,DATA_NASCIMENTO_CLIENTE,GENERO,ESTADO_CIVIL,NATURALIDADE,...,CIDADE3_BUREAU1,CIDADE4_BUREAU1,CIDADE5_BUREAU1,RENDA_INFORMADA,RENDA_BUREAU1,RENDA_BUREAU2,SCORE_CREDITO_BUREAU1,SCORE_CREDITO_BUREAU2,SCORE_FRAUDE,FLAG_DESENVOLVIMENTO
0,E6524F55,0,2018-06-28 00:00:00,2018-07-12,1050,INBOUND,1971-07-07,MASCULINO,,,...,LINHARES,LINHARES,LINHARES,420000,1000,3550.0,470.0,629.0,919.0,0
1,6E8857A2,0,2018-06-05 00:00:00,2018-07-23,2700,OUTBOUND,1962-03-30,MASCULINO,CASADO,BR,...,DUQUE DE CAXIAS,DUQUE DE CAXIAS,,300000,998,1800.0,815.0,893.0,703.0,1
2,64A9E7CB,0,2018-06-13 00:00:00,2019-03-27,400,OUTBOUND,1992-01-13,FEMININO,CASADO,BRASIELIRA,...,PARDINHO,IGARACU DO TIETE,IGARACU DO TIETE,150000,886,1350.0,509.0,349.0,685.0,0


In [12]:
bdatrasos.head(3)


Unnamed: 0,ID,REFERENCIA,QT_DIAS_ATRASO
0,B4DABB73,2018-06-30,0
1,55AC2B52,2018-12-31,0
2,AF92CC5F,2018-11-30,128


In [13]:
bdtrans.head(3)

Unnamed: 0,ID,DATA_HORA,LIMITE_DISPONIVEL_APOS_TRANSACAO,VALOR_TRANSACAO,APROVADO_NEGADO,COMPRA_PRESENCIAL,NUMERO_PARCELAS,PAIS,BANDEIRA,VARIANTE_CARTAO,CODIGO_LOJISTA_MCC,RAMO_LOJISTA,LOJA
0,5BDA5905,2018-02-22 17:30:26,1938,1876,APR,N PRES,,BRA,M,INT,5965,M.O.T.O.,MERCPAGO*NULL
1,5BDA5905,2018-02-22 15:25:29,2125,40,APR,N PRES,,BRA,M,INT,7338,SERVIÇO,PG *LOJADOCONVITE
2,5BDA5905,2018-02-22 13:11:44,2165,588,APR,PRES,,BRA,M,INT,5651,VESTUARIO,CRISTIANE MORENO SILVA


In [None]:




# Aqui vai o codigo de preparacas das variaveis





In [16]:
# Definicao da funcao que vai evoluir o classificador baysiano

def objective(params, X, y, X_early_stop, y_early_stop, scorer, n_folds = 10):

    pos_count = y_train.sum()
    neg_count = len(y_train) - pos_count
    imbalance_ratio = neg_count / pos_count
    
    xgb_clf = XGBClassifier(**params, scale_pos_weight=imbalance_ratio,
                            n_estimators = 2000, n_jobs = 1)

    xgb_fit_params = {'early_stopping_rounds': 50,
                      'eval_metric': ['logloss'],
                      'eval_set': [(X_early_stop, y_early_stop)],
                      'verbose': False
                      }
    
    cv_score = np.mean(cross_val_score(xgb_clf, X_train, y_train, cv = n_folds,
                               fit_params = xgb_fit_params, n_jobs = -1,
                               scoring = scorer))
    
    # hypoeropt minimizes the loss, hence the minus sign behind cv_score
    return {'loss': -cv_score, 'status': hyperopt.STATUS_OK, 'params': params}




In [15]:
# Esta funcao checa aprecisao do estimacao e retorno o valor que para a procura do estimador

def conditional_recall_score(y_true, pred_proba, precision = MIN_PRECISION):
    # Since the PR curve is discreet it might not contain the exact precision value given
    # So we first find the closest existing precision to the given level
    # Then return the highest recall acheiveable at that precision level
    # Taking max() helps in case PR curve is locally flat
    # with multiple recall values for the same precision
    pr, rc,_ = precision_recall_curve(y_true, pred_proba[:,1])
    return np.max(rc[pr >= min_prec])


In [17]:
# Esta e a funcao que "cola" as coisas que foram feitas nas outras duas funcoes...


def tune_xgb(param_space, X_train, y_train, X_early_stop, y_early_stop, n_iter):    
    scorer = make_scorer(conditional_recall_score, needs_proba=True)

    # hyperopt.fmin will only pass the parameter values to objective. So we need to
    # create a partial function to bind the rest of the arguments we want to pass to objective
    obj = partial(objective, scorer = scorer, X = X_train, y = y_train,
                  X_early_stop = X_early_stop, y_early_stop = y_early_stop)

    # A trials object that will store the results of all iterations
    trials = hyperopt.Trials()
    
    hyperopt.fmin(fn = obj, space = param_space, algo = hyperopt.tpe.suggest,
                         max_evals = n_iter, trials = trials)
    
    # returns the values of parameters from the best trial
    return trials.best_trial['result']['params']




In [None]:

# Essa e a parte que vai dar discussao, os parametros do estimador baysiano

param_space = {
        'learning_rate': hyperopt.hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
        'max_depth': hyperopt.hp.choice('max_depth', [2, 4, 6, 8, 10]),
        'subsample': hyperopt.hp.uniform('subsample', 0.25, 1),
        'colsample_bytree': hyperopt.hp.uniform('colsample_bytree', 0.7, 1.0),
        'min_child_weight': hyperopt.hp.choice('min_child_weight', [1, 3, 5, 7]),
        'reg_alpha': hyperopt.hp.uniform('reg_alhpa', 0, 1.0),
        # Avoiding lambda = 0. There is a Github issue on strange behaviour with lambda = 0
        'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0.01, 1.0),
        }



In [18]:

# Esta funcao trata de medir o threshould de se adicionar ou nao mais um hiperparametro no ajuste do modelo

def optimal_threshold(estimator, X, y, n_folds = 10, min_prec = 0.05, fit_params = None):
    
    cv_pred_prob = cross_val_predict(estimator, X, y, method='predict_proba',
                                     cv = n_folds, fit_params=fit_params, n_jobs=-1)[:,1]

    # Once again, the PR curve is discreet and may not contain the exact precision level
    # we are looking for. So, we need to find the closest existing precision
    pr, _, threshold = precision_recall_curve(y, cv_pred_prob)
    # precision is always one element longer than threshold and the last one is always set to 1
    # So I drop the last element of precision so I can use it below to index threshold
    pr = pr[:-1]
    return min(threshold[pr >= min_prec])

# Avaliar se manteremos esse calculo para o problema do hackathon (???)


In [20]:

# Dado o resultado a funcao anterior calcula o Falso Positivo e retorna um matriz.


def thresholded_predict(X, estimator, threshold):
    return np.array([1 if (p >= threshold) else 0 for p in estimator.predict_proba(X)[:,1]])

# Avaliar se manteremos esse calculo para o problema do hackathon (???)


In [32]:
# cria uma versao simplificada da base de cartoes
teste = bdcartoes[['FLAG_DESENVOLVIMENTO', 'FRAUDE','LIMITE_CREDITO_CARTAO','SCORE_FRAUDE','SCORE_CREDITO_BUREAU1','SCORE_CREDITO_BUREAU2','RENDA_BUREAU2']]



In [36]:

# Essa e a parte importante que precisamos alterar do codigo para rogar. 
# Vamos criar uma pequena amostra primeiro pra ver se precisamos adatpar algo no codigo em que nos baseamos



if __name__ == "__main__":    
    # Loading the data
    # feito anteriormente - data = pd.read_csv('creditcard.csv')
    aux1 = teste.drop('FRAUDE', axis = 1)
    X = aux1.values
    aux2 = teste['FRAUDE']
    y = aux2.values
    
    
    # Train/test split, 80/20, random_state set for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,
                                                        test_size = 0.2, random_state = 1)

    # Further splitting the initial training set so that 10% of all data(1/8 of 80%) 
    # can be used as the evaluation set by XGBoost for early stopping
    X_train, X_early_stop, y_train, y_early_stop = train_test_split(X_train, y_train,test_size = 1/8,
                                                                    stratify = y_train, random_state = 1)
    
    
    # The prior probability distribution of parameters for Bayesian optimization
    param_space = {
            'learning_rate': hyperopt.hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
            'max_depth': hyperopt.hp.choice('max_depth', [2, 4, 6, 8, 10]),
            'subsample': hyperopt.hp.uniform('subsample', 0.25, 1),
            'colsample_bytree': hyperopt.hp.uniform('colsample_bytree', 0.7, 1.0),
            'min_child_weight': hyperopt.hp.choice('min_child_weight', [1, 3, 5, 7]),
            'reg_alpha': hyperopt.hp.uniform('reg_alhpa', 0, 1.0),
            # Avoiding lambda = 0. There is a Github issue on strange behaviour with lambda = 0
            'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0.01, 1.0),
            }

    # # # # # # # # #
    # Step 1: Tuning hyper-parameters of the XGBoost classifier
    # # # # # # # # #
    print('Step 1: Tuning hyper-parameters using Bayesian Optimization\n')

    best_params = tune_xgb(param_space, X_train, y_train, X_early_stop, y_early_stop, n_iter = 150)
    
    print('\tThe best hyper-parameters found:\n')
    print(*['\t\t%s = %s' % (k, str(round(v, 4))) for k, v in best_params.items()], sep='\n')

    # # # # # # # # #
    # Step 2: Empirical thresholding: finding optimal classification threshold
    # # # # # # # # #
    print('\nStep 2: Empirical Thresholding\n')
    
    # I use 1500 trees which is very close to optimal n_trees found by early stopping while tuning
    xgboost_clf = XGBClassifier(**best_params, n_estimators=1500)
    
    classification_cutoff = optimal_threshold(xgboost_clf, X_train, y_train, min_prec = MIN_PRECISION)
    
    print('\tOptimal classification threshold = %1.3f' % classification_cutoff)
    
    # # # # # # # # #
    # Setp 3: Training and testing the model
    # # # # # # # # #
    print('\nStep 3: Training and testing the model\n')
    
    # Training on all the training data (excluding the small validation set to avoid overfitting)
    xgboost_clf.fit(X_train, y_train, verbose = False)
    
    y_pred = thresholded_predict(X_test, xgboost_clf, threshold = classification_cutoff)
    
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    
    print('\tTest set performance:')
    print('\tRecall    = %2.3f' % test_recall)
    print('\tPrecision = %2.3f' % test_precision)


teste 1
Step 1: Tuning hyper-parameters using Bayesian Optimization

  0%|                                 | 0/150 [00:00<?, ?it/s, best loss: ?]


IndexError: too many indices for array