In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import OneHotEncoder
from IPython.display import display

CATEGORICAL_COLS = [
    'ind_empleado',
    'pais_residencia',
    'conyuemp',
    'sexo',
    'indrel_1mes',
    'tiprel_1mes',
    'indresi',
    'indext',
    'canal_entrada',
    'indfall',
    'cod_prov',
    'segmento'
]

UNUSED_COLS = [
    'nomprov',
    'day',
    'day_alta'
]

Y_COLS = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1'
]

DATE_COLS = [
    'fecha_dato',
    'fecha_alta',
    'ult_fec_cli_1t'
]

ID_COLS = [
    'ncodpers'
]

In [2]:
def preprocess(in_data):
    dataset = in_data

    for col in list(dataset.columns):
        if col in CATEGORICAL_COLS:
            dataset[col].fillna("NA", inplace=True)
            dataset[col] = dataset[col].astype("category")
        elif col in UNUSED_COLS:
            dataset = dataset.drop(col, axis=1)
        elif col in ['antiguedad', 'age']:
            dataset[col] = dataset[col].replace(r'^\s*NA\s*$', 0, regex=True)
            dataset[col] = dataset[col].astype(np.int8)
        elif dataset[col].dtype in ['object', np.str]:
            dataset[col] = dataset[col].replace(r'^\s*$', 'NA', regex=True)

    for date_col in DATE_COLS:
        data = dataset[date_col]
    
        dataset["year-%s" % date_col] = data.dt.year
        dataset["year-%s" % date_col] = data.dt.month
        dataset["year-%s" % date_col] = data.dt.day
        dataset['isweekend'] = (data.dt.weekday >= 5).astype(int)
        dataset = dataset.drop(date_col, axis =1)


    return dataset

In [5]:
data_type_dict = {
    'ncodpers': np.str,
}

train_data = pd.read_csv("../inputs/train_1_000_000.csv", dtype=data_type_dict, parse_dates=DATE_COLS)
test_data = pd.read_csv("../inputs/test_10_000.csv", dtype=data_type_dict, parse_dates=DATE_COLS)

print("Train data shape: " + format(train_data.shape))
print("Test data shape:" + format(test_data.shape))

train_data = preprocess(train_data)
test_data = preprocess(test_data)
train_data.describe()
test_data.describe()

  interactivity=interactivity, compiler=compiler, result=result)


Train data shape: (999999, 48)
Test data shape:(9999, 24)


Unnamed: 0,age,ind_nuevo,antiguedad,indrel,tipodom,ind_actividad_cliente,year-fecha_dato,isweekend,year-fecha_alta,year-ult_fec_cli_1t
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,13.0
mean,27.578358,0.0015,33.423742,1.127413,1.0,0.373937,28.0,0.0,13.769377,14.307692
std,9.67755,0.038705,1.913346,3.531496,0.0,0.483872,0.0,0.0,7.915475,9.83518
min,4.0,0.0,0.0,1.0,1.0,0.0,28.0,0.0,1.0,2.0
25%,22.0,0.0,33.0,1.0,1.0,0.0,28.0,0.0,9.0,7.0
50%,23.0,0.0,34.0,1.0,1.0,0.0,28.0,0.0,11.0,14.0
75%,28.0,0.0,34.0,1.0,1.0,1.0,28.0,0.0,20.0,22.0
max,97.0,1.0,34.0,99.0,1.0,1.0,28.0,0.0,30.0,28.0


In [None]:
train_data[CATEGORICAL_COLS] = train_data[CATEGORICAL_COLS].apply(lambda x: x.cat.codes)

In [None]:
param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"

X_train = train_data

for y_col in Y_COLS:
    print("train:", y_col)
    X_without_y = X_train[list(set(train_data.columns) - set(Y_COLS))]
    Y = X_train[y_col]
    X_float = X_without_y.astype(np.float64)
    
    dtrain = xgb.DMatrix(X_float, label=Y)
    #dtest = xgb.DMatrix(X)
    
    watchlist  = [(dtrain,'train')]
    num_round = 300
    early_stopping_rounds=10
    bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

    # ypred = bst.predict(dtest)
    #output = pd.DataFrame({ 'ncodpers' : test_data['ncodpers'], y_col: ypred})
    #display(output)
    #output.describe()


train: ind_ahor_fin_ult1
[0]	train-auc:0.690349
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.688083
[2]	train-auc:0.684881
[3]	train-auc:0.681578
[4]	train-auc:0.678716
[5]	train-auc:0.67624
[6]	train-auc:0.674154
[7]	train-auc:0.672493
[8]	train-auc:0.671025
[9]	train-auc:0.669742
[10]	train-auc:0.668581
Stopping. Best iteration:
[0]	train-auc:0.690349

train: ind_aval_fin_ult1
[0]	train-auc:0.780617
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.777278
[2]	train-auc:0.773426
[3]	train-auc:0.769502
[4]	train-auc:0.765705
[5]	train-auc:0.762504
[6]	train-auc:0.759657
[7]	train-auc:0.75719
[8]	train-auc:0.755218
[9]	train-auc:0.75371
[10]	train-auc:0.752335
Stopping. Best iteration:
[0]	train-auc:0.780617

train: ind_cco_fin_ult1
[0]	train-auc:0.667345
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.671296
[2]	train-auc:0.675242
[3]	train-auc:0.678913
[4]	train-auc:0.682118
[5]	train-auc:0.684974
[6]	train-au