In [11]:
import os
import random

import pandas as pd
import numpy as np

import xgboost as xgb

# Every file this notebook reads or writes lives under DATA_DIR.
DATA_DIR = 'data'

# Inputs: training features, training labels, and the features to score.
TRAIN_X_FILENAME = os.path.join(DATA_DIR, 'crx_data_train_x.csv')
TRAIN_Y_FILENAME = os.path.join(DATA_DIR, 'crx_data_train_y.csv')
TEST_X_FILENAME = os.path.join(DATA_DIR, 'crx_data_test_x.csv')

# Output: the predictions file written by the last cell.
RESULT_FILENAME = os.path.join(DATA_DIR, 'result.csv')

In [2]:
def read_file(filename: str) -> pd.DataFrame:
    """Load a comma-separated file that has no header row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame whose columns are labelled with integer positions
        (0, 1, 2, ...) because no header row is consumed.
    """
    return pd.read_csv(filename, header=None, sep=',')


def categorical(train, test, col):
    """One-hot encode column ``col`` of ``train`` and ``test`` over a shared vocabulary.

    The vocabulary is the sorted union of the distinct values found in both
    frames, with the missing-value marker ``'?'`` excluded. A row holding
    ``'?'`` is encoded as all zeros.

    Args:
        train: DataFrame containing column ``col``.
        test: DataFrame containing column ``col``.
        col: Label of the column to encode.

    Returns:
        A ``(train_encoded, test_encoded)`` tuple of DataFrames. Each keeps the
        index of its input (named ``'idx'``) and has one 0/1 integer column
        named ``'<col>_<value>'`` per vocabulary entry, in sorted value order.
    """
    # Parenthesise the union: `-` binds tighter than `|`, so without the
    # parentheses '?' would only be removed from the test values and a
    # constant-zero '<col>_?' column would appear whenever train contains '?'.
    uniques = sorted((set(train[col].unique()) | set(test[col].unique())) - {'?'})
    columns = ['{}_{}'.format(col, u) for u in uniques]

    def make_result(df):
        values = df[col]
        # One vectorised comparison per vocabulary entry; '?' matches none of
        # them, so missing rows come out as all zeros. Plain arrays are used so
        # that no index alignment happens when the frame is assembled.
        encoded = {
            name: (values == u).to_numpy().astype('int64')
            for name, u in zip(columns, uniques)
        }
        result = pd.DataFrame(encoded, index=df.index, columns=columns)
        result.index.name = 'idx'
        return result

    return make_result(train), make_result(test)


def preprocessing(train, test, seed=None):
    """Turn the raw feature frames into all-numeric frames.

    Three hard-coded groups of columns are handled:

    * columns 0, 3, 4, 5, 6, 8, 9, 11, 12 are replaced by the one-hot columns
      produced by ``categorical``;
    * column 1 is converted to float, with ``'?'`` entries replaced by the mean
      of the train column (the same train mean is used for test);
    * column 13 is converted to int, with ``'?'`` entries replaced by a value
      drawn at random from the known train values of that column.

    Args:
        train: Raw training features, columns labelled by integer position.
        test: Raw test features with the same column layout.
        seed: Optional seed for the random imputation of column 13. ``None``
            (the default) draws from the global ``random`` module state, as
            before; any other value makes the imputation reproducible.

    Returns:
        A ``(train, test)`` tuple of processed DataFrames.
    """
    categorical_idx = [0, 3, 4, 5, 6, 8, 9, 11, 12]
    for idx in categorical_idx:
        train_res, test_res = categorical(train, test, idx)
        # Drop the raw column and append its encoding; building new frames
        # keeps the caller's inputs untouched.
        train = pd.concat([train.drop(columns=idx), train_res], axis=1, sort=False)
        test = pd.concat([test.drop(columns=idx), test_res], axis=1, sort=False)

    def to_float(x):
        # None becomes a missing value that fillna() replaces below.
        return float(x) if x != '?' else None

    nan_float_idx = [1]
    for idx in nan_float_idx:
        train[idx] = train[idx].apply(to_float)
        test[idx] = test[idx].apply(to_float)
        # Impute with the train mean only, so nothing is learned from test.
        mean_value = train[idx].mean()
        train[idx] = train[idx].fillna(mean_value)
        test[idx] = test[idx].fillna(mean_value)

    rng = random if seed is None else random.Random(seed)

    nan_int_idx = [13]
    for idx in nan_int_idx:
        # Iterate the values directly: Series.iteritems() no longer exists in
        # pandas 2.x, and the index labels were never used here.
        observed = [int(item) for item in train[idx] if item != '?']

        def to_int(x, observed=observed):
            return int(x) if x != '?' else rng.choice(observed)

        train[idx] = train[idx].apply(to_int)
        test[idx] = test[idx].apply(to_int)

    return train, test

In [5]:
def pipeline():
    """Train an XGBoost model on the train files and score the test file.

    Reads the three CSV files named by the module-level constants, runs
    ``preprocessing`` on the feature frames, fits a booster with the
    ``binary:logistic`` objective and returns its predictions for the test rows.

    Returns:
        The array produced by ``Booster.predict`` on the test matrix.
    """
    features_train = read_file(TRAIN_X_FILENAME)
    features_test = read_file(TEST_X_FILENAME)
    labels_train = read_file(TRAIN_Y_FILENAME)

    features_train, features_test = preprocessing(features_train, features_test)

    # Both frames must expose the same number of feature columns.
    n_train_cols = features_train.shape[1]
    n_test_cols = features_test.shape[1]
    assert n_train_cols == n_test_cols

    dtrain = xgb.DMatrix(data=features_train, label=labels_train)
    dtest = xgb.DMatrix(data=features_test)

    booster_params = dict(
        objective="binary:logistic",
        colsample_bytree=0.3,
        learning_rate=0.1,
        max_depth=5,
        alpha=10,
    )
    booster = xgb.train(params=booster_params, dtrain=dtrain)

    return booster.predict(dtest)

In [6]:
# Run the whole train-and-score pipeline; `predict` is reused by the export cell.
predict = pipeline()

# Show the raw model outputs for a quick visual check.
print(predict)

[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=5
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=4
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=4
[21:00:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=4
[21:00:52] /work

In [13]:
# Round the scores to hard 0/1 labels, then flip them with `1 - ...`.
# NOTE(review): the flip presumably matches how the classes are encoded in the
# train_y file -- confirm.
# fmt='%d' writes plain integers; np.savetxt's default format ('%.18e') would
# write the labels in scientific notation despite the int cast.
np.savetxt(RESULT_FILENAME, 1 - predict.round().astype(int), delimiter=',', fmt='%d')