In [20]:
import pandas as pd
import numpy as np
import lightgbm as lg 

In [9]:
cd = pd.read_csv('./train.cd', sep='\t', header=-1)
data = pd.read_csv('./train.txt', sep='\t', header=-1)

In [51]:
y = data.iloc[:, 1].values
X = data.iloc[:, 4:].values
features = [str(i) for i in range(500)]

Feature selection

In [99]:
# imitation of standard catboost parameters

lg_params = {
    'nthread': 4,
    'objective': 'regression',
    'metric': 'rmse',
    'n_iterations': 200, 
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.66, 
    'bagging_freq': 1,
    'feature_fraction': 1,
}

In [100]:
def lg_fit(lg_params, dstrain, dstest, verbose=10):
    evals_results = {}
    model = lg.train(lg_params, 
                     dstrain, 
                     valid_sets=[dstrain, dstest], 
                     valid_names=['train', 'test'], 
                     evals_result=evals_results, 
                     verbose_eval=verbose)
    return model, evals_results

def cost(q):
    return 1 + 3*min(1, (1.4 - q)/0.15)

def rmse(y, yhat):
    return np.sqrt(np.mean(np.power(y.squeeze() - yhat.squeeze(), 2)))


In [101]:
from sklearn.model_selection import KFold

cv = KFold(5, random_state=42, shuffle=True)

costs = np.zeros(5)
errors = np.zeros(5)
preds = np.zeros(X.shape[0])
fi = np.zeros(500)
for i, (train_idx, test_idx) in enumerate(cv.split(X)):
    # fit model
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    dstrain = lg.Dataset(X_train, y_train, feature_name=features)
    dstest = lg.Dataset(X_test, y_test, feature_name=features)    
    model, history = lg_fit(lg_params, dstrain, dstest, verbose=0)
    # predict
    preds[test_idx] = model.predict(X_test)
    errors[i] = rmse(preds[test_idx], y_test)
    # update costs
    q = history['test']['rmse'][model.best_iteration-1]
    estimated_cost = cost(q)
    costs[i] = estimated_cost
    # feature importances
    fi += model.feature_importance()

In [102]:
print("rmse: {:.3f} +- {:.3f}".format(errors.mean(), errors.std()))
print("cost: {:.3f} +- {:.3f}".format(costs.mean(), costs.std()))

rmse: 1.353 +- 0.027
cost: 1.939 +- 0.533


In [103]:
# extract top features
top1 = np.where(fi > 20)[0]
top2 = np.where(fi > 30)[0]
top3 = np.where(fi > 50)[0]

Check different sets of features

In [104]:
def cv_top(X, y, features):

    n_features = features.shape[0]
    X = X[:, features]
    feature_name = [str(f) for f in features]
    
    costs = np.zeros(5)
    errors = np.zeros(5)
    preds = np.zeros(X.shape[0])
    fi = np.zeros(n_features)
    for i, (train_idx, test_idx) in enumerate(cv.split(X)):
        # fit model
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        dstrain = lg.Dataset(X_train, y_train, feature_name=feature_name)
        dstest = lg.Dataset(X_test, y_test, feature_name=feature_name)
        model, history = lg_fit(lg_params, dstrain, dstest, verbose=0)
        # predict
        preds[test_idx] = model.predict(X_test)
        errors[i] = rmse(preds[test_idx], y_test)
        # update costs
        q = history['test']['rmse'][model.best_iteration-1]
        estimated_cost = cost(q)
        costs[i] = estimated_cost
        # feature importances
        fi += model.feature_importance()

    print("rmse: {:.3f} +- {:.3f}".format(errors.mean(), errors.std()))
    print("cost: {:.3f} +- {:.3f}".format(costs.mean(), costs.std()))
    return fi

print("top1")
fi_top1 = cv_top(X, y, top1)
print("top2")
fi_top1 = cv_top(X, y, top2)
print("top3")
fi_top1 = cv_top(X, y, top3)

top1
rmse: 1.343 +- 0.023
cost: 2.144 +- 0.456
top2
rmse: 1.293 +- 0.032
cost: 3.101 +- 0.594
top3
rmse: 1.164 +- 0.020
cost: 4.000 +- 0.000


In [105]:
' '.join(map(str, top3.tolist()))

'28 48 90 110 111 129 193 208 283 292 337 345 469 482'