## Summary

The idea is to start with a superset of predictor columns and find a subset of them that maximizes the validation score, using a stepwise greedy algorithm.

In [3]:
import pandas as pd
import numpy as np
import time as time
import xgboost as xgb

In [8]:
# Relative path of the directory holding the input CSV files read below.
INPUT_DIR = '../input/'

In [9]:
# Read the training set; the final expression displays the elapsed wall-clock seconds.
ts = time.time()
train = pd.read_csv(INPUT_DIR + 'train.csv')
time.time() - ts

5.548439979553223

In [10]:
# Read the test set; the final expression displays the elapsed wall-clock seconds.
ts = time.time()
test = pd.read_csv(INPUT_DIR + 'test.csv')
time.time() - ts

70.9879629611969

In [11]:
# Model the target on a log scale: new_target = log(1 + target).
# np.log1p computes log(1 + x) directly and stays accurate for small x,
# unlike the hand-written np.log(x + 1.0).
train['new_target'] = np.log1p(train['target'])

### Formalizing the code to a routine.

In [13]:
 def get_model_with_predictors_beyond_given_non_zero_threshold(percent_threshold=10,
                                                               show_evaluation = True,
                                                               data=train):
     """Cross-validate an xgboost model on the columns picked by get_rel_cols.

     Parameters
     ----------
     percent_threshold : number
         Forwarded to get_rel_cols; only columns whose percentage of non-zero
         entries is strictly greater than this value are used as predictors.
     show_evaluation : bool
         Currently unused; kept so existing callers keep working.
     data : DataFrame
         Frame providing both the predictor columns and the 'new_target'
         column. Defaults to the module-level ``train`` frame as it existed
         when this function was defined.

     Returns
     -------
     tuple
         ``(number of selected columns, result of xgb.cv)``.
     """
     rel_cols = get_rel_cols(percent_threshold, data)

     ### Now, let us train gradient boosting model using xgboost

     X_COLUMNS = rel_cols
     Y_COLUMN = 'new_target'

     # Build the design matrix from the same frame the columns were selected on.
     # (Previously this read the global `train`, silently ignoring `data`.)
     X = data[X_COLUMNS]
     Y = data[[Y_COLUMN]]

     xgb_complete_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)
     xgb_params = {'eta' : 0.01, 'eval_metric' : 'rmse'}

     # 5-fold CV, up to 1000 rounds, stopping once the metric has not improved
     # for 5 consecutive rounds.
     model_xgboost = xgb.cv(params=xgb_params,
                            dtrain=xgb_complete_data,
                            num_boost_round=1000,
                            nfold=5,
                            early_stopping_rounds=5)

     return (len(rel_cols), model_xgboost)


In [14]:
# NOTE(review): presumably the percent-non-zero threshold chosen for get_rel_cols;
# it equals the threshold used to build FULL_COLS_LIST — confirm.
optim_percent=17

## Find the model using a column subset that maximizes out of sample score

In [23]:
def get_rel_cols(percent_threshold, data):
    """Return the columns of `data` that are non-zero often enough.

    The 'ID', 'target' and 'new_target' columns are never considered. For each
    remaining column the percentage of non-zero entries is computed; columns
    whose percentage is strictly greater than `percent_threshold` are returned
    as a list, ordered from the highest percentage to the lowest.
    """
    excluded = ('ID', 'target', 'new_target')
    candidates = [name for name in data.columns if name not in excluded]

    # Percentage of non-zero entries, one slot per candidate column.
    non_zero_pct = np.zeros(len(candidates))
    for pos, name in enumerate(candidates):
        values = data[name].to_numpy()
        non_zero_pct[pos] = 100.0 * len(values.nonzero()[0]) / len(values)

    # Number of columns passing the threshold; they are exactly the first
    # `keep_count` entries of the descending ranking below.
    keep_count = np.sum(non_zero_pct > percent_threshold)
    ranked = np.argsort(non_zero_pct)[::-1]

    return [candidates[idx] for idx in ranked[0:keep_count]]

In [16]:
# Candidate predictor pool for the greedy search: columns of `train` whose
# percentage of non-zero entries exceeds `optim_percent` (defined in an earlier
# cell), instead of repeating the threshold as a literal here.
FULL_COLS_LIST = get_rel_cols(optim_percent, train)

In [17]:
def get_prediction_score(X_COLUMNS, data=train):
    """Score a set of predictor columns by cross-validated RMSE.

    Runs xgb.cv (5 folds, at most 1000 boosting rounds, early stopping after
    5 rounds, eta 0.01, rmse metric) with `X_COLUMNS` of `data` as features and
    the 'new_target' column as the label, and returns the smallest value found
    in the 'test-rmse-mean' column of the result. Lower is better.
    """
    features = data[X_COLUMNS]
    label = data[['new_target']]
    cv_input = xgb.DMatrix(features, label, feature_names=X_COLUMNS)

    cv_history = xgb.cv(
        params={'eta' : 0.01, 'eval_metric' : 'rmse'},
        dtrain=cv_input,
        num_boost_round=1000,
        nfold=5,
        early_stopping_rounds=5,
    )

    return cv_history['test-rmse-mean'].min()

In [18]:
import operator
def get_prediction_score_given_predictors(given_list, data):
    """Score every one-column extension of `given_list`.

    For each column of FULL_COLS_LIST that is not already in `given_list`, the
    list `given_list + [column]` is scored with get_prediction_score on `data`.

    Returns a list of `(str(column_list), score)` pairs sorted by ascending
    score, so the first entry is the best (lowest-score) extension. The list is
    empty when no unused column remains.
    """
    unused = [name for name in FULL_COLS_LIST if name not in given_list]

    scores_by_key = {}
    for name in unused:
        trial = given_list + [name]
        # Keyed by the list's string form; callers parse it back into a list.
        scores_by_key[str(trial)] = get_prediction_score(trial, data)

    ranked = list(scores_by_key.items())
    ranked.sort(key=operator.itemgetter(1))
    return ranked

In [20]:
import ast
# Greedy forward selection: starting from no predictors, repeatedly add the one
# column from FULL_COLS_LIST whose addition gives the lowest cross-validated
# score, recording the best score reached at each subset size.
ts = time.time()
columns_list = []
MAX_NUM_PREDICTORS = 50
max_columns_list_to_score = dict()
while len(columns_list) < MAX_NUM_PREDICTORS:
    col_list_to_score = get_prediction_score_given_predictors(columns_list, train)
    if not col_list_to_score:
        # Every candidate column is already selected (FULL_COLS_LIST has fewer
        # than MAX_NUM_PREDICTORS columns); indexing [0] would raise IndexError.
        break
    # Results are sorted by ascending score, so the first entry is the best.
    best_key, best_score = col_list_to_score[0]
    # The key is the string form of the column list; parse it back into a list.
    columns_list = ast.literal_eval(best_key)
    max_columns_list_to_score[best_key] = best_score

# Elapsed wall-clock seconds for the whole search (displayed as the cell output).
time.time() - ts

['0ff32eb98']
['c5a231d81']
['91f701ba2']
['c47340d97']
['0572565c2']
['adb64ff71']
['f190486d6']
['5c6487af1']
['e176a204a']
['6619d81fc']
['70feb1494']
['23310aa6f']
['190db8488']
['1db387535']
['491b9ee45']
['66ace2992']
['9fd594eec']
['fc99f9426']
['58e2e02e6']
['703885424']
['eeb9cd3aa']
['1931ccfdd']
['324921c7b']
['1702b5bf0']
['fb0f5dbfe']
['f74e8f13d']
['20aa07010']
['26fc93eb7']
['58232a6fb']
['15ace8c9f']
['fb49e4212']
['2ec5b290f']
['62e59a501']
['963a49cdc']
['58e056e12']
['241f0f867']
['6eef030c1']
['b43a7cfd5']
['d6bb78916']
['024c577b9']
['11e12dbe8']
['166008929']
['861076e21']
['f02ecb19c']
['f97d9431e']
['9de83dc23']
['ca2b906e8']
['935ca66a9']
['a09a238d0']
['77deffdf0']
['bb0ce54e9']
['c8d582dd2']
['c0d2348b7']
['8781e4b91']
['bd6da0cca']
['cbb673163']
['68a945b18']
['c10f31664']
['ad009c8b9']
['62fb56487']
['ea772e115']
['bc70cbc26']
['939f628a7']
['4bcf15776']
['1fe5d56b9']
['7e814a30d']
['5d3b81ef8']
['aca228668']
['ef30f6be5']
['070f95c99']
['4da206d28']
['b7c9

631.5326697826385

In [21]:
col_list_to_score

[("['f190486d6', 'eeb9cd3aa']", 1.5907924),
 ("['f190486d6', 'c5a231d81']", 1.5921315999999999),
 ("['f190486d6', '58e2e02e6']", 1.5931056000000001),
 ("['f190486d6', 'c47340d97']", 1.5931978),
 ("['f190486d6', '91f701ba2']", 1.5935762),
 ("['f190486d6', '1db387535']", 1.5941120000000002),
 ("['f190486d6', 'e176a204a']", 1.595348),
 ("['f190486d6', '1931ccfdd']", 1.5977874),
 ("['f190486d6', '15ace8c9f']", 1.5980607999999998),
 ("['f190486d6', '2ec5b290f']", 1.598067),
 ("['f190486d6', '66ace2992']", 1.5988718),
 ("['f190486d6', 'adb64ff71']", 1.5988748),
 ("['f190486d6', '024c577b9']", 1.5993545999999998),
 ("['f190486d6', '0ff32eb98']", 1.5997986000000002),
 ("['f190486d6', '1702b5bf0']", 1.6001374),
 ("['f190486d6', '703885424']", 1.6003938000000002),
 ("['f190486d6', '491b9ee45']", 1.6010132000000001),
 ("['f190486d6', '58e056e12']", 1.6019039999999998),
 ("['f190486d6', '6eef030c1']", 1.6025362),
 ("['f190486d6', '0572565c2']", 1.6030445999999998),
 ("['f190486d6', '62e59a501']", 

In [22]:
max_columns_list_to_score

{"['f190486d6']": 1.6449045999999998, "['f190486d6', 'eeb9cd3aa']": 1.5907924}