In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
import operator
from scipy.sparse import csr_matrix

np.random.seed(105)

In [2]:
def create_feature_map(features, fmap_path='xgb1.fmap'):
    """Write a feature-map file with one line per feature.

    Every line is tab-separated: the feature's position in ``features``,
    the feature name, and the literal type flag ``q``.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered (from 0).
    fmap_path : str, optional
        File to (over)write. Defaults to ``'xgb1.fmap'``.
    """
    # The context manager closes the handle even if a write raises.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def get_importance(gbm, features):
    """Return the booster's per-feature scores, highest first.

    A feature map for ``features`` is written with ``create_feature_map``
    and the same file is then handed to ``gbm.get_fscore``.

    Parameters
    ----------
    gbm : object
        Anything exposing ``get_fscore(fmap=...)`` that returns a mapping of
        feature name to score (e.g. an ``xgb.Booster``).
    features : iterable
        Feature names, in column order.

    Returns
    -------
    list of (feature, score) tuples sorted by score, descending.
    """
    create_feature_map(features)
    # Must be the file create_feature_map writes by default ('xgb1.fmap');
    # reading any other name would pick up a map this call did not produce.
    importance = gbm.get_fscore(fmap='xgb1.fmap')
    # Only the ``operator`` module is imported in this notebook, so the
    # getter has to be referenced through it.
    return sorted(importance.items(), key=operator.itemgetter(1), reverse=True)

def save_sparse_csr(filename, array):
    """Store the four components of a CSR matrix in one ``np.savez`` archive.

    The archive holds ``data``, ``indices``, ``indptr`` and ``shape`` taken
    straight from ``array``.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an archive holding its four components.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape`` (the layout ``save_sparse_csr`` writes).

    Parameters
    ----------
    filename : str or file-like
        Archive to read. A string path without the ``.npz`` suffix is
        accepted when only the suffixed file exists on disk.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    import os

    # np.savez appends '.npz' to names that lack it, so the name given when
    # saving may not exist verbatim; fall back to the suffixed path.
    if (isinstance(filename, str) and not filename.endswith('.npz')
            and not os.path.exists(filename)):
        filename = filename + '.npz'

    # np.load returns an open NpzFile; read everything inside the context
    # manager so the underlying file is closed afterwards.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=tuple(loader['shape']))

In [3]:
# Read the train and test frames from the 'table' key of their HDF5 files.
df_tr, df_te = (
    pd.read_hdf(hdf_path, 'table')
    for hdf_path in ('../input_hdf/df_tr_with_faron.hdf5',
                     '../input_hdf/df_te_with_faron.hdf5')
)

In [10]:
# Load the saved booster and tabulate its per-feature F-scores.
bst = xgb.Booster()
bst.load_model('435461.model')

# get_fscore() reports how often each feature is used in a split (the
# 'weight' importance), not gain, so the column is named accordingly.
# For gain instead: bst.get_score(fmap='xgb.fmap', importance_type='gain')
importance = bst.get_fscore(fmap='xgb.fmap')
# Ascending by score: least-used features first.
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])

In [12]:
# (rows, columns) of the importance table: one row per feature the booster reported.
df.shape

(761, 2)

In [13]:
# Keep only the columns listed in the importance table, in that order, and
# store them as float32.
df_tr_sel = df_tr.loc[:, df['feature'].values].astype(np.float32)
df_te_sel = df_te.loc[:, df['feature'].values].astype(np.float32)

In [14]:
import gc

# The full frames are no longer referenced below; drop them and run a
# collection pass (the cell displays gc.collect()'s return value).
del df_tr, df_te
gc.collect()

48

In [20]:
# Print index range, column count, dtypes and memory usage of the selected test frame.
df_te_sel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1183748 entries, 0 to 1183747
Columns: 761 entries, L1_S25_F2732 to magic3
dtypes: float32(761)
memory usage: 3.4 GB


In [21]:
# Write the selected-feature frames to HDF5 under the 'table' key.
df_tr_sel.to_hdf(
    '../modelling_new/xgboost/feats_selected/train_init_761.hdf5',
    key='table',
)
df_te_sel.to_hdf(
    '../modelling_new/xgboost/feats_selected/test_init_761.hdf5',
    key='table',
)

In [6]:
# Write a feature map for the selected columns (create_feature_map's output file is 'xgb1.fmap').
create_feature_map(df_tr_sel.columns)

In [7]:
# Response column of the original training CSV as a 2-D (n_rows, 1) array.
# NOTE(review): assumes the CSV rows are in the same order as df_tr_sel — TODO confirm.
label = pd.read_csv('../input_orig/train_numeric.csv', usecols=['Response']).values

In [8]:
# Training DMatrix: the selected features with the Response labels attached.
dtrain = xgb.DMatrix(df_tr_sel, label=label)

In [9]:
# Test DMatrix: the selected features only, no labels.
dtest = xgb.DMatrix(df_te_sel)

In [15]:
# Write the full (unsplit) train and test DMatrix objects to disk via save_binary.
dtrain.save_binary("train_with_faron_sel.buffer")
dtest.save_binary("test_with_faron_sel.buffer")

In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20; import from its
# replacement and fall back only on installs that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Stratified 80/20 split of the (Id, Response) rows; the resulting frames'
# indices are used by later cells to slice the DMatrix.
df1 = pd.read_csv('../input_orig/train_numeric.csv', usecols=['Id', 'Response'])
X_train, X_test, y_train, y_test = train_test_split(
    df1, df1.Response.values,
    test_size=0.2, random_state=42, stratify=df1.Response.values)

In [11]:
# Split the hold-out set in half, stratified on its labels, into two validation folds.
X_val1, X_val2, y_val1, y_val2 = train_test_split(X_test, y_test, test_size=0.5, random_state=224, stratify=y_test)

In [12]:
# Whole hold-out set as its own DMatrix; the last line displays its row count.
# X_test.index carries df1's row labels and is used here as row positions —
# assumes df1's default integer index matches dtrain's row order — TODO confirm.
dvalid = dtrain.slice(X_test.index)
dvalid.num_row()

236750

In [13]:
# First validation fold, sliced out of dtrain by X_val1's index; displays its row count.
dval1 = dtrain.slice(X_val1.index)
dval1.num_row()

118375

In [14]:
# Second validation fold, sliced out of dtrain by X_val2's index; displays its row count.
dval2 = dtrain.slice(X_val2.index)
dval2.num_row()

118375

In [15]:
# NOTE: this rebinds `dtrain` to the training-split slice of itself, so the
# cell is not idempotent. Neither this cell nor the cells that slice `dtrain`
# by X_test / X_val1 / X_val2 indices may be re-run afterwards without first
# rebuilding the full DMatrix.
dtrain = dtrain.slice(X_train.index)
dtrain.num_row()

946997

In [16]:
# Write every split to its own buffer file via save_binary, in a fixed order.
for _dmat, _path in ((dtrain, 'dtrain.buffer'),
                     (dval1, 'dval1.buffer'),
                     (dval2, 'dval2.buffer'),
                     (dvalid, 'dvalid.buffer')):
    _dmat.save_binary(_path)