In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
import operator
from scipy.sparse import csr_matrix

np.random.seed(105)

In [2]:
def create_feature_map(features):
    outfile = open('xgb_542_feats.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb_542_feats.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance

def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [3]:
df_train = pd.read_hdf('train-rest-542-unq.hdf5','table')
#df_test = pd.read_hdf('../../search/feats/te_time_diff_adjacent12.hdf5','table')

In [4]:
bst = xgb.Booster()
bst.load_model('xgb_542_0217mcc_0707auc.model')

In [7]:
#(bst.get_score(fmap='xgb.fmap', importance_type='gain'))
importance = bst.get_fscore(fmap='xgb_542_feats.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])

In [10]:
df_te1 = pd.read_hdf('../../search/feats/te_station_pass1.hdf5','table')
df_te2 = pd.read_hdf('../../search/feats/te_line_pass2.hdf5','table')
df_te3 = pd.read_hdf('../../search/feats/te_number_stations_passed3.hdf5','table')
df_te4 = pd.read_hdf('../../search/feats/te_nonNaN_full4.hdf5','table')
df_te5 = pd.read_hdf('../../search/feats/te_sum_full5.hdf5','table')
df_te6 = pd.read_hdf('../../search/feats/te_line_station_nonNaN6.hdf5','table')
df_te7 = pd.read_hdf('../../search/feats/te_line_station_sum7.hdf5','table')
df_te8 = pd.read_hdf('../../search/feats/te_line_station_minmax8.hdf5','table')
df_te9 = pd.read_hdf('../../search/feats/te_minmaxdiff_full_date9.hdf5','table')
df_te10 = pd.read_hdf('../../search/feats/te_line_station_diff_date10.hdf5','table')
df_te13 = pd.read_hdf('../../search/feats/te_hash_full_date_num_cat13.hdf5','table')

df_test = pd.concat([df_te1,df_te2,df_te3,df_te4,df_te5,df_te6,df_te7,df_te8,df_te9,df_te10,df_te13],axis=1)

In [11]:
df_train_sel = df_train[df.feature.values]
df_test_sel = df_test[df.feature.values]

In [15]:
df_train_sel.to_hdf('feats_selected/train_rest_285_sel.hdf5', 'table')
df_test_sel.to_hdf('feats_selected/test_rest_285_sel.hdf5', 'table')

In [None]:
for col in df_train_sel.columns:
    if str(df_train_sel[col].dtype) == 'float64':
        df_train_sel[col] = df_train_sel[col].astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [None]:
df_train_sel.info()