In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import Imputer 

In [4]:
X_train = np.loadtxt("X_train.csv")
y_train = np.loadtxt("y_train.csv")
X_test = np.loadtxt("X_test.csv")

In [5]:
def cross_val_pimped(clf, X_train, y_train, n):
    r = np.zeros(n)
    for i in range(n):
        idx_train_1 = (np.where(y_train==1))[0]
        idx_train_0 = (np.where(y_train==0))[0]
        
        idx_test_1 = np.random.choice(idx_train_1, size=278, replace=False)
        idx_test_2 = np.random.choice(idx_train_0, size=1112, replace=False)
        
        idx_train_train_1 = np.setdiff1d(idx_train_1,idx_test_1)
        idx_train_train_2 = np.random.choice(np.setdiff1d(idx_train_0,idx_test_2), size=27994, replace=False)
        
        idx_test = np.r_[idx_test_1, idx_test_2]
        idx_train = np.r_[idx_train_train_1, idx_train_train_2]
        
        X_train_train = X_train[idx_train]
        X_train_test = X_train[idx_test]
        y_train_train = y_train[idx_train]
        y_train_test = y_train[idx_test]
        
        clf.fit(X_train_train, y_train_train)
        y_pred = clf.predict(X_train_test)
        r[i] = (accuracy_score(y_pred, y_train_test))
    print(r)
    return r

In [6]:
imputer = Imputer(strategy = 'median')
X_train_i = imputer.fit_transform(X_train)

In [7]:
feature_1 = X_train_i[:,21]*X_train_i[:,24] + X_train_i[:,26]
feature_2 = X_train_i[:,46]/(X_train_i[:,26]+0.0001)
feature_3 = X_train_i[:,17]-X_train_i[:,33]
feature_4 = X_train_i[:,10]*X_train_i[:,43]
feature_5 = X_train_i[:,12]-X_train_i[:,45]
feature_6 = X_train_i[:,20]+X_train_i[:,61]
feature_7 = X_train_i[:,1]-X_train_i[:,44]
feature_8 = X_train_i[:,26]+X_train_i[:,31]/(X_train_i[:,14]+0.0001)
feature_9 = X_train_i[:,23]/(X_train_i[:,26]+0.0001)

In [8]:
features = np.c_[feature_1,
                 feature_2,
                 feature_3,
                 feature_4,
                 feature_5,
                 feature_6,
                 feature_7,
                 feature_8,
                 feature_9]
feature_nan = np.sum(np.isnan(X_train),axis=1)
X_train_plus = np.c_[X_train, np.isnan(X_train), features, feature_nan]

In [9]:
imputer = Imputer(strategy = 'median')
X_train_plus_i = imputer.fit_transform(X_train_plus)

In [10]:
params1 = {'max_depth': 5, 
           'gamma': 0.5819,
           'colsample_bytree': 0.8125,
           'scale_pos_weight': 86, 
           'n_estimators': 240,
           'learning_rate': 0.1027}

model = XGBClassifier(nthread=4, **params1)
model.fit(X_train_plus_i, y_train)

thresholds = np.sort(model.feature_importances_)
results = np.zeros((len(thresholds),3))

In [46]:
i=0
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train_plus_i)
    from sklearn.feature_selection import SelectFromModel
    # train model
    selection_model = XGBClassifier(nthread=4, **params1)
    scores = cross_val_pimped(selection_model, select_X_train, y_train, 8)

    results[i,0] = np.mean(scores)
    results[i,1] = np.std(scores)
    results[i,2] = thresh
    i=i+1
    if i % 10 == 0:
        print('Iteration',i,'on', len(thresholds))

[ 0.9294964   0.92086331  0.9294964   0.91870504  0.92086331  0.92302158
  0.92014388  0.91510791]
[ 0.91798561  0.90647482  0.91366906  0.92230216  0.91582734  0.92589928
  0.93309353  0.93597122]
[ 0.91510791  0.91366906  0.92374101  0.92517986  0.92086331  0.92302158
  0.92589928  0.91151079]
[ 0.91582734  0.9057554   0.91942446  0.93884892  0.92302158  0.92086331
  0.92230216  0.92230216]
[ 0.92517986  0.92086331  0.92877698  0.93309353  0.92589928  0.91798561
  0.92230216  0.92086331]
[ 0.91438849  0.92374101  0.93093525  0.92733813  0.91942446  0.93021583
  0.92086331  0.93093525]
[ 0.91798561  0.92014388  0.93165468  0.92374101  0.92374101  0.91798561
  0.93021583  0.92446043]
[ 0.91870504  0.92158273  0.92517986  0.93093525  0.91223022  0.92014388
  0.91151079  0.92805755]
[ 0.92661871  0.91294964  0.91654676  0.91223022  0.92446043  0.92517986
  0.92374101  0.92661871]
[ 0.93021583  0.90863309  0.92158273  0.92374101  0.91366906  0.92086331
  0.9323741   0.92302158]
Iteration 

[ 0.93021583  0.91223022  0.91366906  0.92302158  0.92014388  0.92733813
  0.91654676  0.91942446]
[ 0.92230216  0.92014388  0.92230216  0.9294964   0.93093525  0.92374101
  0.91942446  0.92158273]
[ 0.92302158  0.92446043  0.91294964  0.9294964   0.92661871  0.92374101
  0.92733813  0.91366906]
[ 0.92517986  0.92302158  0.92014388  0.92230216  0.92661871  0.91223022
  0.91942446  0.92661871]
[ 0.93093525  0.92302158  0.91438849  0.92374101  0.91654676  0.93093525
  0.91870504  0.92733813]
[ 0.91726619  0.91223022  0.92877698  0.92589928  0.91654676  0.91079137
  0.92014388  0.92014388]
[ 0.91366906  0.92733813  0.92805755  0.92517986  0.93453237  0.93165468
  0.93093525  0.91007194]
[ 0.92446043  0.91007194  0.92877698  0.9294964   0.91582734  0.91942446
  0.91510791  0.92661871]
Iteration 90 on 138
[ 0.91294964  0.92014388  0.91582734  0.92302158  0.90143885  0.91438849
  0.92733813  0.92302158]
[ 0.92517986  0.92014388  0.91942446  0.92446043  0.91510791  0.92733813
  0.92877698  0.

       [  9.26169065e-01,   5.19952177e-03,   0.00000000e+00],
       
       [  9.25449640e-01,   5.74343969e-03,   0.00000000e+00],
       
       [  9.25089928e-01,   2.37076013e-03,   0.00000000e+00],
       
       [  9.26888489e-01,   6.80548066e-03,   2.26323120e-03],
       
       [  9.25359712e-01,   7.57747282e-03,   4.70055733e-03],
       
       [  9.25179856e-01,   8.18691128e-03,   7.13788299e-03],
       
       [  9.25359712e-01,   4.99404438e-03,   8.70473497e-03],
       
       [  9.25000000e-01,   7.82115644e-03,   1.27089135e-02],
       
       [  9.25539568e-01,   7.86446443e-03,   1.65389981e-02],

In [11]:
best_tresh = 2.26323120e-03
print('Best threshold :' ,best_tresh)
selection = SelectFromModel(model, threshold=best_tresh, prefit=False)
selection.fit(X_train_plus_i, y_train)
select_X_train_plus = selection.transform(X_train_plus_i)

Best threshold : 0.0022632312


In [12]:
imputer = Imputer(strategy = 'median')
X_test_i = imputer.fit_transform(X_test)
feature_1 = X_test_i[:,21]*X_test_i[:,24] + X_test_i[:,26]
feature_2 = X_test_i[:,46]/(X_test_i[:,26]+0.0001)
feature_3 = X_test_i[:,17]-X_test_i[:,33]
feature_4 = X_test_i[:,10]*X_test_i[:,43]
feature_5 = X_test_i[:,12]-X_test_i[:,45]
feature_6 = X_test_i[:,20]+X_test_i[:,61]
feature_7 = X_test_i[:,1]-X_test_i[:,44]
feature_8 = X_test_i[:,26]+X_test_i[:,31]/(X_test_i[:,14]+0.0001)
feature_9 = X_test_i[:,23]/(X_test_i[:,26]+0.0001)
features = np.c_[feature_1,
                 feature_2,
                 feature_3,
                 feature_4,
                 feature_5,
                 feature_6,
                 feature_7,
                 feature_8,
                 feature_9]
feature_nan = np.sum(np.isnan(X_test),axis=1)
X_test_plus = np.c_[X_test, np.isnan(X_test), features, feature_nan]
imputer = Imputer(strategy = 'median')
X_test_plus_i = imputer.fit_transform(X_test_plus)

In [13]:
select_X_test_plus = selection.transform(X_test_plus_i)

In [14]:
select_X_test_plus.shape

(4987, 74)

In [15]:
np.savetxt("select_X_test_plus", select_X_test_plus, delimiter=";")
np.savetxt("select_X_train_plus", select_X_train_plus, delimiter=";")