In [28]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import sklearn.manifold as sm
import scipy.sparse as sps
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import time
from sklearn.metrics import normalized_mutual_info_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB

plt.style.use('ggplot')
%matplotlib inline

In [3]:
import feature_preprocessing as fproc
import feature_extraction as fext

In [4]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

Y_train = df_train['target'].values
id_test = df_test['ID'].values
id_train = df_train['ID'].values

df_train = df_train.drop(['ID','target'],axis=1)
df_test = df_test.drop(['ID'],axis=1)

In [3]:
drop_features = set(['v8','v23','v25','v31','v36','v37','v46',
                'v51','v53','v54','v63','v73','v75','v79','v81','v82',
                'v89','v92','v95','v105','v107','v108','v109','v110',
                'v116','v117','v118','v119','v123','v124','v128'])

In [4]:
cat_features = ['v3', 'v22', 'v24', 'v30', 'v31', 'v47', 'v52', 'v56', 'v66', 'v71', 
                'v74', 'v75', 'v79', 'v91', 'v107', 'v110', 'v112', 'v113', 'v125']
num_features = ['v18', 'v19', 'v12', 'v13', 'v10', 'v11','v16', 'v17', 
                'v14', 'v15', 'v118', 'v119', 'v114', 'v115', 'v116', 
                'v117', 'v111','v89', 'v88', 'v85', 'v84', 'v87', 'v86', 
                'v81', 'v80', 'v83', 'v82', 'v69', 'v68', 'v67', 'v65', 
                'v64', 'v63', 'v62', 'v61', 'v60', 'v92', 'v93', 'v90', 
                'v96', 'v97', 'v94', 'v95', 'v106', 'v98', 'v104', 'v103', 
                'v102', 'v101', 'v100', 'v105', 'v99', 'v78', 'v76', 'v77', 
                'v70', 'v72', 'v73', 'v130', 'v131', 'v124', 'v127', 'v126', 
                'v121', 'v120', 'v123', 'v122', 'v129', 'v128', 'v41', 'v40', 
                'v43', 'v42', 'v45', 'v44', 'v46', 'v49', 'v48', 'v23', 'v21',
                'v20', 'v27', 'v26', 'v25', 'v29', 'v28', 'v57', 'v54', 'v55', 
                'v53', 'v50', 'v51', 'v109', 'v58', 'v59', 'v32', 'v33', 'v34', 
                'v35', 'v36', 'v37', 'v38', 'v39', 'v108', 'v1', 'v2', 
                'v4', 'v5', 'v6', 'v7', 'v8', 'v9']

In [5]:
dcat_features = list(set(cat_features) - drop_features)
dnum_features = list(set(num_features) - drop_features)

In [7]:
df_train = df_train.drop(list(drop_features),axis=1)
df_test = df_test.drop(list(drop_features),axis=1)

In [8]:
df_train[cat_features].describe()

Unnamed: 0,v30,v22,v91,v24,v74,v66,v71,v112,v113,v125,v56,v3,v52,v47
count,54211,113821,114318,114321,114321,114321,114321,113939,59017,114244,107439,110864,114318,114321
unique,7,18210,7,5,3,3,9,22,36,90,122,3,12,10
top,C,AGDF,A,E,B,A,F,F,G,BM,BW,C,J,C
freq,32178,2386,27079,55177,113560,70353,75094,21671,16252,5759,11351,110584,11103,55425


In [28]:
df_train['v47'].value_counts()

C    55425
I    39071
E     5301
F     4322
G     3946
D     3157
J     3010
B       50
A       38
H        1
Name: v47, dtype: int64

count
v3, v74
v91, v30

v71: L + D + K

In [33]:
def find_denominator(df, col):
    """
    Function that trying to find an approximate denominator used for scaling.
    So we can undo the feature scaling.
    """
    vals = df[col].dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 1e-5]
    
    return vals.value_counts().idxmax()

In [34]:
def df_scale(df_train, df_test, num_features):
    df = pd.concat([df_train, df_test], axis=0)
    for f in tqdm(num_features):
        if f not in df_train.columns:
            continue

        df_train.loc[df_train[f].round(5) == 0, f] = 0
        df_test.loc[df_test[f].round(5) == 0, f] = 0

        denominator = find_denominator(df, f)
        df_train[f] *= 1/denominator
        df_test[f] *= 1/denominator
    return df_train, df_test

In [36]:
df_train, df_test = df_scale(df_train, df_test, dnum_features)

100%|██████████| 86/86 [00:25<00:00,  3.71it/s]


### Количество пропущенных значений:

In [40]:
is_missing = df_train.isnull().values.astype(int)
print '1. Количество пропущенных элементов в таблице с обучающей выборкой: ', float(np.sum(is_missing))
print '2. Количество объектов имеющие хотя бы один пропуск: ', np.sum(np.sum(is_missing, axis=1) > 0)
print '3. Количество признаков имеющие хотя бы один пропуск: ', np.sum(np.sum(is_missing, axis=0) > 0)

 1. Количество пропущенных элементов в таблице с обучающей выборкой:  3813061.0
2. Количество объектов имеющие хотя бы один пропуск:  96565
3. Количество признаков имеющие хотя бы один пропуск:  91


### Факторизация категориальных признаков

In [39]:
factor = fproc.Factorize(nan_strategy=np.nan)

fcat_features = list(set(cat_features).intersection(drop_features))
fac_train = factor.fit_transform(df_train[fcat_features])
fac_test = factor.transform(df_test[fcat_features])

100%|██████████| 5/5 [00:00<00:00, 65.20it/s]


### Sort

In [40]:
df_cat = pd.concat((df_train[dcat_features], df_test[dcat_features]), axis=0)

In [41]:
ranker = fext.RankCount()
ranker.fit(df_cat)

100%|██████████| 14/14 [00:00<00:00, 24.11it/s]


RankCount()

In [42]:
rank_train = ranker.transform(df_train[dcat_features])
rank_test = ranker.transform(df_test[dcat_features])

14it [00:00, 19.66it/s]
14it [00:00, 19.91it/s]


### NaN-features

In [44]:
nan_encoder = fext.NanEncoding()
nan_train = nan_encoder.fit_transform(df_train)
nan_test = nan_encoder.transform(df_test)

### Избавляемся от NaN'ов

In [45]:
imp = fproc.SuperImputer(-999.0, 0, num_features, dcat_features)
nonan_train = imp.fit_transform(pd.concat((df_train[num_features], rank_train + 1), axis=1))
nonan_test = imp.transform(pd.concat((df_test[num_features], rank_test + 1), axis=1))

In [31]:
imp = fproc.SuperImputer(-999.0, 999.0, num_features, cat_features)
rnonan_train = imp.fit_transform(pd.concat((df_train[num_features], rank_train), axis=1))
rnonan_test = imp.transform(pd.concat((df_test[num_features], rank_test), axis=1))

NameError: name 'rank_train' is not defined

### OneHotEncoding

In [46]:
encoder = OneHotEncoder()
norm_features = list(set(dcat_features) - set(['v22']))
df_enc = encoder.fit_transform(pd.concat([nonan_train[norm_features], nonan_test[norm_features]], axis=0))

In [47]:
enc_train = df_enc.toarray()[:len(df_train)]
enc_test = df_enc.toarray()[len(df_train):]

In [48]:
imp = fproc.SuperImputer(-999.0, -999.0, num_features, dcat_features)
nonan_train = imp.fit_transform(pd.concat((df_train[num_features], rank_train), axis=1))
nonan_test = imp.transform(pd.concat((df_test[num_features], rank_test), axis=1))

### Счетчики

In [50]:
counter = fext.CountEncoding(n_folds=7, verbose=True)
count_train = counter.fit_transform(nonan_train[dcat_features], Y_train)
count_test = counter.transform(nonan_test[dcat_features])

14it [00:09,  1.47it/s]
14it [00:00, 45.21it/s]


### СуперФичи

In [17]:
imp_features = ['v50', 'v10', 'v12', 'v14', 'v40', 'v114']
norm_features = list(set(cat_features) - set(['v22']))

In [27]:
imp = fproc.SuperImputer('mean', -999, imp_features, norm_features)
no_train = imp.fit_transform(pd.concat((df_train[imp_features], fac_train[norm_features]), axis=1))
no_test = imp.transform(pd.concat((df_test[imp_features], fac_test[norm_features]), axis=1))

In [36]:
reload(fext)

<module 'feature_extraction' from 'feature_extraction.py'>

In [37]:
super_encoder = fext.SuperCatEncoder(cat_features=norm_features, num_features=imp_features)
df_super = super_encoder.get(pd.concat([no_train, no_test]))

100%|██████████| 13/13 [10:59<00:00, 48.99s/it]


In [39]:
super_train = df_super[:len(df_train)]
super_test = df_super[len(df_train):]

### Парные признаки

In [51]:
pair_train, pair_test = fext.make_pair_features(df_train[cat_features], df_test[cat_features])

19it [00:24,  1.29s/it]


In [52]:
factor = fproc.Factorize(nan_strategy=-999.0)
fac_pair_train = factor.fit_transform(pair_train)
fac_pair_test = factor.transform(pair_test)

100%|██████████| 171/171 [00:04<00:00, 45.28it/s]


## Save data

In [42]:
super_train.to_csv('super_train.csv', index=False)
super_test.to_csv('super_test.csv', index=False)

In [62]:
nonan_train.to_csv('nonan_train.csv', index=False)
nonan_test.to_csv('nonan_test.csv', index=False)

fac_train.to_csv('fac_train.csv', index=False)
fac_test.to_csv('fac_test.csv', index=False)

fac_pair_train.to_csv('fac_pair_train.csv', index=False)
fac_pair_test.to_csv('fac_pair_test.csv', index=False)

nan_train.to_csv('nan_train.csv', index=False)
nan_test.to_csv('nan_test.csv', index=False)

count_train.to_csv('count_train.csv', index=False)
count_test.to_csv('count_test.csv', index=False)

pd.DataFrame(enc_train).to_csv('enc_train.csv', index=False)
pd.DataFrame(enc_test).to_csv('enc_test.csv', index=False)

## Downland data

In [19]:
super_train = pd.read_csv('super_train.csv')
super_test = pd.read_csv('super_test.csv')

In [6]:
nonan_train = pd.read_csv('nonan_train.csv')
nonan_test = pd.read_csv('nonan_test.csv')

fac_train = pd.read_csv('fac_train.csv')
fac_test = pd.read_csv('fac_test.csv')

fac_pair_train = pd.read_csv('fac_pair_train.csv')
fac_pair_test = pd.read_csv('fac_pair_test.csv')

nan_train = pd.read_csv('nan_train.csv')
nan_test = pd.read_csv('nan_test.csv')

count_train = pd.read_csv('count_train.csv')
count_test = pd.read_csv('count_test.csv')

enc_train = pd.read_csv('enc_train.csv')
enc_test = pd.read_csv('enc_test.csv')

# Обучение

In [7]:
def nocv_loss(X_train, Y_train, clf, n_run=5, reg=False):
    x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=44)
    
    clf.fit(x_train, y_train)
    if reg:
        pred = clf.predict(x_test)
    else:
        pred = clf.predict_proba(x_test)[:, 1]
    print log_loss(y_test, pred)
    return pred, y_test

### ExtraTrees

In [28]:
X_train = sps.hstack((nonan_train, count_train, fac_pair_train, nan_train), format='csr')

In [29]:
start = time.time()
extc = ExtraTreesClassifier(n_estimators=1200,
                            max_features=40,
                            criterion='entropy',
                            min_samples_split=2,
                            max_depth=30, 
                            min_samples_leaf=2, 
                            n_jobs=-1)    

clf_ext_preds, y_test = nocv_loss(nonan_train, Y_train, extc, 1)
print 'Time:', (time.time() - start) / 60

0.458534101931
Time: 19.3043187499


In [193]:
nonan_train.columns[70]

'v50'

In [202]:
imp_features[:10]

['v50', 'v66', 'v113', 'v52', 'v10', 'v12', 'v91', 'v14', 'v40', 'v114']

In [197]:
importances = extc.feature_importances_
imp_features = []
indices = np.argsort(importances)[::-1]
for i in range(nonan_train.shape[1]):
    imp_features.append(nonan_train.columns[indices[i]])
    print("feature %s (%f)" % (nonan_train.columns[indices[i]], importances[indices[i]]))

feature v50 (0.098883)
feature v66 (0.029814)
feature v113 (0.029281)
feature v52 (0.026626)
feature v10 (0.026609)
feature v12 (0.025828)
feature v91 (0.025454)
feature v14 (0.024851)
feature v40 (0.024734)
feature v114 (0.024679)
feature v125 (0.023512)
feature v22 (0.023218)
feature v34 (0.022912)
feature v21 (0.021780)
feature v24 (0.021645)
feature v56 (0.021406)
feature v112 (0.021309)
feature v47 (0.018832)
feature v71 (0.018186)
feature v30 (0.016790)
feature v62 (0.015922)
feature v72 (0.012248)
feature v129 (0.011290)
feature v87 (0.007303)
feature v98 (0.007297)
feature v5 (0.006973)
feature v70 (0.006927)
feature v120 (0.006916)
feature v58 (0.006846)
feature v1 (0.006808)
feature v131 (0.006657)
feature v28 (0.006591)
feature v100 (0.006564)
feature v88 (0.006525)
feature v99 (0.006333)
feature v16 (0.006275)
feature v85 (0.006239)
feature v6 (0.006219)
feature v127 (0.006117)
feature v2 (0.006078)
feature v102 (0.006056)
feature v80 (0.006052)
feature v69 (0.005969)
featu

In [56]:
pd.DataFrame(clf_ext_preds).to_csv('CV_EXTREES.csv', index=False)
pd.DataFrame(clf_xgb_preds).to_csv('CV_XGB_CLF.csv', index=False)
pd.DataFrame(areg_xgb_preds).to_csv('CV_XGB_REG.csv', index=False)

In [36]:
clf_ext_preds = pd.read_csv('CV_EXTREES.csv')['0'].values

In [51]:
log_loss(y_test, (clf_ext_preds - clf_ext_preds.min()) / (clf_ext_preds.max() - clf_ext_preds.min()) )

0.45618779389320396

In [175]:
start = time.time()
extc = RandomForestClassifier(n_estimators=1200,
                            max_features=30,
                            criterion='entropy',
                            min_samples_split=2,
                            max_depth=30, 
                            min_samples_leaf=2, 
                            n_jobs=-1)    
clf_ext_preds, y_test = nocv_loss(train, Y_train, extc, 1)
print (time.time() - start) / 60

0.46334877936
28.1292908669


In [2]:
def get_metafeatures(X_train, X_test, targets, clfs):
    n_folds = 5
    verbose = True
    shuffle = False

    X, y, X_submission = X_train, targets, X_test

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))
    
    scaler = StandardScaler()

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print i
            Xtrain = X[train]
            y_train = y[train]
            Xtest = X[test]
            y_test = y[test]
        
            clf.fit(Xtrain, y_train)
            y_submission = clf.predict_proba(Xtest)[:,1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
    return dataset_blend_train, dataset_blend_test

## BEST MODEL

In [65]:
X_train = sps.hstack((nonan_train, count_train, enc_train, fac_pair_train, nan_train), format='csr')
X_test = sps.hstack((nonan_test, count_test, enc_test, fac_pair_test, nan_test), format='csr')

In [67]:
X_train = sps.hstack((nonan_train[cat_features], count_train, super_train, fac_pair_train), format='csr')
X_test = sps.hstack((nonan_test, count_test, enc_test, fac_pair_test, nan_test), format='csr')

KeyboardInterrupt: 

In [66]:
xgb_reg = xgb.XGBRegressor(colsample_bytree=0.4,
                                colsample_bylevel=0.7,
                                learning_rate=0.0095,
                                max_depth=15,
                                n_estimators=750,
                                nthread=4,
                                objective='reg:linear',
                                silent=1,
                                subsample=0.8,
                                min_child_weight=6)

start = time.time()
areg_xgb_preds, y_test = nocv_loss(X_train, Y_train, xgb_reg, 1, reg=True)
areg_xgb_preds[areg_xgb_preds > 0.99] = 0.98
print log_loss(y_test, areg_xgb_preds)
print 'Time:', (time.time() - start) / 60

nan
0.453094681269
Time: 33.8757515987


In [27]:
areg_xgb_preds[areg_xgb_preds >= 0.97] = 0.98
print log_loss(y_test, areg_xgb_preds)

0.453137222615


In [27]:
xgb_clf = xgb.XGBClassifier(colsample_bytree=0.9,
                            colsample_bylevel=0.2,
                            learning_rate=0.0095,
                            max_depth=15,
                            n_estimators=1000,
                            nthread=4,
                            objective='binary:logistic',
                            silent=1,
                            subsample=0.85,
                            min_child_weight=3)

start = time.time()
clf_xgb_preds, y_test = nocv_loss(X_train, Y_train, xgb_clf, 1)
print (time.time() - start) / 60

0.45239517589
33.8972441514


In [8]:
from sklearn.tree import DecisionTreeRegressor

In [19]:
0.0095*750 / 100

0.07125

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Any results you write to the current directory are saved as output.

from sklearn import ensemble, metrics, linear_model
import random

#Some parameters to play with
rnd=12
random.seed(rnd)
n_ft=20 #Number of features to add
max_elts=3 #Maximum size of a group of linear features

class addNearestNeighbourLinearFeatures:
    
    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd=random_state
        self.n=n_neighbours
        self.max_elts=max_elts
        self.verbose=verbose
        self.neighbours=[]
        self.clfs=[]
        
    def fit(self,train,y):
        if self.rnd!=None:
            random.seed(rnd)
        if self.max_elts==None:
            self.max_elts=len(train.columns)
        list_vars=list(train.columns)
        random.shuffle(list_vars)
        
        lastscores=np.zeros(self.n)+1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars=list_vars[self.n:]
        
        for elt in list_vars:
            indice=0
            scores=[]
            for elt2 in self.neighbours:
                if len(elt2)<self.max_elts:
                    clf=xgb.XGBRegressor(
                                colsample_bylevel=0.7,
                                learning_rate=0.07125,
                                max_depth=15,
                                n_estimators=100,
                                nthread=4,
                                objective='reg:linear',
                                silent=1,
                                subsample=0.8,
                                min_child_weight=6)
                    
                    clf.fit(train[elt2+[elt]], y)
                    scores.append(metrics.log_loss(y,clf.predict(train[elt2 + [elt]])))
                    indice=indice+1
                else:
                    scores.append(lastscores[indice])
                    indice=indice+1
            gains=lastscores-scores
            if gains.max()>0:
                temp=gains.argmax()
                lastscores[temp]=scores[temp]
                self.neighbours[temp].append(elt)

        indice=0
        for elt in self.neighbours:
            clf=xgb.XGBRegressor(
                                colsample_bylevel=0.7,
                                learning_rate=0.07125,
                                max_depth=15,
                                n_estimators=100,
                                nthread=4,
                                objective='reg:linear',
                                silent=1,
                                subsample=0.8,
                                min_child_weight=6)
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            if self.verbose:
                print(indice, lastscores[indice], elt)
            indice=indice+1
                    
    def transform(self, train):
        indice=0
        for elt in self.neighbours:
            train['_'.join(pd.Series(elt).sort_values().values)]=self.clfs[indice].predict(train[elt])
            indice=indice+1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)
    
    
train = df_train
target = Y_train
test = df_test
id_test = id_test

train['v22-1']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
test['v22-1']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
train['v22-2']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
test['v22-2']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
train['v22-3']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
test['v22-3']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
train['v22-4']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))
test['v22-4']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))

drop_list=['v91','v1', 'v8', 'v10', 'v15', 'v17', 'v25', 'v29', 'v34', 'v41', 'v46', 'v54', 'v64', 'v67', 'v97', 'v105', 'v111', 'v122']
train = train.drop(drop_list,axis=1).fillna(-999)
test = test.drop(drop_list,axis=1).fillna(-999)

refcols=list(train.columns)

for elt in refcols:
    if train[elt].dtype=='O':
        train[elt], temp = pd.factorize(train[elt])
        test[elt]=temp.get_indexer(test[elt])
    else:
        train[elt]=train[elt].round(5)
        test[elt]=test[elt].round(5)
        
a=addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=True, random_state=rnd)
a.fit(train, target)

train = a.transform(train)
test = a.transform(test)

In [14]:
linear_new = list(set(train.columns) - set(df_train.columns))

In [16]:
linear_train = train[linear_new]
linear_test = test[linear_new]

In [33]:
clf = ensemble.ExtraTreesClassifier(n_estimators=1200,max_features=50,criterion= 'entropy',min_samples_split= 4,
                        max_depth= 35, min_samples_leaf= 2, n_jobs = -1, random_state=rnd)

nclf_ext_preds, y_test = nocv_loss(train, Y_train, clf, 1)

0.454062622178


In [62]:
xgb_clf = xgb.XGBClassifier(colsample_bytree=0.9,
                            colsample_bylevel=0.9,
                            learning_rate=0.0095,
                            max_depth=9,
                            n_estimators=1000,
                            nthread=4,
                            objective='binary:logistic',
                            silent=1,
                            subsample=0.9,
                            min_child_weight=1)

start = time.time()
sclf_xgb_preds, y_test = nocv_loss(X_train, Y_train, xgb_clf, 1)
print (time.time() - start) / 60

0.489410291235
25.2838318984


In [51]:
xgb_clf = xgb.XGBClassifier(colsample_bytree=0.2,
                            colsample_bylevel=0.2,
                            learning_rate=0.0095,
                            max_depth=15,
                            n_estimators=300,
                            nthread=4,
                            objective='binary:logistic',
                            silent=1,
                            subsample=0.85,
                            min_child_weight=3)

start = time.time()
sclf_xgb_preds, y_test = nocv_loss(X_train[:45000], Y_train[:45000], xgb_clf, 1)
print (time.time() - start) / 60

0.480531062899
1.46316971779


In [None]:
0.47384843154
3.98084379832

### Blending

In [11]:
def blending(targets, preds1, preds2, p):
    print 'p:', p, log_loss(targets, preds1*p + preds2*(1-p))
    return True

In [70]:
for p in np.arange(0, 1.05, 0.1):
    for q in np.arange(0, 1.05 - p, 0.1):
        print p, q, 1-p-q, log_loss(y_test, p*nclf_ext_preds + q*clf_xgb_preds + (1 - p - q)*areg_xgb_preds)

0.0 0.0 1.0 0.453094681393
0.0 0.1 0.9 0.451899259129
0.0 0.2 0.8 0.450908491289
0.0 0.3 0.7 0.450132092501
0.0 0.4 0.6 0.44958225219
0.0 0.5 0.5 0.449274766165
0.0 0.6 0.4 0.449229953446
0.0 0.7 0.3 0.449474455858
0.0 0.8 0.2 0.45004437333
0.0 0.9 0.1 0.450991122205
0.0 1.0 0.0 0.452395176051
0.1 0.0 0.9 0.45204919472
0.1 0.1 0.8 0.450899520043
0.1 0.2 0.7 0.4499533474
0.1 0.3 0.6 0.449221216244
0.1 0.4 0.5 0.448716308531
0.1 0.5 0.4 0.448455573737
0.1 0.6 0.3 0.448460918479
0.1 0.7 0.2 0.448761480206
0.1 0.8 0.1 0.449397207994
0.1 0.9 0.0 0.450427594013
0.2 0.0 0.8 0.4513005497
0.2 0.1 0.7 0.450197789733
0.2 0.2 0.6 0.449299395046
0.2 0.3 0.5 0.448616740214
0.2 0.4 0.4 0.448164285206
0.2 0.5 0.3 0.447960663452
0.2 0.6 0.2 0.448030104161
0.2 0.7 0.1 0.448405432916
0.2 0.8 0.0 0.449133964348
0.3 0.0 0.7 0.450815484538
0.3 0.1 0.6 0.449762777223
0.3 0.2 0.5 0.448916799093
0.3 0.3 0.4 0.44829013256
0.3 0.4 0.3 0.4478988412
0.3 0.5 0.2 0.447763653801
0.3 0.6 0.1 0.447912456865
0.3 0.7 -1.

In [69]:
for p in np.arange(0, 1.05, 0.05):
    blending(y_test, clf_xgb_preds, areg_xgb_preds, p)

p: 0.0 0.453094681269
p: 0.05 0.452471879375
p: 0.1 0.451899259238
p: 0.15 0.451377758719
p: 0.2 0.450908491194
p: 0.25 0.450492799703
p: 0.3 0.450132092031
p: 0.35 0.449827966103
p: 0.4 0.449582251864
p: 0.45 0.449397060069
p: 0.5 0.449274765864
p: 0.55 0.449218037244
p: 0.6 0.449229953233
p: 0.65 0.449314052684
p: 0.7 0.449474455927
p: 0.75 0.449715977026
p: 0.8 0.450044373733
p: 0.85 0.450466580091
p: 0.9 0.450991121728
p: 0.95 0.451629018197
p: 1.0 0.45239517589


In [469]:
a = np.round(enc_xgb_preds, 1)

In [470]:
log_loss(y_test, a)

0.56132094504557029

In [69]:
clf = xgb.XGBRegressor(colsample_bytree=0.4,
                                colsample_bylevel=0.7,
                                learning_rate=0.0095,
                                max_depth=15,
                                n_estimators=750,
                                nthread=4,
                                objective='reg:linear',
                                silent=1,
                                subsample=0.8,
                                min_child_weight=6)

rxgbypred = np.zeros(X_test.shape[0])
for i in range(1):
    print i
    clf.fit(X_train, Y_train)
    rxgbypred += clf.predict(X_test)
rxgbypred /= 1

0


In [8]:
rxgbypred += clf.predict(X_test)

In [62]:
clf = xgb.XGBClassifier(colsample_bytree=0.9,
                            colsample_bylevel=0.2,
                            learning_rate=0.0095,
                            max_depth=15,
                            n_estimators=1000,
                            nthread=4,
                            objective='binary:logistic',
                            silent=1,
                            subsample=0.85,
                            min_child_weight=3)

xgbypred = np.zeros(X_test.shape[0])
for i in range(1):
    print i
    clf.fit(X_train, Y_train)
    xgbypred += clf.predict_proba(X_test)[:, 1]
xgbypred /= 1

0


In [10]:
rxgbypred[rxgbypred > 0.99] = 0.97

In [40]:
clf = ExtraTreesClassifier(n_estimators=1200,
                            max_features=30,
                            criterion='entropy',
                            min_samples_split=2,
                            max_depth=30, 
                            min_samples_leaf=2, 
                            n_jobs=-1) 
exypred = np.zeros(test.shape[0])
for i in range(1):
    print i
    clf.fit(train, Y_train)
    exypred += clf.predict_proba(test)[:, 1]
exypred /= 1

0


In [53]:
clf = ensemble.ExtraTreesClassifier(n_estimators=1200,max_features=50,criterion= 'entropy',min_samples_split= 4,
                        max_depth= 35, min_samples_leaf= 2, n_jobs = -1, random_state=rnd)

exypred = np.zeros(test.shape[0])
for i in range(1):
    print i
    clf.fit(train, Y_train)
    exypred += clf.predict_proba(test)[:, 1]
exypred /= 1

0


In [54]:
def make_submission(pred, id_test, filename):
    df_pred = pd.DataFrame()
    df_pred['ID'] = id_test
    df_pred['PredictedProb'] = pred

    df_pred.to_csv(filename, index = False)

In [64]:
make_submission(exypred*0.5 + 0.5*xgbypred, id_test, 'best.csv')

In [63]:
make_submission(xgbypred, id_test, 'xgb.csv')

In [73]:
rxgbypred[rxgbypred >= 0.99] = 0.98

In [58]:
best = pd.read_csv('EX3XGB5REG2.csv')['PredictedProb']

In [34]:
blend_xgb_extra = pextra['PredictedProb']*0.4 + pxgb['PredictedProb']*0.6