In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation,\
metrics, model_selection, base, neighbors, svm, preprocessing



import xgboost
from xgboost import DMatrix



In [2]:
datasets_name = ["base", "credit_pay", "target_encode", "simple"]

In [3]:
train_datasets = [pd.read_csv("../datasets/train_{0}.csv".format(name)) for name in datasets_name]
test_datasets = [pd.read_csv("../datasets/test_{0}.csv".format(name)) for name in datasets_name]

In [4]:
targets = []
ans = []
for train, test in zip(train_datasets, test_datasets):
    targets.append(train.target)
    train.drop("target", axis=1, inplace=True)
    train.drop("client_id", axis=1, inplace=True)

    ans.append(test[["client_id"]])
    test.drop("client_id", axis=1, inplace=True)

In [5]:
seed = 0
np.random.seed(seed=seed)

In [6]:
ET_gini = ensemble.ExtraTreesClassifier(n_estimators=700, n_jobs=-1, bootstrap=True, max_features=0.4, 
                          max_depth=25, criterion='gini', random_state = 0, class_weight = "balanced")

ET_entropy = ensemble.ExtraTreesClassifier(n_estimators=700, n_jobs=-1, bootstrap=True, max_features=0.4, 
                          max_depth=25, criterion="entropy", random_state = 0, class_weight = "balanced")

RF_gini = ensemble.RandomForestClassifier(n_estimators=700, n_jobs=-1, max_depth=25, max_features=0.4, random_state=0,
                                    class_weight = "balanced")

RF_entropy = ensemble.RandomForestClassifier(n_estimators=700, n_jobs=-1, max_depth=25, max_features=0.4,\
                                             criterion="entropy", random_state=0, class_weight = "balanced")

adaboost = ensemble.AdaBoostClassifier(n_estimators=250, learning_rate=0.07, random_state=0)

lin_cl = linear_model.LogisticRegression(C=0.85, penalty="l1", class_weight="balanced", n_jobs=-1,\
                                         random_state=0)

xgb = xgboost.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=300, seed=0)

In [7]:
cv = cross_validation.StratifiedKFold(y=targets[0], n_folds=6, random_state=seed)

In [8]:
model = [ET_gini, ET_entropy, RF_gini, RF_entropy, adaboost, lin_cl, xgb]
model_name = ["ET_gini", "ET_entropy", "RF_gini", "RF_entropy", "adaboost", "lin_cl", "xgb"]

In [10]:
model = [ET_gini, ET_entropy, RF_gini, RF_entropy, adaboost, lin_cl, xgb]
model_name = ["ET_gini", "ET_entropy", "RF_gini", "RF_entropy", "adaboost", "lin_cl", "xgb"]

for train, test, index in zip(train_datasets, test_datasets, range(len(train_datasets))):

    print(index,"-th dataset")
    y = targets[index]
    for clf, name in zip(model, model_name):
        print(name," is fitting...")
        pred_train = model_selection.cross_val_predict(estimator=clf, X=train, y=y, cv = cv,\
                                                   n_jobs=-1, method="predict_proba", verbose=2)
        clf.fit(X=train, y=y)
    
        pred_test = pd.DataFrame(clf.predict_proba(test)[:,1], columns=[name])
        pred_train = pd.DataFrame(pred_train[:,1], columns=[name])
        print(metrics.roc_auc_score(y, pred_train))
        pred_train.to_csv("../stacking/{0}_train_{1}.csv".format(datasets_name[index], name), index=0)
        pred_test.to_csv("../stacking/{0}_test_{1}.csv".format(datasets_name[index], name), index=0)

0 -th dataset
ET_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  8.5min remaining:  8.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 12.7min finished


0.722831688309
ET_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  8.9min remaining:  8.9min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 13.3min finished


0.724086492335
RF_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 17.1min remaining: 17.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 25.7min finished


0.743061736697
RF_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 19.8min remaining: 19.8min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 29.6min finished


0.742559422047
adaboost  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.6min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.5min finished


0.7449234981
lin_cl  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  2.1min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  3.0min finished


0.709024430725
xgb  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  3.6min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  5.5min finished


0.767639861843
1 -th dataset
ET_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  4.7min remaining:  4.7min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  6.9min finished


0.72729866885
ET_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  5.0min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  7.3min finished


0.727649118877
RF_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 11.4min remaining: 11.4min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 17.1min finished


0.741950261329
RF_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 14.0min remaining: 14.0min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 21.0min finished


0.741276492022
adaboost  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.1min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.8min finished


0.733585371963
lin_cl  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   43.6s remaining:   43.6s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.1min finished


0.644954041342
xgb  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.9min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.8min finished


0.766071042964
2 -th dataset
ET_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 20.4min remaining: 20.4min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 30.4min finished


0.743905973568
ET_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 20.5min remaining: 20.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 30.6min finished


0.743904359433
RF_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 25.2min remaining: 25.2min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 37.7min finished


0.747828359768
RF_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 27.1min remaining: 27.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 40.7min finished


0.745144128619
adaboost  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.8min finished


0.744389104704
lin_cl  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.5min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.0min finished


0.709734127352
xgb  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  5.4min remaining:  5.4min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  8.4min finished


0.767511398227
3 -th dataset
ET_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 22.3min remaining: 22.3min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 33.3min finished


0.7360619085
ET_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 22.6min remaining: 22.6min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 33.8min finished


0.737088046564
RF_gini  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 25.5min remaining: 25.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 38.0min finished


0.744787131711
RF_entropy  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 26.9min remaining: 26.9min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 39.8min finished


0.742533226484
adaboost  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.8min finished


0.732426281305
lin_cl  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  2.9min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.3min finished


0.655178517621
xgb  is fitting...


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  6.0min remaining:  6.0min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  9.4min finished


0.766437616021


In [9]:
stack_train_datasets = []
stack_test_datasets = []
for index in range(len(datasets_name)):
    for name in model_name:
        train_file_name = "../stacking/{0}_train_{1}.csv".format(datasets_name[index], name)
        print(train_file_name)
        pred_train = pd.read_csv(train_file_name)
        pred_train.columns = ["{0}_train_{1}".format(datasets_name[index], name)]
        
        test_file_name = "../stacking/{0}_test_{1}.csv".format(datasets_name[index], name)
        print(test_file_name)
        pred_test = pd.read_csv(test_file_name)
        pred_test.columns = ["{0}_test_{1}".format(datasets_name[index], name)]
        
        stack_train_datasets.append(pred_train)
        stack_test_datasets.append(pred_test)

../stacking/base_train_ET_gini.csv
../stacking/base_test_ET_gini.csv
../stacking/base_train_ET_entropy.csv
../stacking/base_test_ET_entropy.csv
../stacking/base_train_RF_gini.csv
../stacking/base_test_RF_gini.csv
../stacking/base_train_RF_entropy.csv
../stacking/base_test_RF_entropy.csv
../stacking/base_train_adaboost.csv
../stacking/base_test_adaboost.csv
../stacking/base_train_lin_cl.csv
../stacking/base_test_lin_cl.csv
../stacking/base_train_xgb.csv
../stacking/base_test_xgb.csv
../stacking/credit_pay_train_ET_gini.csv
../stacking/credit_pay_test_ET_gini.csv
../stacking/credit_pay_train_ET_entropy.csv
../stacking/credit_pay_test_ET_entropy.csv
../stacking/credit_pay_train_RF_gini.csv
../stacking/credit_pay_test_RF_gini.csv
../stacking/credit_pay_train_RF_entropy.csv
../stacking/credit_pay_test_RF_entropy.csv
../stacking/credit_pay_train_adaboost.csv
../stacking/credit_pay_test_adaboost.csv
../stacking/credit_pay_train_lin_cl.csv
../stacking/credit_pay_test_lin_cl.csv
../stacking/cre

In [10]:
stack_train = pd.concat(stack_train_datasets, axis=1)
stack_test = pd.concat(stack_test_datasets, axis=1)
y = targets[0]

In [11]:
stack_train = stack_train.rank()
min_max_scaler = preprocessing.MinMaxScaler()
stack_train = pd.DataFrame(min_max_scaler.fit_transform(stack_train), columns=stack_train.columns)

In [12]:
stack_train.corr().loc["base_train_xgb"]

base_train_ET_gini                0.821709
base_train_ET_entropy             0.824647
base_train_RF_gini                0.888419
base_train_RF_entropy             0.889848
base_train_adaboost               0.902269
base_train_lin_cl                 0.767201
base_train_xgb                    1.000000
credit_pay_train_ET_gini          0.837607
credit_pay_train_ET_entropy       0.838409
credit_pay_train_RF_gini          0.882111
credit_pay_train_RF_entropy       0.882934
credit_pay_train_adaboost         0.857455
credit_pay_train_lin_cl           0.556452
credit_pay_train_xgb              0.980336
target_encode_train_ET_gini       0.891576
target_encode_train_ET_entropy    0.892651
target_encode_train_RF_gini       0.898581
target_encode_train_RF_entropy    0.896397
target_encode_train_adaboost      0.898673
target_encode_train_lin_cl        0.753452
target_encode_train_xgb           0.979443
simple_train_ET_gini              0.854785
simple_train_ET_entropy           0.858842
simple_trai

In [13]:
y = targets[0]

In [14]:
columns = stack_test.columns
stack_test = pd.DataFrame(min_max_scaler.transform(stack_test), columns=stack_test.columns)
sol =  0.5859 * stack_test.base_test_xgb + (1-0.5859) * stack_test[columns[27]]
sol = 0.6970 * sol + (1-0.6970) * stack_test[columns[20]]
sol = 0.9091 * sol + (1-0.9091) * stack_test[columns[16]]

In [15]:
sol = min_max_scaler.fit_transform(sol)
sol



array([ 0.06672655,  0.15621536,  0.31662877, ...,  0.02355083,
        0.31366991,  0.07786225])

In [16]:
ans = ans[0]
ans['val'] = sol
ans.rename(columns={"client_id":"_ID_", "val":"_VAL_"}, inplace=True)
ans.to_csv("../solutions/stacking.csv", index=False, sep=",", header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
